In [187]:
# Dependencies
import os
import pandas as pd
from bs4 import BeautifulSoup as bs
import pymongo as pm

# Extract

In [188]:
# During data extraction, raw data is copied or exported from source locations

### Speed Dating Data


In [189]:
# Load the speed-dating export into a DataFrame and preview its first five rows
csv_file = "resources/speed-dating.csv"
date_speed_df = pd.read_csv(filepath_or_buffer=csv_file)
date_speed_df.head(5)

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,0,1,female,21.0,27.0,6,[4-6],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0,1,0,0
1,0,1,female,21.0,22.0,1,[0-1],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,5.0,[6-8],[5-6],1.0,1,0,0
2,1,1,female,21.0,22.0,1,[0-1],asian/pacific islander/asian-american,asian/pacific islander/asian-american,1,...,[0-3],[3-5],7.0,,[6-8],[0-4],1.0,1,1,1
3,0,1,female,21.0,23.0,2,[2-3],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0,1,1,1
4,0,1,female,21.0,24.0,3,[2-3],asian/pacific islander/asian-american,latino/hispanic american,0,...,[0-3],[3-5],6.0,6.0,[6-8],[5-6],0.0,1,1,1


### Reviews Data

In [190]:
# Load the app reviews export and preview its first five rows.
# The preview shows an 'Unnamed: 0' column that repeats the row number.
reviews_df = pd.read_csv(filepath_or_buffer='resources/reviews.csv')
reviews_df.head(5)

Unnamed: 0.1,Unnamed: 0,Name,Review,Rating,#ThumbsUp,Date&Time,App
0,0,linah sibanda,On this app i cant find a partner,5,0,18-02-2022 01:19,Tinder
1,1,Norman Johnson,Tinder would be so much better if we could spe...,3,0,18-02-2022 01:16,Tinder
2,2,David Hume,Still doesn't correctly notify matches or mess...,1,0,18-02-2022 01:11,Tinder
3,3,Last 1 Standing,"Got banned because I updated my bio to say ""I ...",2,0,18-02-2022 01:11,Tinder
4,4,Arthur Magamedov,Love it!,5,0,18-02-2022 01:06,Tinder


### Data from website to be scraped

In [191]:
# Read the saved copy of the web page from disk.
# The path is joined from its components so it works on any operating system.
filepath = os.path.join("resources", "template.html")
# Decode explicitly instead of relying on the platform's default encoding:
# the headlines contain non-ASCII characters (em dashes).
# NOTE(review): assumes the saved page is UTF-8 encoded — confirm.
with open(filepath, encoding="utf-8") as file:
    html = file.read()

In [192]:
# Parse the raw HTML text with Python's built-in parser
soup = bs(markup=html, features='html.parser')

In [193]:
# Keep the first 24 <h2> headlines of the page body; each one holds
# an app name together with its user count. Print them one per line.
apps_and_users = soup.body.find_all('h2')[:24]
for headline in apps_and_users:
    print(headline.get_text())

1. Badoo — 480M Users
2. Tinder — 300M Users
3. Plenty of Fish — 150M Users
4. Bumble — 100 M Users
5. Adult Friend Finder — 99M Users
6. MenNation — 99M Users
7. Match — 96M Users
8. Ashley Madison — 65M Users
9. Happn — 50M Users
10. Zoosk — 40M Users
11. eharmony — 37M Users
12. OkCupid — 30M Users
13. Grindr — 27M Users
14. Coffee Meets Bagel — 21M Users
15. ChristianMingle — 16M Users
16. BeNaughty — 13.3M Users
17. EliteSingles — 13M Users
18. OurTime — 8.9M Users
19. BlackPeopleMeet — 5.7M Users
20. Hinge — 5.5M Users
21. HER — 4M Users
22. SilverSingles — 800,000 Users
23. LesbianPersonals — 500,000 Users
24. FriendFinder-X — 40,000 Users


# Transform
### Data cleansing and transformation

### Speed Dating Data

In [194]:
# Columns kept for the analysis. The spellings 'sinsere_o' and 'ambitous_o'
# must match the column names used in the source data, so they are left as-is.
relevant_columns = [
    'gender',
    'age',
    'race',
    'importance_same_race',
    'field',
    'importance_same_religion',
    'attractive_o',
    'sinsere_o',
    'intelligence_o',
    'funny_o',
    'ambitous_o',
    'shared_interests_o',
]

# Work on an independent copy so later edits never touch the raw frame
shortdata_df = date_speed_df.loc[:, relevant_columns].copy()
shortdata_df.head(5)

Unnamed: 0,gender,age,race,importance_same_race,field,importance_same_religion,attractive_o,sinsere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o
0,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,6.0,8.0,8.0,8.0,8.0,6.0
1,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,10.0,7.0,7.0,5.0
2,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,10.0,10.0,10.0,10.0,10.0,10.0
3,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,9.0,8.0,9.0,8.0
4,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,8.0,7.0,9.0,6.0,9.0,7.0


In [195]:
# Retrieve a pattern based on the field: for rows where the importance of
# same race is 10 or more, count the non-missing 'attractive_o' values per field.
# Note: despite its name, field_df is a Series indexed by field.
rates_same_race_highly = shortdata_df["importance_same_race"] >= 10
field_df = (
    shortdata_df.loc[rates_same_race_highly]
    .groupby(["field"])["attractive_o"]
    .count()
)
field_df

field
american studies                           9
business                                  19
clinical psychology                       17
ecology                                   20
education                                 17
education policy                          14
finance                                    5
higher ed. - m.a.                         18
international finance; economic policy    10
law                                       18
mba                                        9
psychology                                 9
social work                               78
Name: attractive_o, dtype: int64

In [196]:
# Number of rows before rows with missing values are removed
shortdata_df.shape[0]

8378

In [197]:
# Drop every row that has at least one missing value, then renumber the
# remaining rows from 0. reset_index(drop=True) discards the old index
# directly, so there is no in-place mutation and no temporary 'index'
# column to remove afterwards.
reduced_df = shortdata_df.dropna().reset_index(drop=True)
reduced_df

Unnamed: 0,gender,age,race,importance_same_race,field,importance_same_religion,attractive_o,sinsere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o
0,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,6.0,8.0,8.0,8.0,8.0,6.0
1,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,10.0,7.0,7.0,5.0
2,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,10.0,10.0,10.0,10.0,10.0,10.0
3,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,9.0,8.0,9.0,8.0
4,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6940,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,10.0,5.0,3.0,2.0,6.0,5.0
6941,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,6.0,3.0,7.0,3.0,7.0,2.0
6942,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,2.0,1.0,2.0,2.0,2.0,1.0
6943,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,5.0,7.0,5.0,5.0,3.0,6.0


In [198]:
# Human-readable labels for the selected columns (original name -> label)
column_labels = {
    "gender": "Gender",
    "age": "Age",
    "race": "Race",
    "importance_same_race": "Same race",
    "field": "Field",
    "importance_same_religion": "Same religion",
    "attractive_o": "Attractiveness",
    "sinsere_o": "Sincerity",
    "intelligence_o": "Intelligence",
    "funny_o": "Funny",
    "ambitous_o": "Ambition",
    "shared_interests_o": "Shared interests",
}

# rename returns a new frame; reduced_df keeps its original column names
renaming_df = reduced_df.rename(columns=column_labels)
renaming_df.tail(5)

Unnamed: 0,Gender,Age,Race,Same race,Field,Same religion,Attractiveness,Sincerity,Intelligence,Funny,Ambition,Shared interests
6940,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,10.0,5.0,3.0,2.0,6.0,5.0
6941,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,6.0,3.0,7.0,3.0,7.0,2.0
6942,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,2.0,1.0,2.0,2.0,2.0,1.0
6943,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,5.0,7.0,5.0,5.0,3.0,6.0
6944,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,8.0,8.0,7.0,7.0,7.0,7.0


### Reviews Data

### Data from https://www.datingadvice.com/online-dating/dating-sites-with-the-most-users

In [199]:
# App names and user counts (in millions), one (app, users) pair per app
app_user_counts = [
    ('Badoo', 480),
    ('Tinder', 300),
    ('Plenty of Fish', 150),
    ('Bumble', 100),
    ('Adult Friend Finder', 99),
    ('MenNation', 99),
    ('Match', 96),
    ('Ashley Madison', 65),
    ('Happn', 50),
    ('Zoosk', 40),
    ('eharmony', 37),
    ('OkCupid', 30),
    ('Grindr', 27),
    ('Coffee Meets Bagel', 21),
    ('ChristianMingle', 16),
    ('BeNaughty', 13.3),
    ('EliteSingles', 13),
    ('OurTime', 8.9),
    ('BlackPeopleMeet', 5.7),
    ('Hinge', 5.5),
    ('HER', 4),
    ('SilverSingles', 0.8),
    ('LesbianPersonals', 0.5),
    ('FriendFinder-X', 0.04),
]

# Column-oriented dictionary: one list per DataFrame column
data = {
    'App': [app for app, users in app_user_counts],
    'Users in million': [users for app, users in app_user_counts],
}

# Build the DataFrame from the column dictionary and display it
data_df = pd.DataFrame(data)
data_df

Unnamed: 0,App,Users in million
0,Badoo,480.0
1,Tinder,300.0
2,Plenty of Fish,150.0
3,Bumble,100.0
4,Adult Friend Finder,99.0
5,MenNation,99.0
6,Match,96.0
7,Ashley Madison,65.0
8,Happn,50.0
9,Zoosk,40.0


In [200]:
# Write the app/user table to disk, leaving the row index out of the file
data_df.to_csv(path_or_buf='resources/apps_and_users.csv', index=False)

# Load
### Exporting the data into MongoDB

In [201]:
# Connect to the local MongoDB server on its default port (27017)
conn = 'mongodb://localhost:27017'
client = pm.MongoClient(host=conn)

In [202]:
# Get handles to the 'DatingDB' database and its 'love_finder' collection
db = client["DatingDB"]
collection = db["love_finder"]

### Speed Dating Data

In [203]:
# Convert the renamed frame into a list of dictionaries, one per row, with
# column names as keys. The "records" orientation leaves the index out, so
# the earlier reset_index call (whose result was discarded) was a no-op and
# has been removed.
data_dict = renaming_df.to_dict("records")

# Preview the first few records instead of printing the whole list
data_dict[:3]

[{'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 6.0,
  'Sincerity': 8.0,
  'Intelligence': 8.0,
  'Funny': 8.0,
  'Ambition': 8.0,
  'Shared interests': 6.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 7.0,
  'Sincerity': 8.0,
  'Intelligence': 10.0,
  'Funny': 7.0,
  'Ambition': 7.0,
  'Shared interests': 5.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 10.0,
  'Sincerity': 10.0,
  'Intelligence': 10.0,
  'Funny': 10.0,
  'Ambition': 10.0,
  'Shared interests': 10.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,


### Reviews Data

### Data from https://www.datingadvice.com/online-dating/dating-sites-with-the-most-users