In [3]:
#importing dependencies
import pandas as pd
import pymongo
from pymongo import MongoClient

In [4]:
#reading the csv file
csv_file = "resources/speed-dating.csv"
date_speed_df = pd.read_csv(csv_file)
date_speed_df.head()

Unnamed: 0,has_null,wave,gender,age,age_o,d_age,d_d_age,race,race_o,samerace,...,d_expected_num_interested_in_me,d_expected_num_matches,like,guess_prob_liked,d_like,d_guess_prob_liked,met,decision,decision_o,match
0,0,1,female,21.0,27.0,6,[4-6],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0,1,0,0
1,0,1,female,21.0,22.0,1,[0-1],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,5.0,[6-8],[5-6],1.0,1,0,0
2,1,1,female,21.0,22.0,1,[0-1],asian/pacific islander/asian-american,asian/pacific islander/asian-american,1,...,[0-3],[3-5],7.0,,[6-8],[0-4],1.0,1,1,1
3,0,1,female,21.0,23.0,2,[2-3],asian/pacific islander/asian-american,european/caucasian-american,0,...,[0-3],[3-5],7.0,6.0,[6-8],[5-6],0.0,1,1,1
4,0,1,female,21.0,24.0,3,[2-3],asian/pacific islander/asian-american,latino/hispanic american,0,...,[0-3],[3-5],6.0,6.0,[6-8],[5-6],0.0,1,1,1


# Data cleansing and transformation

In [5]:
#selecting relevant columns for future analysis
shortdata_df = date_speed_df [['gender', 'age', 'race','importance_same_race',
                               'field','importance_same_religion','attractive_o','sinsere_o','intelligence_o',
                              'funny_o','ambitous_o','shared_interests_o']].copy()
shortdata_df.head()

Unnamed: 0,gender,age,race,importance_same_race,field,importance_same_religion,attractive_o,sinsere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o
0,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,6.0,8.0,8.0,8.0,8.0,6.0
1,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,10.0,7.0,7.0,5.0
2,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,10.0,10.0,10.0,10.0,10.0,10.0
3,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,9.0,8.0,9.0,8.0
4,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,8.0,7.0,9.0,6.0,9.0,7.0


In [6]:
#retrieve pattern base on the field
field_df = shortdata_df.loc[shortdata_df["importance_same_race"] >=10].groupby(["field"])["attractive_o"].count()
field_df

field
american studies                           9
business                                  19
clinical psychology                       17
ecology                                   20
education                                 17
education policy                          14
finance                                    5
higher ed. - m.a.                         18
international finance; economic policy    10
law                                       18
mba                                        9
psychology                                 9
social work                               78
Name: attractive_o, dtype: int64

In [7]:
#counting the number of rows
len(shortdata_df)

8378

In [8]:
#dropping all empty cells
reduced_df = shortdata_df.dropna()
reduced_df.reset_index(inplace=True)
reduced_df= reduced_df.drop('index', axis=1)
reduced_df

Unnamed: 0,gender,age,race,importance_same_race,field,importance_same_religion,attractive_o,sinsere_o,intelligence_o,funny_o,ambitous_o,shared_interests_o
0,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,6.0,8.0,8.0,8.0,8.0,6.0
1,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,10.0,7.0,7.0,5.0
2,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,10.0,10.0,10.0,10.0,10.0,10.0
3,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,7.0,8.0,9.0,8.0,9.0,8.0
4,female,21.0,asian/pacific islander/asian-american,2.0,law,4.0,8.0,7.0,9.0,6.0,9.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6940,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,10.0,5.0,3.0,2.0,6.0,5.0
6941,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,6.0,3.0,7.0,3.0,7.0,2.0
6942,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,2.0,1.0,2.0,2.0,2.0,1.0
6943,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,5.0,7.0,5.0,5.0,3.0,6.0


In [9]:
#renaming the columns for better understanding
renaming_df = reduced_df.rename(columns={"importance_same_race":"Same race",
                         "importance_same_religion":"Same religion",
                         "attractive_o":"Attractiveness",
                         "sinsere_o":"Sincerity",
                         "intelligence_o":"Intelligence",
                         "gender":"Gender",
                         "age":"Age",
                         "race":"Race",
                         "field":"Field",                
                         "funny_o":"Funny",
                         "ambitous_o":"Ambition",
                         "shared_interests_o":"Shared interests"})
renaming_df.tail()

Unnamed: 0,Gender,Age,Race,Same race,Field,Same religion,Attractiveness,Sincerity,Intelligence,Funny,Ambition,Shared interests
6940,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,10.0,5.0,3.0,2.0,6.0,5.0
6941,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,6.0,3.0,7.0,3.0,7.0,2.0
6942,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,2.0,1.0,2.0,2.0,2.0,1.0
6943,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,5.0,7.0,5.0,5.0,3.0,6.0
6944,male,25.0,european/caucasian-american,1.0,climate dynamics,1.0,8.0,8.0,7.0,7.0,7.0,7.0


# Exporting the database in mongodb

In [10]:
# The default port used by MongoDB is 27017
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [11]:
#creating the database and the collection
db = client.DatingDB
collection = db.love_finder

In [12]:
#converting the dataframe into a dictionary
renaming_df.reset_index(drop=True)
data_dict = renaming_df.to_dict("records")
data_dict

[{'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 6.0,
  'Sincerity': 8.0,
  'Intelligence': 8.0,
  'Funny': 8.0,
  'Ambition': 8.0,
  'Shared interests': 6.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 7.0,
  'Sincerity': 8.0,
  'Intelligence': 10.0,
  'Funny': 7.0,
  'Ambition': 7.0,
  'Shared interests': 5.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,
  'Attractiveness': 10.0,
  'Sincerity': 10.0,
  'Intelligence': 10.0,
  'Funny': 10.0,
  'Ambition': 10.0,
  'Shared interests': 10.0},
 {'Gender': 'female',
  'Age': 21.0,
  'Race': 'asian/pacific islander/asian-american',
  'Same race': 2.0,
  'Field': 'law',
  'Same religion': 4.0,


In [13]:
#loading the data into the collection
collection.insert_many(data_dict)

<pymongo.results.InsertManyResult at 0x7faa30d41df0>

In [14]:
#displaying rows in mongoBD collection
documents = db.love_finder.find()

for document in documents:
    print(document)

{'_id': ObjectId('63038034217c2e73e8fdc891'), 'Gender': 'female', 'Age': 21.0, 'Race': 'asian/pacific islander/asian-american', 'Same race': 2.0, 'Field': 'law', 'Same religion': 4.0, 'Attractiveness': 6.0, 'Sincerity': 8.0, 'Intelligence': 8.0, 'Funny': 8.0, 'Ambition': 8.0, 'Shared interests': 6.0}
{'_id': ObjectId('63038034217c2e73e8fdc892'), 'Gender': 'female', 'Age': 21.0, 'Race': 'asian/pacific islander/asian-american', 'Same race': 2.0, 'Field': 'law', 'Same religion': 4.0, 'Attractiveness': 7.0, 'Sincerity': 8.0, 'Intelligence': 10.0, 'Funny': 7.0, 'Ambition': 7.0, 'Shared interests': 5.0}
{'_id': ObjectId('63038034217c2e73e8fdc893'), 'Gender': 'female', 'Age': 21.0, 'Race': 'asian/pacific islander/asian-american', 'Same race': 2.0, 'Field': 'law', 'Same religion': 4.0, 'Attractiveness': 10.0, 'Sincerity': 10.0, 'Intelligence': 10.0, 'Funny': 10.0, 'Ambition': 10.0, 'Shared interests': 10.0}
{'_id': ObjectId('63038034217c2e73e8fdc894'), 'Gender': 'female', 'Age': 21.0, 'Race': 

# Example of analysis to conduct on the data

In [15]:
#Counting the gender of each race group in the dataset
racecount=renaming_df.groupby(['Race'])['Gender'].count()
racecount

Race
asian/pacific islander/asian-american    1674
black/african american                    365
european/caucasian-american              3888
latino/hispanic american                  565
other                                     453
Name: Gender, dtype: int64

In [16]:
#retrieve pattern base on the field 
field_df = renaming_df.loc[renaming_df["Same religion"] >=10].groupby(["Field"])["Attractiveness"].count()
field_df

Field
business                             13
economics; sociology                  9
education                            16
education policy                     13
elementary education - preservice    13
international affairs                19
law                                  17
ma science education                 17
marine geophysics                     3
mathematics                          19
neuroscience and education           16
operations research                  14
political science                    16
psychology                           45
social work                          39
speech pathology                     13
Name: Attractiveness, dtype: int64