In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the speed-dating CSV into the frame used by the rest of the notebook.
# NOTE(review): 'unicode_escape' also interprets backslash escape sequences;
# a plain single-byte codec such as 'latin-1' may be the safer choice — confirm
# against the file before changing it.
csv_path = '/kaggle/input/speed-dating-experiment/Speed Dating Data.csv'
data = pd.read_csv(csv_path, encoding='unicode_escape')

In [None]:
# Keep a handle on the original column index, then report basic dataset sizes.
fields = data.columns

participant_count = len(data['iid'].unique())  # distinct participant ids
date_count = len(data.index)                   # one row per date

print('Total number of people that participated, assuming person does not appear in more than one wave: {}'.format(participant_count))
print('Total number of dates occurred: {}'.format(date_count))

In [None]:
# Preview the first five rows of the raw data.
data.head(n=5)

In [None]:
# Show every column when a frame is displayed (no truncation).
pd.set_option("display.max_columns", None)

# Readable names for the coded survey columns; later cells refer to the
# renamed columns, so the frame is updated in place.
column_labels = {
    "gender": "Gender",
    "condtn": "Condition",
    "mn_sat": "median_sat",
    "age_o": "age of partner",
    "race_o": "race of partner",
    "pf_o_att": "partner stated preference",
    "dec_o": "partner's decision",
    "attr_o": "partner rating of attributes",
    "imprace": "race importance",
    "imprelig": "religion importance",
    "from": "originally from",
    "date": "frequency of date",
}
data.rename(columns=column_labels, inplace=True)

# Lookup tables that translate the numeric survey codes into readable labels.
# The labels end up in tables and plot legends, so spelling matters here
# (fixed: "Caucasion", "Medican", "fan night out").

# Codes used by the `race` / partner-race columns.
race_replacement = {
    1: "Black/African American",
    2: "European/Caucasian-American",
    3: "Latino/Hispanic American",
    4: "Asian/Pacific Islander/Asian-American",
    5: "Native American",
    6: "Other",
}

# Codes used by `field_cd` (field of study).
field_replacement = {
    1: "Law",
    2: "Math",
    3: "Social Science, Psychologist",
    4: "Medical Science, Pharmaceuticals, and Bio Tech",
    5: "Engineering",
    6: "English/Creative Writing / Journalism",
    7: "History/ Religion/ Philosophy",
    8: "Business/Econ/Finance",
    9: "Education, Academia",
    10: "Biological Sciences / Chemistry/ Physics",
    11: "Social Work",
    12: "Undergrad/undecided",
    13: "Political Science/ International Affairs",
    14: "Film",
    15: "Fine Arts / Arts Administration",
    16: "Languages",
    17: "Architecture",
    18: "Other",
}

# Codes used by `career_c` (intended career).
career_replacement = {
    1: "Lawyer",
    2: "Academic / Research",
    3: "Psychologist",
    4: "Doctor/Medicine",
    5: "Engineer",
    6: "Creative Arts/ Entertainment",
    7: "Banking / Consulting /Finance / Marketing/ Business /CEO/ Entrepreneur / Admin",
    8: "Real Estate",
    9: "International /Humanitarian Affairs",
    10: "Undecided",
    11: "Social Work",
    12: "Speech Pathology",
    13: "Politics",
    14: "Pro Sports / Athletics",
    15: "Other",
    16: "Journalism",
    17: "Architecture",
}

# Yes/no decision codes.
decision_replacement = {1: "Yes", 0: "No"}

# Codes used by `length`.
length_replacement = {
    1: "Too little",
    2: "Too much",
    3: "Just Right",
}

# Codes used by `goal`.
goal_replacement = {
    1: "Seemed like a fun night out",
    2: "To meet new people",
    3: "To get a date",
    4: "Looking for serious relationship",
    5: "To say I did it",
    6: "Other",
}

# Codes shared by the dating-frequency and going-out-frequency columns.
frequency_replacement = {
    1: "Several times a week",
    2: "Twice a week",
    3: "Once a week",
    4: "Twice a month",
    5: "Once a month",
    6: "Several times a year",
    7: "Almost never",
}



In [None]:
# Preview the first five rows again, now with the renamed columns.
data.head(n=5)

In [None]:
# Translate the numeric survey codes into readable labels, column by column.
#
# Each column is assigned back to the frame. The previous form,
# `data[col].replace(..., inplace=True)`, is chained assignment through an
# in-place method: it emits a FutureWarning on pandas 2.x and no longer
# updates `data` once Copy-on-Write is active.
value_labels = {
    "Gender": {0: "Female", 1: "Male"},
    "Condition": {1: "Limited choice", 2: "Extensive choice"},
    "match": {1: "Yes", 0: "No"},
    "samerace": {1: "Yes", 0: "No"},
    "race of partner": race_replacement,
    "partner's decision": decision_replacement,
    "field_cd": field_replacement,
    "race": race_replacement,
    "career_c": career_replacement,
    "length": length_replacement,
    "numdat_2": {1: "Too few", 2: "Too many", 3: "Just Right"},
    # NOTE(review): 0 is mapped to missing and 2 to "No" here — confirm this
    # coding against the dataset key.
    "date_3": {1: "Yes", 2: "No", 0: np.nan},
    "goal": goal_replacement,
    "frequency of date": frequency_replacement,
    "go_out": frequency_replacement,
}

for column, mapping in value_labels.items():
    data[column] = data[column].replace(mapping)

display(data)

In [None]:
def missing_values(df):
    """Summarise missing data per column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        ``'missing_values(%)'`` (share of null entries, in percent) and
        ``'missing_values(numbers)'`` (absolute count of null entries),
        sorted by the percentage in descending order.
    """
    null_counts = df.isnull().sum()
    # Divide by the length of the frame that was passed in, not by the
    # notebook-level `data`, so the helper is correct for any frame.
    missing = (null_counts / len(df) * 100).to_frame('missing_values(%)')
    missing['missing_values(numbers)'] = null_counts
    return missing.sort_values(by='missing_values(%)', ascending=False)
# Show the per-column missing-value summary for the full dataset.
missing_values(data)

In [None]:
# Print every column name on one line so none are elided.
print(list(data.columns))

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(17, 5))

# The number of dates per person.
# Each row of `data` is one date, so the row count per `iid` is the number of
# dates that participant had.
num_dates_per_male = data[data["Gender"] == 'Male'].groupby('iid').size()
num_dates_per_female = data[data["Gender"] == 'Female'].groupby('iid').size()
axes[0].hist(num_dates_per_male, bins=22, alpha=0.5, label='# dates per male')
axes[0].hist(num_dates_per_female, bins=22, alpha=0.5, label='# dates per female')
axes[0].set_title('Number of dates per male/female')
axes[0].set_xlabel('Number of dates')
axes[0].set_ylabel('Number of participants')
axes[0].legend(loc='upper right')

# The number of matches per person.
# Re-index onto everyone who dated so that participants without a single match
# count as 0. Previously they were absent, turned into NaN after the division
# and were silently dropped, which overstated the match percentages.
matches = data[data["match"] == "Yes"]
matches_male = (
    matches[matches["Gender"] == 'Male']
    .groupby('iid')
    .size()
    .reindex(num_dates_per_male.index, fill_value=0)
)
matches_female = (
    matches[matches["Gender"] == 'Female']
    .groupby('iid')
    .size()
    .reindex(num_dates_per_female.index, fill_value=0)
)

# Plot real percentages (0-100) so the axis agrees with the legend labels.
match_pct_male = matches_male / num_dates_per_male * 100.0
match_pct_female = matches_female / num_dates_per_female * 100.0
axes[1].hist(match_pct_male, alpha=0.5, label='male match percentage')
axes[1].hist(match_pct_female, alpha=0.5, label='female match percentage')
axes[1].set_title('Matches per person by gender')
axes[1].set_xlabel('Match percentage (%)')
axes[1].set_ylabel('Number of participants')
axes[1].legend(loc='upper right')

print('Avg. dates per male: {0:.1f}\t\tAvg. dates per female: {1:.1f}\nAvg. male match percentage: {2:.1f}\tAvg. female match percentage: {3:.1f}'.format(
        num_dates_per_male.mean(),
        num_dates_per_female.mean(),
        match_pct_male.mean(),
        match_pct_female.mean()))

In [None]:
# Side-by-side counts of each decision value (`dec`), split by gender (left)
# and by participant race (right).
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=[12, 8])

gender_ax = sns.countplot(x="dec", hue="Gender", data=data, ax=ax1)
gender_ax.set_title("Female and Male saying No and Yes")

race_ax = sns.countplot(x="dec", hue="race", data=data, ax=ax2)
race_ax.set_title("People with different ethnicity saying yes and no to their matches");

In [None]:
# Spearman correlation of every numeric column with the partner's attribute
# rating. Several columns hold text labels, and DataFrame.corr raises on
# non-numeric data in pandas >= 2.0, so restrict the frame to numeric columns.
cor = data.select_dtypes(include="number").corr(method="spearman")
corr_target = cor["partner rating of attributes"].abs()

# Keep only the columns with an absolute correlation of at least 0.5.
corr_target[corr_target >= 0.50]

In [None]:
# Joint distribution of `fun_o` against the partner's attribute rating.
sns.jointplot(data=data, x="fun_o", y="partner rating of attributes");

In [None]:
# Joint distribution of `like_o` against the partner's attribute rating.
sns.jointplot(data=data, x="like_o", y="partner rating of attributes");

Race distribution

In [None]:
# Pie chart of non-null `iid` entries per race.
# NOTE(review): every row is one date, so slices are weighted by date rows,
# not by unique participants.
race_counts = data.groupby(["race"])[["match", "iid"]].count()
largest_race_groups = race_counts.sort_values("match", ascending=False).head(100)
largest_race_groups.plot.pie(y="iid", autopct='%1.0f%%')

Field of study importance

In [None]:
# Top 100 (field, wave) combinations by number of non-null `match` entries.
# NOTE(review): count() tallies rows regardless of the match value, so this is
# the number of dates per group rather than the number of matches — confirm
# that is what is intended.
match_counts_by_field_wave = data.groupby(["field", "wave"])[["match"]].count()
match_counts_by_field_wave.sort_values("match", ascending=False).head(100)

Race importance

In [None]:
# Share of date rows for each "race importance" value.
# size() returns a Series, so the former y="iid" argument selected nothing
# and has been dropped.
race_importance_counts = data.groupby("race importance").size()
ax = race_importance_counts.plot.pie(autopct='%1.0f%%')
ax.set_title("Race importance (share of date rows)")
ax.set_ylabel("");


Calls received from matches

In [None]:
# Pie chart of rows per `them_cal` value.
# Only `iid` is counted: counting every column of the frame just to plot one
# of them is wasted work, and the resulting chart is the same.
# NOTE(review): every row is one date, so slices are weighted by date rows,
# not by unique participants.
calls_received = data.groupby("them_cal")[["iid"]].count()
calls_received.plot.pie(y="iid", autopct='%1.0f%%')

Conclusion:

* 50% of participants did not receive any calls from their matches; 27% received only 1 call
* 62% of participants did not get any dates from the experiments

* Those who go out frequently got more matches
* 25% of people who go on dates twice a month or several times a year got more matches
* People whose goal was to have a fun night out or to meet new people got the most matches
* Religion and Race are not so important for matching