In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px


# Load the speed-dating export; the file is decoded as ISO-8859-1.
dataset = pd.read_csv("Speed_Dating_Data.csv", encoding='ISO-8859-1')


# Basic stats
print(f"Number of rows : {dataset.shape[0]}")
print()
print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

# Share of missing values per column, in percent.
# The sorted view is handed to display() explicitly: a bare expression in the
# middle of a cell is evaluated and thrown away, which left the header below
# with nothing printed under it.
print("Percentage of missing values: ")
a = dataset.isnull().sum() / len(dataset) * 100
display(a.sort_values(ascending=False))

# One-column frame (index = column name, value = % missing), kept in the
# dataset's own column order.
missing_values = pd.DataFrame(data=a)

# Columns with more than 20 % of missing values are dropped.
import numpy as np

# missing_values holds a single column, so every itertuples() row unpacks
# into (column label, percentage of missing values).
columns_to_remove = [
    label
    for label, missing_pct in missing_values.itertuples()
    if missing_pct > 20.00
]
columns_to_remove

dataset = dataset.drop(columns=columns_to_remove)

# Re-check the percentage of missing values after the column drop.
# display() makes the sorted result actually appear under the header; a bare
# expression in the middle of a cell produces no output.
print("Percentage of missing values: ")
a = dataset.isnull().sum() / len(dataset) * 100
display(a.sort_values(ascending=False))

# Flag the columns that still contain at least one missing value
# (index = column name, one boolean column).
to_fill = pd.DataFrame(dataset.isnull().any())

# Names of the flagged columns.
columns_to_fill = [
    column_name
    for column_name, has_missing in to_fill.itertuples()
    if has_missing
]

# Numeric columns are imputed with their median.
# columns_to_fill also contains text columns ('career' and 'field' are still
# missing values at this point); numeric_only=True keeps them out of the
# median, which raises a TypeError on pandas >= 2.0 otherwise. Those columns
# get no fill value here and are handled separately afterwards.
dataset = dataset.fillna(dataset[columns_to_fill].median(numeric_only=True))

# What is still missing after the median fill sits in the text columns.
# display() is needed for the sorted percentages to show up under the header.
print("Percentage of missing values: ")
a = dataset.isnull().sum() / len(dataset) * 100
display(a.sort_values(ascending=False))

# Replace the missing values of 'career' and 'field' by the mode (the most
# frequent value of the same column).
# The filled column is assigned back instead of calling
# dataset[col].fillna(..., inplace=True): that chained in-place call works on
# an intermediate object, is deprecated in pandas 2.x and leaves the frame
# unchanged once copy-on-write is enabled.
for categorical_column in ('career', 'field'):
    most_frequent = dataset[categorical_column].mode()[0]
    dataset[categorical_column] = dataset[categorical_column].fillna(most_frequent)

# Remove the columns considered useless or redundant (to avoid collinearity).
# Each label is listed once; the previous list named "career_c" twice.
dataset = dataset.drop(
    columns=["id", "career_c", "field_cd", "partner", "met", "zipcode", "from"],
)

# Gather the maximum and minimum of every column to spot out-of-range values.
# Each table has one row per dataset column: the extreme value, plus the
# column's name stored in a 'name' field, on a fresh positional index.
max_values = (
    dataset.max()
    .to_frame(name='values_max')
    .assign(name=lambda frame: frame.index)
    .reset_index(drop=True)
)

min_values = (
    dataset.min()
    .to_frame(name='values_min')
    .assign(name=lambda frame: frame.index)
    .reset_index(drop=True)
)

# Join both tables on the column name so the two extremes sit side by side.
min_max_dataset = pd.merge(max_values, min_values, on='name')
min_max_dataset


# These columns contain marks above 10; keep only the rows where all four
# stay at 10 or below.
rating_columns = ['attr_o', 'fun_o', 'gaming', 'reading']
within_scale = dataset[rating_columns].le(10).all(axis=1)
dataset = dataset[within_scale]

# Relabel the "?" entries of the 'career' column as 'unknown'.
dataset['career'] = dataset['career'].replace({"?": 'unknown'})

# Violin plots of the participant's age and of the partner's age, per gender.
age_plots = (
    ("age", 'Distribution of Age per Gender'),
    ("age_o", 'Distribution of Age of partner per Gender'),
)
for age_column, plot_title in age_plots:
    ax = sns.catplot(data=dataset, x="gender", y=age_column, kind='violin').set(title=plot_title)

# Ages of 55 and above are treated as outliers: only rows where both the
# participant and the partner are strictly younger than 55 are kept.
dataset = dataset[(dataset['age'] < 55) & (dataset['age_o'] < 55)]

# Make the data more readable by swapping numeric codes for text labels.

# Code -> label table shared by 'race_o' and 'race'; any value that is not
# one of these codes becomes 'Unknown'.
race_labels = {
    1: "AfroAm",
    2: 'CaucasianAm',
    3: 'LatinoAm',
    4: 'AsianAm/ Pacific IslanderAm',
    5: 'NativeAm',
    6: 'Other',
}
for race_column in ('race_o', 'race'):
    dataset[race_column] = dataset[race_column].apply(
        lambda code: race_labels.get(code, 'Unknown')
    )

# 0/1 flags become 'No'/'Yes'.
yes_no = {0: 'No', 1: 'Yes'}
dataset['samerace'] = dataset['samerace'].replace(yes_no)
dataset['samerace']

for flag_column in ('match', 'dec', 'dec_o'):
    dataset[flag_column] = dataset[flag_column].replace(yes_no)

# Gender: 0 is labelled "Female", every other value "Male".
dataset['gender'] = dataset['gender'].eq(0).map({True: "Female", False: "Male"})



# Share of each 'race' label among the rows, shown as a pie chart.
race = dataset['race']
pie_chart_data = race.value_counts()

# One offset per wedge. plt.pie rejects an explode sequence whose length
# differs from the number of wedges, so it is sized from the data instead of
# being hard-coded to five entries.
explode = (0.1,) * len(pie_chart_data)
plt.figure()
plt.pie(
    pie_chart_data.values,
    labels=pie_chart_data.index,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    explode=explode,
    radius=1.5,
)
plt.title('Répartition des "ethnies"', loc='left', color='white')
plt.legend(bbox_to_anchor=(1.1, 1.05))
plt.show()

# Number of rows per gender.
# The column is named through x= with data=dataset: seaborn 0.12+ accepts only
# `data` positionally, so a Series passed as the first argument is no longer
# read as the x variable.
sns.countplot(x='gender', data=dataset).set(title='Part of gender')

# Same count, broken down by 'race'.
plt.figure(figsize=[12, 5])
sns.countplot(x='gender', hue='race', data=dataset).set(title='Part of gender per ethny')


# Importance given to race ('imprace') per 'race' label and per gender.
plt.figure(figsize=[7, 5])
sns.barplot(x='imprace', y='race', hue='gender', data=dataset).set(title="Importance of 'race' per gender & per ethny")
plt.legend(loc='upper right')

# Importance of race, split by whether the two partners share the same race.
sns.catplot(x='samerace', y='imprace', kind='violin', hue='race', data=dataset).set(title='Importance of race when meeting same race people')

# Importance of religion, with the same split.
sns.catplot(x='samerace', y='imprelig', kind='box', hue='race', data=dataset).set(title='Importance of religion when meeting same race people')

# Keep only the rows whose own decision is 'Yes', then count the partner's
# decisions depending on whether both share the same race.
# Series.plot draws on the current axes when a figure is already open, and the
# catplot above leaves its figure current; opening a new figure first keeps
# the bars off that box plot.
plt.figure()
dataset[dataset.dec == 'Yes'].groupby(['samerace', 'dec_o'])['samerace'].count().plot(kind='bar', title='Decision of partner if same race or not')

# Importance of race as a function of age and 'race'.
sns.relplot(x='age', y='imprace', hue='race', size='race', data=dataset).set(title='Importance of race in fonction of age and "race"')

# Partner's decision per level of 'imprace', again restricted to rows whose
# own decision is 'Yes'. A new figure is opened for the same reason as above:
# otherwise the bars land on the relplot's axes.
plt.figure()
dataset[dataset.dec == 'Yes'].groupby(['dec_o', 'imprace'])['dec_o'].count().plot.bar().set(title='Influence of importance of race on decision of partner')


# Percent-normalised histogram of 'imprace', one overlaid trace per 'race'.
histogram_options = {
    'x': "imprace",
    'color': 'race',
    'nbins': 50,
    'histnorm': 'percent',
    'barmode': 'overlay',
    'title': 'Répartition de l importance de l ethnie par ethnie',
}
fig = px.histogram(dataset, **histogram_options)

# Linear ticks on the x axis: one tick per unit, starting at 0.
fig.update_layout(xaxis={'tickmode': 'linear', 'tick0': 0, 'dtick': 1})

# Render the figure.
fig.show()


# Box plot of 'imprace' per gender, coloured by 'race'.
fig = px.box(
    dataset,
    x="gender",
    y="imprace",
    color='race',
    title='Importance of race per gender and per race',
)

# Font families and colours for the body text, the title and the legend title.
box_fonts = {
    'font_family': "Courier New",
    'font_color': "blue",
    'title_font_family': "Times New Roman",
    'title_font_color': "red",
    'legend_title_font_color': "blue",
}
fig.update_layout(**box_fonts)

# Render the figure.
fig.show()


# 3D scatter: partner's race (x), own race (y) and partner's decision (z),
# coloured by gender, with marker size driven by 'imprace'.
scatter_options = {
    'x': "race_o",
    'y': "race",
    'z': "dec_o",
    'color': 'gender',
    'size': 'imprace',
    'width': 800,
    'height': 400,
    'title': 'Decision of partner regarding race & gender ',
}
fig = px.scatter_3d(dataset, **scatter_options)

# Font families and colours for the body text, the title and the legend title.
scatter_fonts = {
    'font_family': "Courier New",
    'font_color': "black",
    'title_font_family': "Times New Roman",
    'title_font_color': "blue",
    'legend_title_font_color': "black",
}
fig.update_layout(**scatter_fonts)
fig.show()