In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
# Read the unprocessed speed-dating export; the file is decoded as ISO-8859-1.
raw_csv_path = "../data/speed_dating_data_unprocessed.csv"
dataset = pd.read_csv(raw_csv_path, encoding="ISO-8859-1")

# Preview the first rows to sanity-check the load.
display(dataset.head())

Unnamed: 0,iid,id,gender,idg,condtn,wave,round,position,positin1,order,...,attr3_3,sinc3_3,intel3_3,fun3_3,amb3_3,attr5_3,sinc5_3,intel5_3,fun5_3,amb5_3
0,1,1.0,0,1,1,1,10,7,,4,...,5.0,7.0,7.0,7.0,7.0,,,,,
1,1,1.0,0,1,1,1,10,7,,3,...,5.0,7.0,7.0,7.0,7.0,,,,,
2,1,1.0,0,1,1,1,10,7,,10,...,5.0,7.0,7.0,7.0,7.0,,,,,
3,1,1.0,0,1,1,1,10,7,,5,...,5.0,7.0,7.0,7.0,7.0,,,,,
4,1,1.0,0,1,1,1,10,7,,7,...,5.0,7.0,7.0,7.0,7.0,,,,,


---

## Which attributes do participants place most importance on when deciding to see a partner again?

In [21]:
# Work on an independent copy holding the decision flag ("dec") and the six
# attribute columns, so the raw `dataset` frame is never modified.
columns_to_keep = ["dec", "attr", "sinc", "intel", "fun", "amb", "shar"]
df_attributes = dataset.loc[:, columns_to_keep].copy()

# Percentage of missing entries per column, relative to the total row count.
n_rows = len(df_attributes)
missing_counts = df_attributes.isna().sum()

print("Missing values ratio (%) : ")
print(100 * missing_counts / n_rows)

Missing values ratio (%) : 
dec       0.000000
attr      2.411077
sinc      3.306278
intel     3.533063
fun       4.177608
amb       8.498448
shar     12.735736
dtype: float64


In [22]:
# Fill the gaps in the attribute ratings with each column's median.
# Only the rating columns are imputed: "dec" is the target (it has no missing
# values in the report above) and must not go through a numeric median.
rating_columns = ["attr", "sinc", "intel", "fun", "amb", "shar"]
df_attributes[rating_columns] = df_attributes[rating_columns].fillna(
    df_attributes[rating_columns].median()
)

# Relabel the decision from the raw 0/1 column of `dataset` (same index as
# `df_attributes`, which is a copy of its columns). Reading from the raw frame
# keeps this cell idempotent: re-running it no longer tries to take the median
# of a string column or to map already-relabelled values.
df_attributes["dec"] = dataset["dec"].map({0: "No", 1: "Yes"})

display(df_attributes.head())

Unnamed: 0,dec,attr,sinc,intel,fun,amb,shar
0,Yes,6.0,9.0,7.0,7.0,6.0,5.0
1,Yes,7.0,8.0,7.0,8.0,5.0,6.0
2,Yes,5.0,8.0,9.0,8.0,5.0,7.0
3,Yes,7.0,6.0,8.0,7.0,6.0,8.0
4,Yes,5.0,6.0,7.0,7.0,6.0,6.0


In [23]:
# Mean of every attribute score, split by decision. `as_index=False` keeps
# "dec" as a regular column so the result can be melted directly.
score_columns = ["attr", "sinc", "intel", "fun", "amb", "shar"]
df_grouped = df_attributes.groupby("dec", as_index=False)[score_columns].mean()

display(df_grouped.head())

Unnamed: 0,dec,attr,sinc,intel,fun,amb,shar
0,No,5.392181,6.865021,7.072119,5.773148,6.54177,4.908025
1,Yes,7.281239,7.589824,7.747157,7.327032,7.148238,6.416572


In [24]:
# Human-readable label for each attribute column code.
attr_transcode = {
    "attr": "Attractive",
    "sinc": "Sincere",
    "intel": "Intelligent",
    "fun": "Fun",
    "amb": "Ambitious",
    "shar": "Shared interests",
}

# Reshape to long format: one row per (decision, attribute) pair, with the
# average score in the default "value" column.
df_attribute_decision = df_grouped.melt(
    id_vars=["dec"],
    value_vars=list(attr_transcode),
    var_name="attribute",
)

# Swap the column codes for their display labels.
df_attribute_decision["attribute"] = df_attribute_decision["attribute"].map(attr_transcode)

display(df_attribute_decision.head())

Unnamed: 0,dec,attribute,value
0,No,Attractive,5.392181
1,Yes,Attractive,7.281239
2,No,Sincere,6.865021
3,Yes,Sincere,7.589824
4,No,Intelligent,7.072119


In [32]:
# Grouped bar chart: average score of each attribute, one bar per decision.
decision_colors = {
    "No": px.colors.qualitative.Plotly[1],
    "Yes": px.colors.qualitative.G10[0],
}

fig_1 = px.bar(
    df_attribute_decision,
    x="attribute",
    y="value",
    color="dec",
    text="value",
    barmode="group",
    color_discrete_map=decision_colors,
)

# Bar labels are rendered with three significant digits.
fig_1.update_traces(texttemplate="%{text:.3}")

fig_1.update_layout(
    title="Average attribute score according to decision",
    xaxis_title="Attribute",
    yaxis_title="Average score (out of 10)",
    legend_title="Decision",
)

fig_1.show()

First, every attribute has a higher average score when the participant wanted a second date than when they did not. What matters most here, however, is which attribute shows the largest gap between the two groups, since the size of that gap indicates how much the attribute weighs in the decision.

With an average of 7.28 points (out of 10) when there's a desire for a second date versus 5.39 when there isn't, it's a person's attractiveness that seems to be the most decisive. Given these figures, we're going to push our analysis further on this attribute.

In [26]:
# --- Share of positive decisions per attractiveness score --------------------
# Bucket the raw attractiveness ratings into ten unit-wide, right-closed bins
# and label each bin with its upper edge (1..10), so the labels are derived
# from the bin edges instead of being hard-coded separately.
# `include_lowest=True` keeps any rating of exactly 0 in the first bin; without
# it the first interval is (0, 1] and such rows are silently dropped.
attr_bin_edges = np.arange(0, 11)
attr_bins = pd.cut(
    dataset["attr"],
    bins=attr_bin_edges,
    labels=attr_bin_edges[1:],
    include_lowest=True,
)

# Mean of the 0/1 decision per bin, scaled by 100 to read as a percentage.
df_attractive = (
    dataset.groupby(attr_bins, observed=False)["dec"]
    .mean()
    .mul(100)
    .reset_index()
)
# The bin labels come back as a categorical; the OLS trendline needs numbers.
df_attractive["attr"] = df_attractive["attr"].astype(int)

# --- Figure 3: distribution of attractiveness scores by decision -------------
# Uses `df_attributes`, where "dec" is already relabelled to "No"/"Yes".
fig_3 = px.box(
    df_attributes[["dec", "attr"]],
    y="attr",
    color="dec",
    color_discrete_map={"No":px.colors.qualitative.Plotly[1], "Yes":px.colors.qualitative.G10[0]}
)

fig_3.update_layout(
    title="Attractiveness score distribution according to decision",
    yaxis_title="Attractiveness score",
    legend_title="Decision"
)

# --- Figure 4: positive-decision percentage vs. attractiveness score ---------
fig_4 = px.scatter(
    df_attractive,
    x="attr",
    y="dec",
    trendline="ols",
    color_discrete_sequence=px.colors.qualitative.G10
)

fig_4.update_layout(
    title="Positive decision percentage according to attractiveness score",
    xaxis_title="Attractiveness score (out of 10)",
    yaxis_title="Positive decision percentage (%)"
)

# Report the R² of the OLS fit. The annotation is anchored in paper
# coordinates (top-left corner of the plotting area) rather than at fixed data
# coordinates, so it stays visible whatever the data range is.
ols_fit = px.get_trendline_results(fig_4).px_fit_results.iloc[0]
fig_4.add_annotation(
    xref="paper", yref="paper",
    x=0.01, y=0.99,
    xanchor="left", yanchor="top",
    text=f"R2 : {round(ols_fit.rsquared, 2)}",
    showarrow=False
)

fig_3.show()
fig_4.show()

In the first graph, we are no longer interested only in an average score, but in the distribution of all scores according to the decision to see the partner again. The two boxes clearly differ. When there is a desire for a second date, 75% of people have an attractiveness score higher than 6, and 50% of them have a score of 7 or more; the box is narrow (between 6 and 8). In the opposite case, the box is wider and lower, with 50% of the ratings between 4 and 7 (whereas 7 is only the median when a second date is wanted). There are 2 counter-examples in the first box, i.e. people who received a low attractiveness score but whose partner still wanted a second date; these are outliers.

In the second graph, we can clearly see that the higher the attractiveness score, the greater the chance that a second date will be proposed. From a score of 7, the chance already exceeds 50%; for a score of 8 it exceeds 70%, and it rises to 80% when a person has an attractiveness score of 9.

Participants place the most importance on one attribute in particular: attractiveness.