Gender distribution among participants:

import pandas as pd
import plotly.graph_objects as go

# Donut chart of how many rows carry each gender code.
# value_counts() orders its result by frequency, not by code, so the counts
# are re-sorted by code and each slice is labelled from its own code rather
# than from a fixed positional list.
# NOTE(review): assumes gender is coded 0 = Female, 1 = Male, which is the
# order the ['Female', 'Male'] labels of the groupby('gender') charts in this
# file imply -- confirm against the dataset codebook.
gender_counts = data['gender'].value_counts().sort_index()
gender_code_names = {0: 'Female', 1: 'Male'}
# Fall back to the raw code so an unexpected value is still shown, not hidden.
gender_labels = [gender_code_names.get(code, str(code)) for code in gender_counts.index]
fig = go.Figure(go.Pie(labels=gender_labels, values=gender_counts, hole=0.4))
fig.update_layout(title='Gender Distribution Among Participants')
fig.show()

Age distribution of participants:

import plotly.express as px


# Histogram of the 'age' column (nbins=20) with explicit axis titles.
age_layout = dict(
    title='Age Distribution of Participants',
    xaxis_title='Age',
    yaxis_title='Count',
)
fig = px.histogram(data_frame=data, x='age', nbins=20)
fig.update_layout(**age_layout)
fig.show()

Most important attributes for participants:

# Attribute columns and their display labels, paired position by position.
attribute_labels = ['Attractive', 'Sincere', 'Intelligent', 'Fun', 'Ambitious', 'Shared Interests']
attributes = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']

# Mean of each attribute column over all rows of `data`.
attr_means = data[attributes].mean()

# One bar per attribute; the mean is also printed on the bar itself.
importance_bar = go.Bar(
    x=attribute_labels,
    y=attr_means,
    text=attr_means,
    textposition='auto',
)
fig = go.Figure(data=[importance_bar])
fig.update_layout(
    title='Importance of Attributes',
    xaxis_title='Attributes',
    yaxis_title='Mean Importance',
)
fig.show()

Attribute preferences by gender:

# Per-gender mean of every attribute column; reset_index() turns 'gender'
# back into an ordinary column.
grouped_data = data.groupby('gender')[attributes].mean().reset_index()

# Grouped bar chart: one trace per attribute, one bar per gender group.
# NOTE(review): the x labels assume the groups come out as Female, then Male.
fig = go.Figure()
for attr_column, attr_label in zip(attributes, attribute_labels):
    gender_bars = go.Bar(
        x=['Female', 'Male'],
        y=grouped_data[attr_column],
        name=attr_label,
    )
    fig.add_trace(gender_bars)

fig.update_layout(
    title='Attribute Preferences by Gender',
    xaxis_title='Gender',
    yaxis_title='Mean Importance',
    barmode='group',
)
fig.show()


Match rates by gender:

# Mean of the 'match' column per gender group, expressed as a percentage.
match_by_gender = data.groupby('gender')['match'].mean() * 100

# NOTE(review): the x labels assume the groups come out as Female, then Male.
match_rate_bar = go.Bar(
    x=['Female', 'Male'],
    y=match_by_gender,
    text=match_by_gender,
    textposition='auto',
)
fig = go.Figure(data=[match_rate_bar])
fig.update_layout(
    title='Match Rates by Gender',
    xaxis_title='Gender',
    yaxis_title='Match Rate (%)',
)
fig.show()


Distribution of race among participants:

# Donut chart of how many rows carry each race code.
# NOTE(review): assumes 'race' holds exactly five distinct codes whose
# ascending order corresponds to the labels below -- confirm against the
# dataset codebook.
race_labels = ['Black', 'White', 'Hispanic', 'Asian', 'Other']

# value_counts() orders its result by frequency; sort by code so that the
# i-th count lines up with the i-th label instead of the i-th most common race.
race_counts = data['race'].value_counts().sort_index()

fig = go.Figure(go.Pie(labels=race_labels, values=race_counts, hole=0.4))
fig.update_layout(title='Race Distribution Among Participants')
fig.show()


Distribution of field of study among participants:

# Number of rows per field-of-study code.
field_counts = data['field_cd'].value_counts()

# Bar chart with the field code on the x axis and its row count on the y axis.
field_bar = go.Bar(x=field_counts.index, y=field_counts)
fig = go.Figure(data=field_bar)
fig.update_layout(
    title='Field of Study Distribution',
    xaxis_title='Field Code',
    yaxis_title='Count',
)
fig.show()


Participants' income distribution:

# Histogram of the 'income' column (nbins=20) with explicit axis titles.
income_layout = dict(
    title='Income Distribution of Participants',
    xaxis_title='Income',
    yaxis_title='Count',
)
fig = px.histogram(data_frame=data, x='income', nbins=20)
fig.update_layout(**income_layout)
fig.show()

Subplots: Age and income distribution:

from plotly.subplots import make_subplots

# Build each histogram with Plotly Express, then reuse its first trace in a
# one-row, two-column subplot figure.
age_hist = px.histogram(data, x='age', nbins=20)
income_hist = px.histogram(data, x='income', nbins=20)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Age Distribution', 'Income Distribution'),
)
# Age goes into column 1, income into column 2.
for subplot_col, histogram in enumerate((age_hist, income_hist), start=1):
    fig.add_trace(histogram.data[0], row=1, col=subplot_col)

fig.update_layout(title='Age and Income Distribution of Participants')
fig.show()


Heatmap of attribute preferences by race:

import plotly.figure_factory as ff

# Display labels for the heatmap axes and the attribute columns they describe.
attribute_labels = ['Attractive', 'Sincere', 'Intelligent', 'Fun', 'Ambitious', 'Shared Interests']
attributes = ['attr1_1', 'sinc1_1', 'intel1_1', 'fun1_1', 'amb1_1', 'shar1_1']
race_labels = ['Black', 'White', 'Hispanic', 'Asian', 'Other']

# Per-race mean of every attribute column: one row per race group.
grouped_data_by_race = data.groupby('race')[attributes].mean().reset_index()

# Rows are race groups, columns are attributes.
# NOTE(review): assumes the groupby yields exactly five race groups in the
# order of race_labels -- confirm against the dataset codebook.
preference_matrix = grouped_data_by_race[attributes].values
fig = ff.create_annotated_heatmap(
    z=preference_matrix,
    x=attribute_labels,
    y=race_labels,
    colorscale='Viridis',
)
fig.update_layout(
    title='Attribute Preferences by Race',
    xaxis_title='Attributes',
    yaxis_title='Race',
)
fig.show()


4- Here are three A/B test suggestions from this dataset:

A/B Test 1: Test if there is a difference in match rates between different participant age groups.

A: Participants below the median age
B: Participants above the median age
A/B Test 2: Test if there is a difference in match rates between participants with different income levels.

A: Participants with income below the median income
B: Participants with income above the median income
A/B Test 3: Test if there is a difference in match rates between participants who prioritize attraction and those who prioritize sincerity.

A: Participants with a higher preference for attraction than sincerity
B: Participants with a higher preference for sincerity than attraction

# Calculate the average attribute ratings for matches and non-matches

Conclusion 1: Participants who rated their partners higher in attributes such as attraction, sincerity, intelligence, fun, and shared interests were more likely to receive a match.


# Rating columns compared between matched and unmatched rows.
attribute_columns = ['attr', 'sinc', 'intel', 'fun', 'shar']

# Split the rows by the value of the 'match' flag.
matches = data[data['match'] == 1]
non_matches = data[data['match'] == 0]

# Column-wise mean rating within each subset.
avg_attr_matches = matches[attribute_columns].mean()
avg_attr_non_matches = non_matches[attribute_columns].mean()

print("Average attribute ratings for matches:\n", avg_attr_matches)
print("\nAverage attribute ratings for non-matches:\n", avg_attr_non_matches)

# Conclusion 1 holds only if every attribute's mean is strictly higher among
# the matched rows.
higher_ratings = avg_attr_matches > avg_attr_non_matches
conclusion_1_supported = bool(higher_ratings.all())

conclusion_1_message = (
    "\nConclusion 1: Participants who rated their partners higher in attributes were more likely to receive a match."
    if conclusion_1_supported
    else "\nConclusion 1: The data does not fully support the hypothesis."
)
print(conclusion_1_message)


# Calculate the average attribute preference correlation for matches and non-matches

Conclusion 2: Participants who had similar preferences in attributes had a higher chance of matching with each other.
The code below checks this conclusion.

# Mean of the 'int_corr' column within the matched and unmatched subsets
# (`matches` / `non_matches` are the match == 1 / match == 0 rows of `data`).
int_corr_matches, int_corr_non_matches = (
    subset['int_corr'].mean() for subset in (matches, non_matches)
)

print("Average attribute preference correlation for matches:", int_corr_matches)
print("Average attribute preference correlation for non-matches:", int_corr_non_matches)

# Conclusion 2 holds when the matched rows have the strictly higher mean.
conclusion_2_supported = int_corr_matches > int_corr_non_matches

conclusion_2_message = (
    "\nConclusion 2: Participants who had similar preferences in attributes had a higher chance of matching with each other."
    if conclusion_2_supported
    else "\nConclusion 2: The data does not fully support the hypothesis."
)
print(conclusion_2_message)

Here's a code snippet that creates a Plotly pie chart of the proportion of participants who accepted a second date. The counts are computed per age, but the pie sums them across all ages:

import plotly.express as px

# Label every row by whether `date_3` equals 1; any other value is labelled
# 'Rejected'.
# NOTE(review): a missing `date_3` also ends up as 'Rejected' -- confirm that
# this is the intended treatment of unanswered rows.
data['accepted_second_date'] = data['date_3'].apply(lambda x: 'Accepted' if x == 1 else 'Rejected')

# Count rows for every (age, label) combination.
age_second_date_counts = data.groupby(['age', 'accepted_second_date']).size().reset_index(name='counts')

# The pie has one slice per label, so the per-age counts are summed across
# ages.  Colours are bound to the label itself rather than to the order in
# which labels first appear in the frame; a positional colour sequence would
# swap green and red whenever the first row happens to be 'Rejected'.
fig = px.pie(
    age_second_date_counts,
    values='counts',
    names='accepted_second_date',
    title='Second Date Acceptance by Age',
    color='accepted_second_date',
    color_discrete_map={'Accepted': 'green', 'Rejected': 'red'},
)
fig.show()


Second-date acceptance by profession, shown as Plotly pie charts:
    
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Count 'Accepted' / 'Rejected' rows for each career category.  Relies on the
# 'accepted_second_date' column already being present on `data`.
career_second_date_counts = data.groupby(['career_c', 'accepted_second_date']).size().reset_index(name='counts')

# Distinct career codes, in the order they appear in the grouped frame.
unique_careers = career_second_date_counts['career_c'].unique()

# Size the grid from the data instead of fixing it at 4x4: a fixed grid makes
# add_trace fail as soon as there are more than 16 career categories.
cols = 4
rows = max(1, -(-len(unique_careers) // cols))  # ceiling division, at least one row
subplot_titles = [f"Career {int(c)}" for c in unique_careers]
# Pie traces need 'domain'-type cells; build a separate dict for every cell
# so no two cells share the same spec object.
specs = [[{'type': 'domain'} for _ in range(cols)] for _ in range(rows)]
fig = make_subplots(rows=rows, cols=cols, specs=specs, subplot_titles=subplot_titles)

# Fill the grid left to right, top to bottom: one pie per career category.
for i, career in enumerate(unique_careers):
    career_data = career_second_date_counts[career_second_date_counts['career_c'] == career]
    pie = go.Pie(labels=career_data['accepted_second_date'], values=career_data['counts'], name=f"Career {int(career)}")
    grid_row, grid_col = divmod(i, cols)
    fig.add_trace(pie, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(title='Second Date Acceptance by Profession')
fig.show()


To perform A/B tests, we will use the statsmodels.stats.proportion module. Here are three A/B test examples based on the dataset:

Test whether the proportion of matches differs between participants who have the same race and those who have different races.
Test whether the proportion of matches differs between participants who have the same field of study and those who have different fields of study.
Test whether the proportion of matches differs between participants who rated their partner's attractiveness above the median and those who rated it below the median.



import numpy as np
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

# A/B Test 1: Same race vs different race
# Split the rows on the 'samerace' flag.
same_race = data[data['samerace'] == 1]
diff_race = data[data['samerace'] == 0]

# For each group: the sum of the 'match' column and the number of rows.
count_same_race, n_same_race = np.sum(same_race['match']), len(same_race)
count_diff_race, n_diff_race = np.sum(diff_race['match']), len(diff_race)

# Two-sample z-test on the two match proportions.
stat, pval = proportions_ztest(
    count=[count_same_race, count_diff_race],
    nobs=[n_same_race, n_diff_race],
)
print("A/B Test 1 - Same race vs different race:")
print(f"z-statistic: {stat}, p-value: {pval}\n")

# A/B Test 2: Same field of study vs different field of study
# A missing field code never compares equal to anything, so a row where
# either code is missing would be counted as "different field" and bias that
# group.  Both groups are therefore restricted to rows where both codes are
# present.
field_known = data['field_cd'].notna() & data['partner_field_cd'].notna()
data['same_field'] = data['field_cd'] == data['partner_field_cd']
same_field = data[field_known & data['same_field']]
diff_field = data[field_known & ~data['same_field']]

# For each group: the sum of the 'match' column and the number of rows.
count_same_field = np.sum(same_field['match'])
count_diff_field = np.sum(diff_field['match'])

n_same_field = len(same_field)
n_diff_field = len(diff_field)

# Two-sample z-test on the two match proportions.
stat, pval = proportions_ztest([count_same_field, count_diff_field], [n_same_field, n_diff_field])
print("A/B Test 2 - Same field of study vs different field of study:")
print(f"z-statistic: {stat}, p-value: {pval}\n")

# A/B Test 3: Attractiveness above median vs below median
# Rows strictly above the median 'attr' rating form the "high" group; rows at
# or below it form the "low" group.
median_attr = data['attr'].median()
above_median = data['attr'] > median_attr
at_or_below_median = data['attr'] <= median_attr
high_attr = data[above_median]
low_attr = data[at_or_below_median]

# For each group: the sum of the 'match' column and the number of rows.
count_high_attr, n_high_attr = np.sum(high_attr['match']), len(high_attr)
count_low_attr, n_low_attr = np.sum(low_attr['match']), len(low_attr)

# Two-sample z-test on the two match proportions.
stat, pval = proportions_ztest(
    count=[count_high_attr, count_low_attr],
    nobs=[n_high_attr, n_low_attr],
)
print("A/B Test 3 - Attractiveness above median vs below median:")
print(f"z-statistic: {stat}, p-value: {pval}\n")



# Re-run the three z-tests and collect their results into one table.
# Each entry: (test name, match counts per group, row counts per group).
ztest_inputs = [
    ("Same race vs different race",
     [count_same_race, count_diff_race], [n_same_race, n_diff_race]),
    ("Same field of study vs different field of study",
     [count_same_field, count_diff_field], [n_same_field, n_diff_field]),
    ("Attractiveness above median vs below median",
     [count_high_attr, count_low_attr], [n_high_attr, n_low_attr]),
]

ab_test_results = []
for test_name, match_counts, group_sizes in ztest_inputs:
    stat, pval = proportions_ztest(match_counts, group_sizes)
    ab_test_results.append({"Test": test_name, "z-statistic": stat, "p-value": pval})

# One row per test.
ab_test_results_df = pd.DataFrame(ab_test_results)
print(ab_test_results_df)


The same A/B tests using a different library (SciPy's chi-squared test):

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# 2x2 contingency tables, one per A/B test.  Each row is a group; the columns
# are [sum of 'match', group size minus that sum].
contingency_table_1 = np.array([
    [count_same_race, n_same_race - count_same_race],
    [count_diff_race, n_diff_race - count_diff_race],
])
contingency_table_2 = np.array([
    [count_same_field, n_same_field - count_same_field],
    [count_diff_field, n_diff_field - count_diff_field],
])
contingency_table_3 = np.array([
    [count_high_attr, n_high_attr - count_high_attr],
    [count_low_attr, n_low_attr - count_low_attr],
])

chi2_inputs = [
    ("Same race vs different race", contingency_table_1),
    ("Same field of study vs different field of study", contingency_table_2),
    ("Attractiveness above median vs below median", contingency_table_3),
]

# Run the chi-squared test on every table, keeping only the statistic and
# the p-value.
ab_test_results = []
for test_name, table in chi2_inputs:
    chi2, pval, _, _ = chi2_contingency(table)
    ab_test_results.append({"Test": test_name, "Chi2": chi2, "p-value": pval})

# One row per test.
ab_test_results_df = pd.DataFrame(ab_test_results)
print(ab_test_results_df)


The underscores in the code snippet are placeholders used when unpacking the values 
returned by the chi2_contingency function. The function returns four values: 
    the chi-squared test statistic, the p-value, the degrees of freedom, and the expected frequencies. 
    In this case, we are only interested in the chi-squared test statistic and the p-value, 
    so we use underscores as placeholders for the degrees of freedom and the expected frequencies to ignore them.

# The same call with all four return values named instead of discarded.
# `contingency_table_1` (the same-race table built earlier in this file) is
# used because no variable named `contingency_table` is defined.
chi2, pval, degrees_of_freedom, expected_frequencies = chi2_contingency(contingency_table_1)
