In [29]:
from IPython.display import display, HTML
import pandas as pd

# Load the dataset into the notebook-wide frame `df`; the cells below clean and transform it.
df = pd.read_csv('dating-full.csv')

In [30]:
# Running tally of cells changed by strip_quotes; reset here so re-running the cell starts from zero.
count = 0

def strip_quotes(s: str) -> str:
    """Remove at most one leading and one trailing quote character from ``s``.

    Both single (') and double (") quotes are recognised, and the two ends are
    handled independently, so mismatched or one-sided quotes are stripped too.
    Empty strings, and strings that become empty after the leading quote is
    removed (e.g. a lone '"'), are handled without raising.

    Side effect: increments the module-level ``count`` by one whenever the
    returned string differs from the input.

    Args:
        s: The cell value to clean.

    Returns:
        ``s`` without its surrounding quote characters.
    """
    global count
    quote_chars = ('"', "'")
    stripped = s
    # startswith/endswith accept a tuple and are safe on '', unlike s[0] / s[-1].
    if stripped.startswith(quote_chars):
        stripped = stripped[1:]
    if stripped.endswith(quote_chars):
        stripped = stripped[:-1]
    # Stripping only ever shortens the string, so inequality means "was modified".
    if stripped != s:
        count += 1
    return stripped


# Columns whose cells are passed through strip_quotes.
cols_need_striped = ('race', 'race_o', 'field')
for col in cols_need_striped:
    # Element-wise map; strip_quotes also bumps the global `count` for every cell it changes.
    df[col] = df[col].map(strip_quotes)

# Report the number of modified cells, summed over all listed columns.
print(f'Quotes removed from {count} cells.')

Quotes removed from 8316 cells.


In [31]:
# Reset the shared tally so it only reflects the lower-casing pass below.
count = 0

def lowercase(s: str) -> str:
    """Return ``s`` converted to lower case.

    Side effect: adds one to the module-level ``count`` when the conversion
    actually changed the string.
    """
    global count
    lowered = s.lower()
    changed = lowered != s
    if changed:
        count = count + 1
    return lowered

# Columns whose cells are passed through lowercase (one-element tuple, hence the trailing comma).
cols_need_lowercased = ('field',)
for col in cols_need_lowercased:
    # Element-wise map; lowercase also bumps the global `count` for every cell it changes.
    df[col] = df[col].map(lowercase)

# Report the number of cells whose text changed.
print(f'Standardized {count} cells to lower case.')

Standardized 5707 cells to lower case.


In [32]:
def compile_encoding(series: pd.Series) -> dict:
    """Map each distinct value in ``series`` to an integer code.

    Codes are assigned 0, 1, 2, ... following the sorted order of the
    distinct values.
    """
    ordered_values = sorted(set(series))
    codes = range(len(ordered_values))
    return dict(zip(ordered_values, codes))

# Per-column lookup tables: encodings[col] maps each original value to its integer code.
encodings = {}

# Columns to be replaced by integer codes.
cols_need_encoded = ('gender', 'race', 'race_o', 'field')
for col in cols_need_encoded:
    # Build the table from the column's current values first, then overwrite the column with the codes.
    encodings[col] = compile_encoding(df[col])
    df[col] = df[col].map(encodings[col])

def print_encoding(col: str, label: str) -> None:
    """Print the integer code stored in the global ``encodings`` for ``label`` in column ``col``.

    Raises:
        KeyError: if ``col`` or ``label`` is missing from ``encodings``.
    """
    message = f'Value assigned for {label} in column {col}: {encodings[col][label]}'
    print(message)

# Spot-check one label per encoded column against the tables stored in `encodings`.
print_encoding('gender', 'male')
print_encoding('race', 'European/Caucasian-American')
print_encoding('race_o', 'Latino/Hispanic American')
print_encoding('field', 'law')

Value assigned for male in column gender: 1
Value assigned for European/Caucasian-American in column race: 2
Value assigned for Latino/Hispanic American in column race_o: 3
Value assigned for law in column field: 121


In [33]:
# Participant-side preference-score columns; normalised together as one group.
preference_scores_of_participant = [
    'attractive_important', 'sincere_important', 'intelligence_important',
    'funny_important', 'ambition_important', 'shared_interests_important',
]

# Partner-side preference-score columns; normalised together as a second group.
preference_scores_of_partner = [
    'pref_o_attractive', 'pref_o_sincere', 'pref_o_intelligence', 'pref_o_funny',
    'pref_o_ambitious', 'pref_o_shared_interests',
]

def normalize_preference_scores(series: pd.Series) -> pd.Series:
    """Rescale each group of preference scores in a row so the group sums to 1.

    The columns in ``preference_scores_of_participant`` are divided by their
    combined total, and likewise for ``preference_scores_of_partner``. All
    other entries are returned unchanged.

    The input is not modified: pandas does not support functions passed to
    ``DataFrame.apply`` mutating the object they receive, so the result is
    written to a copy.

    A group whose scores sum to zero is not special-cased; the division result
    is whatever the underlying numeric type produces.

    Args:
        series: One row of the frame, indexed by column name.

    Returns:
        A new Series with the same index and both groups normalised.
    """
    normalized = series.copy()
    for group in (preference_scores_of_participant, preference_scores_of_partner):
        # Totals are read from the untouched input, so already-written values never feed back in.
        group_total = sum(series[col] for col in group)
        for col in group:
            normalized[col] = series[col] / group_total
    return normalized

# axis=1 hands each row to the function as a Series; the returned rows are reassembled into the new `df`.
df = df.apply(normalize_preference_scores, axis=1)

def print_mean(col: str) -> None:
    """Print the mean of column ``col`` of the global ``df``, formatted to two decimal places."""
    report = f'Mean of column {col}: {df[col].mean(): .2f}'
    print(report)

# Report every normalised column: participant-side columns first, then partner-side.
for col in [*preference_scores_of_participant, *preference_scores_of_partner]:
    print_mean(col)

Mean of column attractive_important:  0.22
Mean of column sincere_important:  0.17
Mean of column intelligence_important:  0.20
Mean of column funny_important:  0.17
Mean of column ambition_important:  0.11
Mean of column shared_interests_important:  0.12
Mean of column pref_o_attractive:  0.22
Mean of column pref_o_sincere:  0.17
Mean of column pref_o_intelligence:  0.20
Mean of column pref_o_funny:  0.17
Mean of column pref_o_ambitious:  0.11
Mean of column pref_o_shared_interests:  0.12
