In [1]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
ok_cupid_df = pd.read_csv('data/okcupid_profiles.csv')
ok_cupid_df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks', 'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
       'essay5', 'essay6', 'essay7', 'essay8', 'essay9'],
      dtype='object')

In [3]:
ok_cupid_df.shape

(59946, 31)

In [4]:
ok_cupid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   status       59946 non-null  object 
 2   sex          59946 non-null  object 
 3   orientation  59946 non-null  object 
 4   body_type    54650 non-null  object 
 5   diet         35551 non-null  object 
 6   drinks       56961 non-null  object 
 7   drugs        45866 non-null  object 
 8   education    53318 non-null  object 
 9   ethnicity    54266 non-null  object 
 10  height       59943 non-null  float64
 11  income       59946 non-null  int64  
 12  job          51748 non-null  object 
 13  last_online  59946 non-null  object 
 14  location     59946 non-null  object 
 15  offspring    24385 non-null  object 
 16  pets         40025 non-null  object 
 17  religion     39720 non-null  object 
 18  sign         48890 non-null  object 
 19  smok

In [5]:
ok_cupid_df.head(2)

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,about me: i would love to think that i was so...,currently working as an international agent fo...,making people laugh. ranting about a good salt...,"the way i look. i am a six foot half asian, ha...","books: absurdistan, the republic, of mice and ...",food. water. cell phone. shelter.,duality and humorous things,trying to find someone to hang out with. i am ...,i am new to california and looking for someone...,you want to be swept off your feet! you are ti...
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,i am a chef: this is what that means. 1. i am ...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,i am die hard christopher moore fan. i don't r...,delicious porkness in all of its glories. my b...,,,i am very open and will share just about anyth...,


### Check null values:

In [6]:
ok_cupid_df.isna().sum()

age                0
status             0
sex                0
orientation        0
body_type       5296
diet           24395
drinks          2985
drugs          14080
education       6628
ethnicity       5680
height             3
income             0
job             8198
last_online        0
location           0
offspring      35561
pets           19921
religion       20226
sign           11056
smokes          5512
speaks            50
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64

In [7]:
ok_cupid_df.isna().sum()/ok_cupid_df.shape[0]*100

age             0.000000
status          0.000000
sex             0.000000
orientation     0.000000
body_type       8.834618
diet           40.694959
drinks          4.979482
drugs          23.487806
education      11.056618
ethnicity       9.475194
height          0.005005
income          0.000000
job            13.675641
last_online     0.000000
location        0.000000
offspring      59.321723
pets           33.231575
religion       33.740366
sign           18.443266
smokes          9.194942
speaks          0.083408
essay0          9.154906
essay1         12.631368
essay2         16.077803
essay3         19.143896
essay4         17.577486
essay5         18.099623
essay6         22.972342
essay7         20.770360
essay8         32.070530
essay9         21.023922
dtype: float64

### Fill null values:

In [8]:
ok_cupid_df['drugs'] = ok_cupid_df['drugs'].fillna('unknown_drugs')
ok_cupid_df['drugs'].value_counts()

never            37724
unknown_drugs    14080
sometimes         7732
often              410
Name: drugs, dtype: int64

In [9]:
ok_cupid_df['diet'] = ok_cupid_df['diet'].fillna('unknowndiet')
ok_cupid_df['diet'].value_counts()

unknowndiet            24395
mostly anything        16585
anything                6183
strictly anything       5113
mostly vegetarian       3444
mostly other            1007
strictly vegetarian      875
vegetarian               667
strictly other           452
mostly vegan             338
other                    331
strictly vegan           228
vegan                    136
mostly kosher             86
mostly halal              48
strictly halal            18
strictly kosher           18
kosher                    11
halal                     11
Name: diet, dtype: int64

In [10]:
#ok_cupid_df.loc[(ok_cupid_df['diet'] == 'unknown')&(ok_cupid_df['essay0'] == '57'), 'status'] = 'available'
ok_cupid_df.loc[ok_cupid_df['essay0'] == "im looking for someone to share some raging adhd. im a self motivated and light hearted superhero who enjoy's riding my bike everywhere and eating every goddamn thing i can.  im looking for someone to go adventuring with. i enjoy blind drunken adventures sometimes but you dont have to be a drinker. no vegans, i will eat anything... including people... especially hipsters. im not really a nerd (i don't play magic cards/excessive videogames) but i can like nerdy girls.  i just got this account, so gimmie some time to write down more shenanigans that are important  if u make chiptunes hit me the fuck up! i wanna make some!  i am awesome, eccentric, and energetic", 'diet'] = 'strictly anything'

In [11]:
ok_cupid_df.loc[ok_cupid_df['essay0'] == "rabid bibliophile, humorless feminist (that's a joke), eternal student. i like to write poetry on people, bake (vegan) cupcakes, make art and dress-up.  i identify as queer but my choices here are limited so i chose bisexual.  i am quiet, empathetic, and geeky", 'diet'] = 'vegan'

In [12]:
#ok_cupid_df[ok_cupid_df['diet'] == 'unknown_diet']['essay0'].tolist()

In [13]:
ok_cupid_df['status'].value_counts()

single            55697
seeing someone     2064
available          1865
married             310
unknown              10
Name: status, dtype: int64

In [14]:
ok_cupid_df["status"].replace({'unknown': 'unknown_status'}, inplace=True)

In [15]:
ok_cupid_df['status'].value_counts()

single            55697
seeing someone     2064
available          1865
married             310
unknown_status       10
Name: status, dtype: int64

In [16]:
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].fillna('unknown_body_type')
ok_cupid_df['body_type'].value_counts()

average              14652
fit                  12711
athletic             11819
unknown_body_type     5296
thin                  4711
curvy                 3924
a little extra        2629
skinny                1777
full figured          1009
overweight             444
jacked                 421
used up                355
rather not say         198
Name: body_type, dtype: int64

In [17]:
ok_cupid_df['education'] = ok_cupid_df['education'].fillna('unknown_education')
ok_cupid_df['education'].value_counts()

graduated from college/university    23959
graduated from masters program        8961
unknown_education                     6628
working on college/university         5712
working on masters program            1683
graduated from two-year college       1531
graduated from high school            1428
graduated from ph.d program           1272
graduated from law school             1122
working on two-year college           1074
dropped out of college/university      995
working on ph.d program                983
college/university                     801
graduated from space camp              657
dropped out of space camp              523
graduated from med school              446
working on space camp                  445
working on law school                  269
two-year college                       222
working on med school                  212
dropped out of two-year college        191
dropped out of masters program         140
masters program                        136
dropped out

In [18]:
ok_cupid_df['job'] = ok_cupid_df['job'].fillna('unknown_job')
#ok_cupid_df['job'].value_counts()

In [19]:
ok_cupid_df['ethnicity'] = ok_cupid_df['ethnicity'].fillna('unknown_ethnicity')
#ok_cupid_df['ethnicity'].value_counts()

In [20]:
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].fillna('unknown_offspring')
#ok_cupid_df['offspring'].value_counts()

In [21]:
ok_cupid_df['pets'] = ok_cupid_df['pets'].fillna('unknown_pets')
#ok_cupid_df['pets'].value_counts()

In [22]:
ok_cupid_df['religion'] = ok_cupid_df['religion'].fillna('unknown_religion')
#ok_cupid_df['religion'].value_counts()

In [23]:
ok_cupid_df['sign'] = ok_cupid_df['sign'].fillna('unknown_sign')
#ok_cupid_df['sign'].value_counts()

In [24]:
ok_cupid_df['smokes'] = ok_cupid_df['smokes'].fillna('unknown_smokes')
ok_cupid_df['smokes'].value_counts()

no                43896
unknown_smokes     5512
sometimes          3787
when drinking      3040
yes                2231
trying to quit     1480
Name: smokes, dtype: int64

In [25]:
ok_cupid_df['drinks'] = ok_cupid_df['drinks'].fillna('unknown_drinks')
ok_cupid_df['drinks'].value_counts()

socially          41780
rarely             5957
often              5164
not at all         3267
unknown_drinks     2985
very often          471
desperately         322
Name: drinks, dtype: int64

In [26]:
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].fillna('unknown_speaks')
ok_cupid_df['speaks'].value_counts()

english                                                                                           21828
english (fluently)                                                                                 6628
english (fluently), spanish (poorly)                                                               2059
english (fluently), spanish (okay)                                                                 1917
english (fluently), spanish (fluently)                                                             1288
                                                                                                  ...  
english (fluently), farsi (fluently), urdu (okay), other (fluently)                                   1
english (fluently), spanish (fluently), portuguese (fluently), french (okay), russian (poorly)        1
english (fluently), icelandic (okay), thai (okay)                                                     1
english, spanish, hindi, tamil                                  

In [27]:
ok_cupid_df[ok_cupid_df['height'].isna()]

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9
36428,32,single,f,straight,unknown_body_type,unknowndiet,unknown_drinks,unknown_drugs,unknown_education,other,...,,,,,"thomas bernhard, foucault, annie hall, taxi dr...",,consciousness,,i passionately hate liars!,you know what my user name means and if you ar...
54002,25,single,m,straight,unknown_body_type,unknowndiet,unknown_drinks,never,unknown_education,hispanic / latin,...,,,,,,,,,,
58983,49,single,m,straight,unknown_body_type,unknowndiet,unknown_drinks,unknown_drugs,unknown_education,unknown_ethnicity,...,"great guy, lots of positive attributes*, but s...",living it. quite a bit more than that - more ...,"lots, notably good, deep, excellent communicat...","some positive stuff, but i'll hold my tongue o...",lots. not especially up to listing 'em here a...,"1. damn good friend, or better 2. managing to ...",many things. maybe too much. not really up for...,"at the moment, i'd rather not even say or thin...",i have a blog of much that's personal and priv...,you've good reason to think we'd like make at ...


In [28]:
ok_cupid_df.isna().sum()

age                0
status             0
sex                0
orientation        0
body_type          0
diet               0
drinks             0
drugs              0
education          0
ethnicity          0
height             3
income             0
job                0
last_online        0
location           0
offspring          0
pets               0
religion           0
sign               0
smokes             0
speaks             0
essay0          5488
essay1          7572
essay2          9638
essay3         11476
essay4         10537
essay5         10850
essay6         13771
essay7         12451
essay8         19225
essay9         12603
dtype: int64

### Check for duplicalted rows:

In [29]:
ok_cupid_df[ok_cupid_df.duplicated()]

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9


### Transform essays:

In [30]:
ok_cupid_df['essay0'] = ok_cupid_df['essay0'].fillna('')
ok_cupid_df['essay1'] = ok_cupid_df['essay1'].fillna('')
ok_cupid_df['essay2'] = ok_cupid_df['essay2'].fillna('')
ok_cupid_df['essay3'] = ok_cupid_df['essay3'].fillna('')
ok_cupid_df['essay4'] = ok_cupid_df['essay4'].fillna('')
ok_cupid_df['essay5'] = ok_cupid_df['essay5'].fillna('')
ok_cupid_df['essay6'] = ok_cupid_df['essay6'].fillna('')
ok_cupid_df['essay7'] = ok_cupid_df['essay7'].fillna('')
ok_cupid_df['essay8'] = ok_cupid_df['essay8'].fillna('')
ok_cupid_df['essay9'] = ok_cupid_df['essay9'].fillna('')

In [31]:
ok_cupid_df['essay0'].str.len()

0        1251
1         661
2        1392
3          41
4         210
         ... 
59941     269
59942     538
59943     963
59944     218
59945     975
Name: essay0, Length: 59946, dtype: int64

In [32]:
ok_cupid_df['essay0'] = ok_cupid_df['essay0'].str.len()
ok_cupid_df['essay1'] = ok_cupid_df['essay1'].str.len()
ok_cupid_df['essay2'] = ok_cupid_df['essay2'].str.len()
ok_cupid_df['essay3'] = ok_cupid_df['essay3'].str.len()
ok_cupid_df['essay4'] = ok_cupid_df['essay4'].str.len()
ok_cupid_df['essay5'] = ok_cupid_df['essay5'].str.len()
ok_cupid_df['essay6'] = ok_cupid_df['essay6'].str.len()
ok_cupid_df['essay7'] = ok_cupid_df['essay7'].str.len()
ok_cupid_df['essay8'] = ok_cupid_df['essay8'].str.len()
ok_cupid_df['essay9'] = ok_cupid_df['essay9'].str.len()

In [33]:
#ok_cupid_df = ok_cupid_df.drop(columns=['essay0', 'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7', 'essay8', 'essay9'])

In [34]:
ok_cupid_df.isna().sum()

age            0
status         0
sex            0
orientation    0
body_type      0
diet           0
drinks         0
drugs          0
education      0
ethnicity      0
height         3
income         0
job            0
last_online    0
location       0
offspring      0
pets           0
religion       0
sign           0
smokes         0
speaks         0
essay0         0
essay1         0
essay2         0
essay3         0
essay4         0
essay5         0
essay6         0
essay7         0
essay8         0
essay9         0
dtype: int64

### Drop 3 rows with null values:

In [35]:
ok_cupid_df.dropna(inplace=True)

#### Check duplicated columns:

In [36]:
ok_cupid_df.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [37]:
ok_cupid_df[ok_cupid_df.duplicated()]

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9


In [38]:
ok_cupid_df['income'].value_counts()

-1          48439
 20000       2952
 100000      1621
 80000       1111
 30000       1048
 40000       1005
 50000        975
 60000        736
 70000        707
 150000       631
 1000000      521
 250000       149
 500000        48
Name: income, dtype: int64

In [39]:
ok_cupid_df.shape

(59943, 31)

In [40]:
#for column in ok_cupid_df.columns:
    
#    plt.figure()
#    plt.hist(ok_cupid_df[column], bins=25)
#    plt.title(f'Histogram of {column}')
#    plt.show()

#### Add binary column for gender (and drop existing sex column):

In [41]:
# Add new binary column for gender
ok_cupid_df["male"] = np.where(ok_cupid_df["sex"]=="m", 1, 0)

# Drop the sex column
ok_cupid_df.drop(columns="sex", inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,status,orientation,body_type,diet,drinks,drugs,education,ethnicity,height,...,essay1,essay2,essay3,essay4,essay5,essay6,essay7,essay8,essay9,male
0,22,single,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,...,253,114,124,381,33,27,78,71,135,1
1,35,single,straight,average,mostly other,often,sometimes,working on space camp,white,70.0,...,52,96,0,249,276,0,0,50,0,1


In [42]:
ok_cupid_df.reset_index(drop=True, inplace=True)

## Encoding:

In [43]:
from sklearn.preprocessing import OneHotEncoder

# Instantiate OneHotEncoder
orientation_ohe = OneHotEncoder()
status_ohe = OneHotEncoder()
job_ohe = OneHotEncoder()

# Fit the OneHotEncoder and transform
orientation = pd.DataFrame(ok_cupid_df['orientation'])
orientation_encoded = orientation_ohe.fit_transform(orientation)
display(orientation_encoded)

status = pd.DataFrame(ok_cupid_df['status'])
status_encoded = status_ohe.fit_transform(status)
display(status_encoded)

job = pd.DataFrame(ok_cupid_df['job'])
job_encoded = job_ohe.fit_transform(job)
display(job_encoded)

<59943x3 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

<59943x5 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

<59943x22 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

In [44]:
# Put into a dataframe to get column names
encoded_df_orientation = pd.DataFrame(orientation_encoded.toarray().astype(int), columns=orientation_ohe.categories_[0], dtype=int)
encoded_df_orientation = encoded_df_orientation.drop(encoded_df_orientation.columns[0], axis=1)
display(encoded_df_orientation.head(2))

# Status
encoded_df_status = pd.DataFrame(status_encoded.toarray().astype(int), columns=status_ohe.categories_[0], dtype=int)
encoded_df_status = encoded_df_status.drop(columns='unknown_status')
display(encoded_df_status.head(2))

# Jobs
encoded_df_job = pd.DataFrame(job_encoded.toarray().astype(int), columns=job_ohe.categories_[0], dtype=int)
encoded_df_job = encoded_df_job.drop(columns='unknown_job')
display(encoded_df_job.head(2))

Unnamed: 0,gay,straight
0,0,1
1,0,1


Unnamed: 0,available,married,seeing someone,single
0,0,0,0,1
1,0,0,0,1


Unnamed: 0,artistic / musical / writer,banking / financial / real estate,clerical / administrative,computer / hardware / software,construction / craftsmanship,education / academia,entertainment / media,executive / management,hospitality / travel,law / legal services,...,military,other,political / government,rather not say,retired,sales / marketing / biz dev,science / tech / engineering,student,transportation,unemployed
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
encoded_df_job.rename(columns= {'other': 'otherjob'}, inplace = True)

In [46]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_orientation], axis=1)
ok_cupid_df.drop(columns='orientation', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,status,body_type,diet,drinks,drugs,education,ethnicity,height,income,...,essay3,essay4,essay5,essay6,essay7,essay8,essay9,male,gay,straight
0,22,single,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,...,124,381,33,27,78,71,135,1,0,1
1,35,single,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,...,0,249,276,0,0,50,0,1,0,1


In [47]:
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_status], axis=1)
ok_cupid_df.drop(columns='status', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,job,...,essay7,essay8,essay9,male,gay,straight,available,married,seeing someone,single
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,transportation,...,78,71,135,1,0,1,0,0,0,1
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,hospitality / travel,...,0,50,0,1,0,1,0,0,0,1


In [48]:
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_job], axis=1)
ok_cupid_df.drop(columns='job', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,last_online,...,military,otherjob,political / government,rather not say,retired,sales / marketing / biz dev,science / tech / engineering,student,transportation,unemployed
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,2012-06-28-20-30,...,0,0,0,0,0,0,0,0,1,0
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,2012-06-29-21-41,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#ok_cupid_df.columns

In [50]:
from sklearn.feature_extraction.text import CountVectorizer

In [51]:
# 1. Instantiate 
ethnicity = CountVectorizer()

# 2. Fit 
ethnicity.fit(ok_cupid_df["ethnicity"])

# 3. Transform
ethnicity_transformed = ethnicity.transform(ok_cupid_df["ethnicity"])
ethnicity_transformed

<59943x14 sparse matrix of type '<class 'numpy.int64'>'
	with 78197 stored elements in Compressed Sparse Row format>

In [52]:
ethnicity_transformed.toarray()

array([[0, 1, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

In [53]:
ethnicity_df = pd.DataFrame(columns=ethnicity.get_feature_names(), data=ethnicity_transformed.toarray())

# Drop one column to prevent redundant information
ethnicity_df = ethnicity_df.drop(columns='unknown_ethnicity')

ethnicity_df = ethnicity_df.rename(columns={'american': 'native_american', 'eastern': 'middle_eastern', 'islander': 'pacific_islander', 'hispanic': 'hispanic_latin'})
ethnicity_df = ethnicity_df.drop(columns=['native', 'middle', 'pacific', 'latin'])
ethnicity_df.head(2)

Unnamed: 0,native_american,asian,black,middle_eastern,hispanic_latin,indian,pacific_islander,other,white
0,0,1,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1


In [54]:
ethnicity_df.rename(columns = {'other':'other_ethnicity'}, inplace = True)

In [55]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, ethnicity_df], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,income,last_online,...,unemployed,native_american,asian,black,middle_eastern,hispanic_latin,indian,pacific_islander,other_ethnicity,white
0,22,a little extra,strictly anything,socially,never,working on college/university,"asian, white",75.0,-1,2012-06-28-20-30,...,0,0,1,0,0,0,0,0,0,1
1,35,average,mostly other,often,sometimes,working on space camp,white,70.0,80000,2012-06-29-21-41,...,0,0,0,0,0,0,0,0,0,1


In [56]:
ok_cupid_df.drop(columns='ethnicity', inplace=True)

In [57]:
ok_cupid_df['body_type'].value_counts()

average              14652
fit                  12711
athletic             11819
unknown_body_type     5293
thin                  4711
curvy                 3924
a little extra        2629
skinny                1777
full figured          1009
overweight             444
jacked                 421
used up                355
rather not say         198
Name: body_type, dtype: int64

In [58]:
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['a little extra', 'full figured'],'curvy')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['thin'],'skinny')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['fit'],'athletic')
ok_cupid_df['body_type'] = ok_cupid_df['body_type'].replace(['unknown_body_type'],'rather not say')

In [59]:
ok_cupid_df['body_type'].value_counts()

athletic          24530
average           14652
curvy              7562
skinny             6488
rather not say     5491
overweight          444
jacked              421
used up             355
Name: body_type, dtype: int64

In [60]:
body_type_mapping = {'overweight':0, 'curvy':1, 'average':2, 'used up':3, 'rather not say': 4, 'skinny': 5, 'athletic': 6, 'jacked': 7}
body_type_mapped_data = ok_cupid_df['body_type'].map(body_type_mapping)

In [61]:
#ok_cupid_df = ok_cupid_df.drop(columns=['body_type'])
ok_cupid_df.drop(columns='body_type', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, body_type_mapped_data], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,diet,drinks,drugs,education,height,income,last_online,location,offspring,...,native_american,asian,black,middle_eastern,hispanic_latin,indian,pacific_islander,other_ethnicity,white,body_type
0,22,strictly anything,socially,never,working on college/university,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",...,0,1,0,0,0,0,0,0,1,1
1,35,mostly other,often,sometimes,working on space camp,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",...,0,0,0,0,0,0,0,0,1,2


In [62]:
ok_cupid_df['drinks'].value_counts()

socially          41780
rarely             5957
often              5164
not at all         3267
unknown_drinks     2982
very often          471
desperately         322
Name: drinks, dtype: int64

In [63]:
drinks_mapping = {'desperately':6, 'very often':5, 'often':4, 'unknown_drinks':3, 'socially': 2, 'rarely': 1, 'not at all': 0}
drinks_mapped_data = ok_cupid_df['drinks'].map(drinks_mapping)

In [64]:
ok_cupid_df = ok_cupid_df.drop(columns=['drinks'])
ok_cupid_df = pd.concat([ok_cupid_df, drinks_mapped_data], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,diet,drugs,education,height,income,last_online,location,offspring,pets,...,asian,black,middle_eastern,hispanic_latin,indian,pacific_islander,other_ethnicity,white,body_type,drinks
0,22,strictly anything,never,working on college/university,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,...,1,0,0,0,0,0,0,1,1,2
1,35,mostly other,sometimes,working on space camp,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,...,0,0,0,0,0,0,0,1,2,4


In [65]:
ok_cupid_df['diet'].value_counts()

unknowndiet            24390
mostly anything        16585
anything                6183
strictly anything       5114
mostly vegetarian       3444
mostly other            1007
strictly vegetarian      875
vegetarian               667
strictly other           452
mostly vegan             338
other                    331
strictly vegan           228
vegan                    137
mostly kosher             86
mostly halal              48
strictly halal            18
strictly kosher           18
kosher                    11
halal                     11
Name: diet, dtype: int64

In [66]:
#ok_cupid_df["diet"] = ok_cupid_df["diet"].replace({'anything': 'strictly anything', 'vegetarian': 'strictly vegetarian', 'other': 'strictly other',
 #                            'vegan': 'strictly vegan', 'kosher': 'strictly kosher', 'halal': 'strictly halal'})
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()

unknowndiet           24390
mostlyanything        16585
anything               6183
strictlyanything       5114
mostlyvegetarian       3444
mostlyother            1007
strictlyvegetarian      875
vegetarian              667
strictlyother           452
mostlyvegan             338
other                   331
strictlyvegan           228
vegan                   137
mostlykosher             86
mostlyhalal              48
strictlykosher           18
strictlyhalal            18
halal                    11
kosher                   11
Name: diet, dtype: int64

In [67]:
ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace('other', 'otherdiet')
#ok_cupid_df['diet'] = ok_cupid_df['diet'].str.replace(' ', '')
ok_cupid_df['diet'].value_counts()

unknowndiet           24390
mostlyanything        16585
anything               6183
strictlyanything       5114
mostlyvegetarian       3444
mostlyotherdiet        1007
strictlyvegetarian      875
vegetarian              667
strictlyotherdiet       452
mostlyvegan             338
otherdiet               331
strictlyvegan           228
vegan                   137
mostlykosher             86
mostlyhalal              48
strictlykosher           18
strictlyhalal            18
kosher                   11
halal                    11
Name: diet, dtype: int64

In [68]:
diet = CountVectorizer()

diet_transformed = diet.fit_transform(ok_cupid_df["diet"])
diet_transformed

<59943x19 sparse matrix of type '<class 'numpy.int64'>'
	with 59943 stored elements in Compressed Sparse Row format>

In [69]:
diet_df = pd.DataFrame(columns=diet.get_feature_names(), data=diet_transformed.toarray())

# Drop one column to prevent redundant information
diet_df = diet_df.drop(columns=['unknowndiet'])
diet_df.head(2)

Unnamed: 0,anything,halal,kosher,mostlyanything,mostlyhalal,mostlykosher,mostlyotherdiet,mostlyvegan,mostlyvegetarian,otherdiet,strictlyanything,strictlyhalal,strictlykosher,strictlyotherdiet,strictlyvegan,strictlyvegetarian,vegan,vegetarian
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [70]:
diff_diets = diet_df.columns.tolist()

for d in diff_diets:
    if d.startswith('mostly'):
        diff_diets.remove(d)

for d in diff_diets:
    if d.startswith('strictly'):
        diff_diets.remove(d)

In [71]:
diets = diff_diets

diets_mostly = []
for diet in diets:
    diets_mostly.append('mostly'+diet)
    
diets_strictly = []
for diet in diets:
    diets_strictly.append('strictly'+diet)

In [72]:
for diet in diets:
    if diet in diet_df.columns:
        diet_df.loc[diet_df[diet] == 1, diet] = 2
        
for diet, diet_mostly in zip(diets, diets_mostly):
    if diet_mostly in diet_df.columns:
        diet_df.loc[diet_df[diet_mostly] == 1, diet] = 1
        diet_df = diet_df.drop(columns=diet_mostly)

for diet, diet_strictly in zip(diets, diets_strictly):
    if diet_strictly in diet_df.columns:
        diet_df.loc[diet_df[diet_strictly] == 1, diet] = 3
        diet_df = diet_df.drop(columns=diet_strictly)

In [73]:
diet_df.head(2)

Unnamed: 0,anything,halal,kosher,otherdiet,vegan,vegetarian
0,3,0,0,0,0,0
1,0,0,0,0,0,0


In [74]:
diet_df.rename(columns = {'diet':'otherdiet'}, inplace = True)

In [75]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, diet_df], axis=1)
ok_cupid_df.drop(columns='diet', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,drugs,education,height,income,last_online,location,offspring,pets,religion,...,other_ethnicity,white,body_type,drinks,anything,halal,kosher,otherdiet,vegan,vegetarian
0,22,never,working on college/university,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,...,0,1,1,2,3,0,0,0,0,0
1,35,sometimes,working on space camp,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,...,0,1,2,4,0,0,0,0,0,0


In [76]:
ok_cupid_df['drugs'].value_counts()

never            37723
unknown_drugs    14078
sometimes         7732
often              410
Name: drugs, dtype: int64

In [77]:
drugs_mapping = {'often':3, 'sometimes':2, 'unknown_drugs':1, 'never':0}
drugs_mapped_data = ok_cupid_df['drugs'].map(drugs_mapping)
drugs_mapped_data = drugs_mapped_data.astype(int)

In [78]:
ok_cupid_df = ok_cupid_df.drop(columns=['drugs'])
ok_cupid_df = pd.concat([ok_cupid_df, drugs_mapped_data], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,education,height,income,last_online,location,offspring,pets,religion,sign,...,white,body_type,drinks,anything,halal,kosher,otherdiet,vegan,vegetarian,drugs
0,22,working on college/university,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,...,1,1,2,3,0,0,0,0,0,0
1,35,working on space camp,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,...,1,2,4,0,0,0,0,0,0,2


In [79]:
ok_cupid_df['education'].value_counts()

graduated from college/university    23959
graduated from masters program        8961
unknown_education                     6625
working on college/university         5712
working on masters program            1683
graduated from two-year college       1531
graduated from high school            1428
graduated from ph.d program           1272
graduated from law school             1122
working on two-year college           1074
dropped out of college/university      995
working on ph.d program                983
college/university                     801
graduated from space camp              657
dropped out of space camp              523
graduated from med school              446
working on space camp                  445
working on law school                  269
two-year college                       222
working on med school                  212
dropped out of two-year college        191
dropped out of masters program         140
masters program                        136
dropped out

In [80]:
education_mapping = {'dropped out of space camp':0, 'working on space camp':1, 'graduated from space camp':2,
                     'space camp': 2,
                     'dropped out of high school':3, 'working on high school': 4, 'graduated from high school': 5,
                     'high school': 5,
                     'dropped out of two-year college': 6, 'working on two-year college': 7,
                     'two-year college': 8,
                     'graduated from two-year college': 8, 'dropped out of college/university': 9,
                     'college/university': 11,
                     'working on college/university': 10, 'graduated from college/university': 11, 'unknown_education': 12,
                     'dropped out of masters program': 13, 'working on masters program': 14,
                     'masters program': 15,
                     'graduated from masters program': 15, 'dropped out of law school': 16,
                     'working on law school': 17, 'graduated from law school': 18,
                     'law school': 18,
                     'dropped out of ph.d program': 16, 'working on ph.d program': 17,
                     'ph.d program': 18,
                     'graduated from ph.d program': 18, 'dropped out of med school': 16, 
                     'working on med school': 17, 'graduated from med school': 18, 'med school': 18}
education_mapped_data = ok_cupid_df['education'].map(education_mapping)
education_mapped_data = education_mapped_data.astype(int)

In [81]:
ok_cupid_df = ok_cupid_df.drop(columns=['education'])
ok_cupid_df = pd.concat([ok_cupid_df, education_mapped_data], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,location,offspring,pets,religion,sign,smokes,...,body_type,drinks,anything,halal,kosher,otherdiet,vegan,vegetarian,drugs,education
0,22,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,...,1,2,3,0,0,0,0,0,0,10
1,35,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,...,2,4,0,0,0,0,0,0,2,1


In [82]:
ok_cupid_df['smokes'].value_counts()

no                43895
unknown_smokes     5510
sometimes          3787
when drinking      3040
yes                2231
trying to quit     1480
Name: smokes, dtype: int64

In [83]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder # This is used for multiple columns

# Instantiate the label encoder
le = LabelEncoder()

# Fit and transform the order priority column
le.fit_transform(ok_cupid_df['smokes'])

array([1, 0, 0, ..., 0, 2, 1])

In [84]:
le.classes_

array(['no', 'sometimes', 'trying to quit', 'unknown_smokes',
       'when drinking', 'yes'], dtype=object)

In [85]:
smokes_mapping = {'yes':4, 'sometimes':3, 'when drinking':3, 'unknown_smokes':2, 'trying to quit':1, 'no': 0}
smokes_mapped_data = ok_cupid_df['smokes'].map(smokes_mapping)
smokes_mapped_data = smokes_mapped_data.astype(int)

In [86]:
ok_cupid_df = ok_cupid_df.drop(columns=['smokes'])
ok_cupid_df = pd.concat([ok_cupid_df, smokes_mapped_data], axis=1)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,location,offspring,pets,religion,sign,speaks,...,drinks,anything,halal,kosher,otherdiet,vegan,vegetarian,drugs,education,smokes
0,22,75.0,-1,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,english,...,2,3,0,0,0,0,0,0,10,3
1,35,70.0,80000,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,"english (fluently), spanish (poorly), french (...",...,4,0,0,0,0,0,0,2,1,0


In [87]:
ok_cupid_df['location'].str.split(',', expand=True).head(2)

Unnamed: 0,0,1,2
0,south san francisco,california,
1,oakland,california,


In [88]:
ok_cupid_df['location'].str.split(',', expand=True)[1].unique()

array([' california', ' colorado', ' new york', ' oregon', ' arizona',
       ' hawaii', ' montana', ' wisconsin', ' virginia', ' spain',
       ' nevada', ' illinois', ' vietnam', ' ireland', ' louisiana',
       ' michigan', ' texas', ' united kingdom', ' massachusetts',
       ' north carolina', ' idaho', ' mississippi', ' new jersey',
       ' florida', ' minnesota', ' georgia', ' utah', ' washington',
       ' west virginia', ' connecticut', ' tennessee', ' rhode island',
       ' district of columbia', ' british columbia', ' missouri',
       ' germany', ' pennsylvania', ' netherlands', ' switzerland',
       ' ohio'], dtype=object)

In [89]:
ok_cupid_df['city'] = ok_cupid_df['location'].str.split(',', expand=True)[0]

In [90]:
ok_cupid_df['state'] = ok_cupid_df['location'].str.split(',', expand=True)[1]
ok_cupid_df['state'] = ok_cupid_df['state'].str.lstrip(' ')
ok_cupid_df['state'].unique()

array(['california', 'colorado', 'new york', 'oregon', 'arizona',
       'hawaii', 'montana', 'wisconsin', 'virginia', 'spain', 'nevada',
       'illinois', 'vietnam', 'ireland', 'louisiana', 'michigan', 'texas',
       'united kingdom', 'massachusetts', 'north carolina', 'idaho',
       'mississippi', 'new jersey', 'florida', 'minnesota', 'georgia',
       'utah', 'washington', 'west virginia', 'connecticut', 'tennessee',
       'rhode island', 'district of columbia', 'british columbia',
       'missouri', 'germany', 'pennsylvania', 'netherlands',
       'switzerland', 'ohio'], dtype=object)

In [91]:
ok_cupid_df['country'] = 'united states'

In [92]:
ok_cupid_df.loc[ok_cupid_df['state'] == 'spain', 'state'] = 'madrid'
ok_cupid_df.loc[ok_cupid_df['city'] == 'madrid', 'country'] = 'spain'

ok_cupid_df.loc[ok_cupid_df['state'] == 'vietnam', 'state'] = 'khanh hoa'
ok_cupid_df.loc[ok_cupid_df['city'] == 'nha trang', 'country'] = 'vietnam'

ok_cupid_df.loc[ok_cupid_df['state'] == 'ireland', 'state'] = 'munster'
ok_cupid_df.loc[ok_cupid_df['city'] == 'cork', 'country'] = 'ireland'

ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'state'] = 'scotland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'edinburgh', 'country'] = 'united kingdom'

ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'state'] = 'england'
ok_cupid_df.loc[ok_cupid_df['city'] == 'london', 'country'] = 'united kingdom'

ok_cupid_df.loc[ok_cupid_df['state'] == 'germany', 'state'] = 'hessen'
ok_cupid_df.loc[ok_cupid_df['city'] == 'kassel', 'country'] = 'germany'

ok_cupid_df.loc[ok_cupid_df['state'] == 'netherlands', 'state'] = 'north holland'
ok_cupid_df.loc[ok_cupid_df['city'] == 'amsterdam', 'country'] = 'netherlands'

ok_cupid_df.loc[ok_cupid_df['state'] == 'switzerland', 'state'] = 'graubunden'
ok_cupid_df.loc[ok_cupid_df['city'] == 'bonaduz', 'country'] = 'switzerland'

ok_cupid_df.loc[ok_cupid_df['city'] == 'vancouver', 'country'] = 'canada'

ok_cupid_df = ok_cupid_df.drop(columns=['location'])

In [93]:
ok_cupid_df[ok_cupid_df['country'] == 'canada']

Unnamed: 0,age,height,income,last_online,offspring,pets,religion,sign,speaks,essay0,...,kosher,otherdiet,vegan,vegetarian,drugs,education,smokes,city,state,country
42437,32,63.0,60000,2012-06-28-18-38,doesn't have kids,unknown_pets,other,aquarius,"english (fluently), chinese (poorly), french (...",897,...,0,0,0,0,0,1,0,vancouver,british columbia,canada


In [94]:
ok_cupid_df['state'].unique()

array(['california', 'colorado', 'new york', 'oregon', 'arizona',
       'hawaii', 'montana', 'wisconsin', 'virginia', 'madrid', 'nevada',
       'illinois', 'khanh hoa', 'munster', 'louisiana', 'michigan',
       'texas', 'scotland', 'england', 'massachusetts', 'north carolina',
       'idaho', 'mississippi', 'new jersey', 'florida', 'minnesota',
       'georgia', 'utah', 'washington', 'west virginia', 'connecticut',
       'tennessee', 'rhode island', 'district of columbia',
       'british columbia', 'missouri', 'hessen', 'pennsylvania',
       'north holland', 'graubunden', 'ohio'], dtype=object)

In [95]:
ok_cupid_df['country'].unique()

array(['united states', 'spain', 'vietnam', 'ireland', 'united kingdom',
       'canada', 'germany', 'netherlands', 'switzerland'], dtype=object)

In [96]:
# Instantiate the OneHotEncoder
city_ohe = OneHotEncoder()
state_ohe = OneHotEncoder()
country_ohe = OneHotEncoder()

# Fit the OneHotEncoder to the subcategory column and transform
# Expects a 2D array
city = pd.DataFrame(ok_cupid_df['city'])
city_encoded = city_ohe.fit_transform(city)
display(city_encoded)

state = pd.DataFrame(ok_cupid_df['state'])
state_encoded = state_ohe.fit_transform(state)
display(state_encoded)

country = pd.DataFrame(ok_cupid_df['country'])
country_encoded = country_ohe.fit_transform(country)
display(country_encoded)

<59943x197 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

<59943x41 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

<59943x9 sparse matrix of type '<class 'numpy.float64'>'
	with 59943 stored elements in Compressed Sparse Row format>

In [97]:
# Put into a dataframe to get column names
encoded_df_city = pd.DataFrame(city_encoded.toarray().astype(int), columns=city_ohe.categories_[0], dtype=int)
encoded_df_city = encoded_df_city.drop(encoded_df_city.columns[0], axis=1)
display(encoded_df_city.head(2))

# Status
encoded_df_state = pd.DataFrame(state_encoded.toarray().astype(int), columns=state_ohe.categories_[0], dtype=int)
encoded_df_state = encoded_df_state.drop(encoded_df_state.columns[0], axis=1)
display(encoded_df_state.head(2))

# Jobs
encoded_df_country = pd.DataFrame(country_encoded.toarray().astype(int), columns=country_ohe.categories_[0], dtype=int)
encoded_df_country = encoded_df_country.drop(encoded_df_country.columns[0], axis=1)
display(encoded_df_country.head(2))

Unnamed: 0,albany,amsterdam,arcadia,asheville,ashland,astoria,atherton,atlanta,austin,bayshore,...,vallejo,vancouver,walnut creek,washington,waterford,west oakland,westlake,woodacre,woodbridge,woodside
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,british columbia,california,colorado,connecticut,district of columbia,england,florida,georgia,graubunden,hawaii,...,pennsylvania,rhode island,scotland,tennessee,texas,utah,virginia,washington,west virginia,wisconsin
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,germany,ireland,netherlands,spain,switzerland,united kingdom,united states,vietnam
0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,1,0


In [98]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_city], axis=1)
ok_cupid_df.drop(columns='city', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_state], axis=1)
ok_cupid_df.drop(columns='state', inplace=True)

ok_cupid_df = pd.concat([ok_cupid_df, encoded_df_country], axis=1)
ok_cupid_df.drop(columns='country', inplace=True)

ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,offspring,pets,religion,sign,speaks,essay0,...,west virginia,wisconsin,germany,ireland,netherlands,spain,switzerland,united kingdom,united states,vietnam
0,22,75.0,-1,2012-06-28-20-30,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,english,1251,...,0,0,0,0,0,0,0,0,1,0
1,35,70.0,80000,2012-06-29-21-41,"doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,"english (fluently), spanish (poorly), french (...",661,...,0,0,0,0,0,0,0,0,1,0


In [99]:
ok_cupid_df['offspring'].value_counts()

unknown_offspring                          35559
doesn't have kids                           7559
doesn't have kids, but might want them      3875
doesn't have kids, but wants them           3565
doesn't want kids                           2927
has kids                                    1883
has a kid                                   1881
doesn't have kids, and doesn't want any     1132
has kids, but doesn't want more              442
has a kid, but doesn't want more             275
has a kid, and might want more               231
wants kids                                   225
might want kids                              182
has kids, and might want more                115
has a kid, and wants more                     71
has kids, and wants more                      21
Name: offspring, dtype: int64

In [100]:
ok_cupid_df['offspring'] = ok_cupid_df['offspring'].replace({"doesn't have kids": "doesnt_have_kids",
                                                             "doesn't have kids, but might want them": "doesnt_have_kids might_want_kids",
                                                             "doesn't have kids, but wants them": "doesnt_have_kids wants_kids",
                                                             "doesn't want kids": "doesnt_want_kids",
                                                             "has kids": "has_kids",
                                                             "has a kid": "has_a_kid",
                                                             "doesn't have kids, and doesn't want any": "doesnt_have_kids doesnt_want_kids",
                                                             "has kids, but doesn't want more": "has_kids doesnt_want_kids",
                                                             "has a kid, but doesn't want more": "has_a_kid doesnt_want_kids",
                                                             "has a kid, and might want more": "has_a_kid might_want_kids",
                                                             "wants kids": "wants_kids",
                                                             "might want kids": "might_want_kids",
                                                             "has kids, and might want more": "has_kids might_want_kids",
                                                             "has a kid, and wants more": "has_a_kid wants_kids",
                                                             "has kids, and wants more": "has_kids wants_kids"})

ok_cupid_df['offspring'].value_counts()

unknown_offspring                    35559
doesnt_have_kids                      7559
doesnt_have_kids might_want_kids      3875
doesnt_have_kids wants_kids           3565
doesnt_want_kids                      2927
has_kids                              1883
has_a_kid                             1881
doesnt_have_kids doesnt_want_kids     1132
has_kids doesnt_want_kids              442
has_a_kid doesnt_want_kids             275
has_a_kid might_want_kids              231
wants_kids                             225
might_want_kids                        182
has_kids might_want_kids               115
has_a_kid wants_kids                    71
has_kids wants_kids                     21
Name: offspring, dtype: int64

In [101]:
# 1. Instantiate 
offspring = CountVectorizer()

# 2. Fit 
offspring.fit(ok_cupid_df["offspring"])

# 3. Transform
offspring_transformed = offspring.transform(ok_cupid_df["offspring"])
offspring_transformed

<59943x7 sparse matrix of type '<class 'numpy.int64'>'
	with 69670 stored elements in Compressed Sparse Row format>

In [102]:
offspring_df = pd.DataFrame(columns=offspring.get_feature_names(), data=offspring_transformed.toarray())

# Drop one column to prevent redundant information
offspring_df = offspring_df.drop(columns='unknown_offspring')
offspring_df.head(2)

Unnamed: 0,doesnt_have_kids,doesnt_want_kids,has_a_kid,has_kids,might_want_kids,wants_kids
0,1,0,0,0,1,0
1,1,0,0,0,1,0


In [103]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, offspring_df], axis=1)
ok_cupid_df.drop(columns='offspring', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,pets,religion,sign,speaks,essay0,essay1,...,switzerland,united kingdom,united states,vietnam,doesnt_have_kids,doesnt_want_kids,has_a_kid,has_kids,might_want_kids,wants_kids
0,22,75.0,-1,2012-06-28-20-30,likes dogs and likes cats,agnosticism and very serious about it,gemini,english,1251,253,...,0,0,1,0,1,0,0,0,1,0
1,35,70.0,80000,2012-06-29-21-41,likes dogs and likes cats,agnosticism but not too serious about it,cancer,"english (fluently), spanish (poorly), french (...",661,52,...,0,0,1,0,1,0,0,0,1,0


In [104]:
ok_cupid_df['pets'].value_counts()

unknown_pets                       19919
likes dogs and likes cats          14814
likes dogs                          7224
likes dogs and has cats             4313
has dogs                            4134
has dogs and likes cats             2333
likes dogs and dislikes cats        2029
has dogs and has cats               1474
has cats                            1406
likes cats                          1062
has dogs and dislikes cats           552
dislikes dogs and likes cats         240
dislikes dogs and dislikes cats      196
dislikes cats                        122
dislikes dogs and has cats            81
dislikes dogs                         44
Name: pets, dtype: int64

In [105]:
ok_cupid_df['pets'] = ok_cupid_df['pets'].replace({'likes dogs and likes cats': 'likes_dogs likes_cats',
                                                   'likes dogs': 'likes_dogs',
                                                   'likes dogs and has cats': 'likes_dogs has_cats',
                                                   'has dogs': 'has_dogs',
                                                   'has dogs and likes cats': 'has_dogs likes_cats',
                                                   'likes dogs and dislikes cats': 'likes_dogs dislikes_cats',
                                                   'has dogs and has cats': 'has_dogs has_cats',
                                                   'has cats': 'has_cats',
                                                   'likes cats': 'likes_cats',
                                                   'has dogs and dislikes cats': 'has_dogs dislikes_cats',
                                                   'dislikes dogs and likes cats': 'dislikes_dogs likes_cats',
                                                   'dislikes dogs and dislikes cats': 'dislikes_dogs dislikes_cats',
                                                   'dislikes cats': 'dislikes_cats',
                                                   'dislikes dogs and has cats': 'dislikes_dogs has_cats',
                                                   'dislikes dogs': 'dislikes_dogs'})

ok_cupid_df['pets'].value_counts()

unknown_pets                   19919
likes_dogs likes_cats          14814
likes_dogs                      7224
likes_dogs has_cats             4313
has_dogs                        4134
has_dogs likes_cats             2333
likes_dogs dislikes_cats        2029
has_dogs has_cats               1474
has_cats                        1406
likes_cats                      1062
has_dogs dislikes_cats           552
dislikes_dogs likes_cats         240
dislikes_dogs dislikes_cats      196
dislikes_cats                    122
dislikes_dogs has_cats            81
dislikes_dogs                     44
Name: pets, dtype: int64

In [106]:
# 1. Instantiate 
pets = CountVectorizer()

# 2. Fit 
pets.fit(ok_cupid_df["pets"])

# 3. Transform
pets_transformed = pets.transform(ok_cupid_df["pets"])
pets_transformed

<59943x7 sparse matrix of type '<class 'numpy.int64'>'
	with 85975 stored elements in Compressed Sparse Row format>

In [107]:
pets_df = pd.DataFrame(columns=pets.get_feature_names(), data=pets_transformed.toarray())

# Drop one column to prevent redundant information
pets_df = pets_df.drop(columns='unknown_pets')

pets_df.head(2)

Unnamed: 0,dislikes_cats,dislikes_dogs,has_cats,has_dogs,likes_cats,likes_dogs
0,0,0,0,0,1,1
1,0,0,0,0,1,1


In [108]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, pets_df], axis=1)
ok_cupid_df.drop(columns='pets', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,religion,sign,speaks,essay0,essay1,essay2,...,has_a_kid,has_kids,might_want_kids,wants_kids,dislikes_cats,dislikes_dogs,has_cats,has_dogs,likes_cats,likes_dogs
0,22,75.0,-1,2012-06-28-20-30,agnosticism and very serious about it,gemini,english,1251,253,114,...,0,0,1,0,0,0,0,0,1,1
1,35,70.0,80000,2012-06-29-21-41,agnosticism but not too serious about it,cancer,"english (fluently), spanish (poorly), french (...",661,52,96,...,0,0,1,0,0,0,0,0,1,1


In [109]:
ok_cupid_df['religion'].value_counts()

unknown_religion                              20223
agnosticism                                    2724
other                                          2691
agnosticism but not too serious about it       2636
agnosticism and laughing about it              2496
catholicism but not too serious about it       2318
atheism                                        2175
other and laughing about it                    2119
atheism and laughing about it                  2074
christianity                                   1957
christianity but not too serious about it      1952
other but not too serious about it             1554
judaism but not too serious about it           1517
atheism but not too serious about it           1318
catholicism                                    1064
christianity and somewhat serious about it      927
atheism and somewhat serious about it           848
other and somewhat serious about it             846
catholicism and laughing about it               726
judaism and 

In [110]:
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace(' ', '')
ok_cupid_df['religion'] = ok_cupid_df['religion'].str.replace('other', 'otherreligion')
ok_cupid_df['religion'].value_counts()

unknown_religion                          20223
agnosticism                                2724
otherreligion                              2691
agnosticismbutnottooseriousaboutit         2636
agnosticismandlaughingaboutit              2496
catholicismbutnottooseriousaboutit         2318
atheism                                    2175
otherreligionandlaughingaboutit            2119
atheismandlaughingaboutit                  2074
christianity                               1957
christianitybutnottooseriousaboutit        1952
otherreligionbutnottooseriousaboutit       1554
judaismbutnottooseriousaboutit             1517
atheismbutnottooseriousaboutit             1318
catholicism                                1064
christianityandsomewhatseriousaboutit       927
atheismandsomewhatseriousaboutit            848
otherreligionandsomewhatseriousaboutit      846
catholicismandlaughingaboutit               726
judaismandlaughingaboutit                   681
buddhismbutnottooseriousaboutit         

In [111]:
religion = CountVectorizer()

religion_transformed = religion.fit_transform(ok_cupid_df["religion"])
religion_transformed

<59943x46 sparse matrix of type '<class 'numpy.int64'>'
	with 59943 stored elements in Compressed Sparse Row format>

In [112]:
religion_df = pd.DataFrame(columns=religion.get_feature_names(), data=religion_transformed.toarray())

# Drop one column to prevent redundant information
religion_df = religion_df.drop(columns=['unknown_religion'])
religion_df.head(2)

Unnamed: 0,agnosticism,agnosticismandlaughingaboutit,agnosticismandsomewhatseriousaboutit,agnosticismandveryseriousaboutit,agnosticismbutnottooseriousaboutit,atheism,atheismandlaughingaboutit,atheismandsomewhatseriousaboutit,atheismandveryseriousaboutit,atheismbutnottooseriousaboutit,...,judaism,judaismandlaughingaboutit,judaismandsomewhatseriousaboutit,judaismandveryseriousaboutit,judaismbutnottooseriousaboutit,otherreligion,otherreligionandlaughingaboutit,otherreligionandsomewhatseriousaboutit,otherreligionandveryseriousaboutit,otherreligionbutnottooseriousaboutit
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
rel = religion_df.columns.tolist()

for r in rel:
    if r.endswith('andveryseriousaboutit'):
        rel.remove(r)

for r in rel:
    if r.endswith('andlaughingaboutit'):
        rel.remove(r)    
        
for r in rel:
    if r.endswith('andsomewhatseriousaboutit'):
        rel.remove(r)
        
for r in rel:
    if r.endswith('butnottooseriousaboutit'):
        rel.remove(r)

In [114]:
religions = rel

religions_serious = []
for religion in religions:
    religions_serious.append(religion+'andveryseriousaboutit')
    
religions_laughing = []
for religion in religions:
    religions_laughing.append(religion+'andlaughingaboutit')
    
religions_somewhat = []
for religion in religions:
    religions_somewhat.append(religion+'andsomewhatseriousaboutit')
    
religions_not_serious = []
for religion in religions:
    religions_not_serious.append(religion+'butnottooseriousaboutit')

In [115]:
for religion in religions:
    if religion in religion_df.columns:
        religion_df.loc[religion_df[religion] == 1, religion] = 4
        
for religion, religion_serious in zip(religions, religions_serious):
    if religion_serious in religion_df.columns:
        religion_df.loc[religion_df[religion_serious] == 1, religion] = 4
        religion_df = religion_df.drop(columns=religion_serious)

for religion, religion_laughing in zip(religions, religions_laughing):
    if religion_laughing in religion_df.columns:
        religion_df.loc[religion_df[religion_laughing] == 1, religion] = 1
        religion_df = religion_df.drop(columns=religion_laughing)
        
for religion, religion_somewhat in zip(religions, religions_somewhat):
    if religion_somewhat in religion_df.columns:
        religion_df.loc[religion_df[religion_somewhat] == 1, religion] = 3
        religion_df = religion_df.drop(columns=religion_somewhat)
        
for religion, religion_not_serious in zip(religions, religions_not_serious):
    if religion_not_serious in religion_df.columns:
        religion_df.loc[religion_df[religion_not_serious] == 1, religion] = 2
        religion_df = religion_df.drop(columns=religion_not_serious)

In [116]:
religion_df.head(2)

Unnamed: 0,agnosticism,atheism,buddhism,catholicism,christianity,hinduism,islam,judaism,otherreligion
0,4,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0


In [117]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, religion_df], axis=1)
ok_cupid_df.drop(columns='religion', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,sign,speaks,essay0,essay1,essay2,essay3,...,likes_dogs,agnosticism,atheism,buddhism,catholicism,christianity,hinduism,islam,judaism,otherreligion
0,22,75.0,-1,2012-06-28-20-30,gemini,english,1251,253,114,124,...,1,4,0,0,0,0,0,0,0,0
1,35,70.0,80000,2012-06-29-21-41,cancer,"english (fluently), spanish (poorly), french (...",661,52,96,0,...,1,2,0,0,0,0,0,0,0,0


In [118]:
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace('&', '').str.replace('rsquo;','')
ok_cupid_df['sign'].value_counts()

unknown_sign                              11054
gemini and its fun to think about          1782
scorpio and its fun to think about         1772
leo and its fun to think about             1692
libra and its fun to think about           1649
taurus and its fun to think about          1640
cancer and its fun to think about          1597
pisces and its fun to think about          1592
sagittarius and its fun to think about     1583
virgo and its fun to think about           1574
aries and its fun to think about           1573
aquarius and its fun to think about        1503
virgo but it doesnt matter                 1497
leo but it doesnt matter                   1457
cancer but it doesnt matter                1454
gemini but it doesnt matter                1453
taurus but it doesnt matter                1450
libra but it doesnt matter                 1408
aquarius but it doesnt matter              1408
capricorn and its fun to think about       1376
sagittarius but it doesnt matter        

In [119]:
ok_cupid_df['sign'] = ok_cupid_df['sign'].str.replace(' ', '')
ok_cupid_df['sign'].value_counts()

unknown_sign                        11054
geminianditsfuntothinkabout          1782
scorpioanditsfuntothinkabout         1772
leoanditsfuntothinkabout             1692
libraanditsfuntothinkabout           1649
taurusanditsfuntothinkabout          1640
canceranditsfuntothinkabout          1597
piscesanditsfuntothinkabout          1592
sagittariusanditsfuntothinkabout     1583
virgoanditsfuntothinkabout           1574
ariesanditsfuntothinkabout           1573
aquariusanditsfuntothinkabout        1503
virgobutitdoesntmatter               1497
leobutitdoesntmatter                 1457
cancerbutitdoesntmatter              1454
geminibutitdoesntmatter              1453
taurusbutitdoesntmatter              1450
aquariusbutitdoesntmatter            1408
librabutitdoesntmatter               1408
capricornanditsfuntothinkabout       1376
sagittariusbutitdoesntmatter         1375
ariesbutitdoesntmatter               1373
capricornbutitdoesntmatter           1319
piscesbutitdoesntmatter           

In [120]:
sign = CountVectorizer()

sign_transformed = sign.fit_transform(ok_cupid_df["sign"])
sign_transformed

<59943x49 sparse matrix of type '<class 'numpy.int64'>'
	with 59943 stored elements in Compressed Sparse Row format>

In [121]:
sign_df = pd.DataFrame(columns=sign.get_feature_names(), data=sign_transformed.toarray())

# Drop one column to prevent redundant information
sign_df = sign_df.drop(columns=['unknown_sign'])
sign_df.head(2)

Unnamed: 0,aquarius,aquariusanditmattersalot,aquariusanditsfuntothinkabout,aquariusbutitdoesntmatter,aries,ariesanditmattersalot,ariesanditsfuntothinkabout,ariesbutitdoesntmatter,cancer,canceranditmattersalot,...,scorpioanditsfuntothinkabout,scorpiobutitdoesntmatter,taurus,taurusanditmattersalot,taurusanditsfuntothinkabout,taurusbutitdoesntmatter,virgo,virgoanditmattersalot,virgoanditsfuntothinkabout,virgobutitdoesntmatter
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
sig = sign_df.columns.tolist()

for s in sig:
    if s.endswith('butitdoesntmatter'):
        sig.remove(s)

for s in sig:
    if s.endswith('anditmattersalot'):
        sig.remove(s)    
        
for s in sig:
    if s.endswith('anditsfuntothinkabout'):
        sig.remove(s)

In [123]:
signs = sig

signs_doesntmatter = []
for sign in signs:
    signs_doesntmatter.append(sign+'butitdoesntmatter')
    
signs_matters = []
for sign in signs:
    signs_matters.append(sign+'anditmattersalot')
    
signs_fun = []
for sign in signs:
    signs_fun.append(sign+'anditsfuntothinkabout')

In [124]:
for sign in signs:
    if sign in sign_df.columns:
        sign_df.loc[sign_df[sign] == 1, sign] = 3
        
for sign, sign_doesntmatter in zip(signs, signs_doesntmatter):
    if sign_doesntmatter in sign_df.columns:
        sign_df.loc[sign_df[sign_doesntmatter] == 1, sign] = 2
        sign_df = sign_df.drop(columns=sign_doesntmatter)

for sign, sign_matters in zip(signs, signs_matters):
    if sign_matters in sign_df.columns:
        sign_df.loc[sign_df[sign_matters] == 1, sign] = 3
        sign_df = sign_df.drop(columns=sign_matters)
        
for sign, sign_fun in zip(signs, signs_fun):
    if sign_fun in sign_df.columns:
        sign_df.loc[sign_df[sign_fun] == 1, sign] = 1
        sign_df = sign_df.drop(columns=sign_fun)

In [125]:
sign_df.head(2)

Unnamed: 0,aquarius,aries,cancer,capricorn,gemini,leo,libra,pisces,sagittarius,scorpio,taurus,virgo
0,0,0,0,0,3,0,0,0,0,0,0,0
1,0,0,3,0,0,0,0,0,0,0,0,0


In [126]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, sign_df], axis=1)
ok_cupid_df.drop(columns='sign', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,speaks,essay0,essay1,essay2,essay3,essay4,...,cancer,capricorn,gemini,leo,libra,pisces,sagittarius,scorpio,taurus,virgo
0,22,75.0,-1,2012-06-28-20-30,english,1251,253,114,124,381,...,0,0,3,0,0,0,0,0,0,0
1,35,70.0,80000,2012-06-29-21-41,"english (fluently), spanish (poorly), french (...",661,52,96,0,249,...,3,0,0,0,0,0,0,0,0,0


In [127]:
ok_cupid_df['speaks'] = ok_cupid_df['speaks'].str.replace(' ', '').str.replace('(', '_').str.replace(')', '').str.replace(',', ' ').str.replace('_', '')
ok_cupid_df['speaks'].value_counts()

english                                                                 21827
englishfluently                                                          6627
englishfluently spanishpoorly                                            2059
englishfluently spanishokay                                              1917
englishfluently spanishfluently                                          1288
                                                                        ...  
englishfluently latinokay ancientgreekpoorly c++fluently lisppoorly         1
english hindipoorly spanishokay otherokay                                   1
englishfluently germanpoorly koreanpoorly c++okay                           1
englishfluently frenchfluently russianokay spanishokay tagalogpoorly        1
english chinesefluently indonesianfluently                                  1
Name: speaks, Length: 7648, dtype: int64

In [128]:
speaks = CountVectorizer()

speaks_transformed = speaks.fit_transform(ok_cupid_df["speaks"])
speaks_transformed

<59943x302 sparse matrix of type '<class 'numpy.int64'>'
	with 110529 stored elements in Compressed Sparse Row format>

In [129]:
speaks_df = pd.DataFrame(columns=speaks.get_feature_names(), data=speaks_transformed.toarray())

# Drop one column to prevent redundant information
speaks_df = speaks_df.drop(columns=['unknownspeaks', 'poorly', 'fluently', 'okay', 'lisp', 'lispokay', 'lisppoorly', 'lispfluently'])
speaks_df.head(2)

Unnamed: 0,afrikaans,afrikaansfluently,afrikaansokay,afrikaanspoorly,albanian,albanianfluently,albanianokay,albanianpoorly,ancientgreek,ancientgreekfluently,...,vietnameseokay,vietnamesepoorly,welsh,welshfluently,welshokay,welshpoorly,yiddish,yiddishfluently,yiddishokay,yiddishpoorly
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [130]:
speaks_df = speaks_df.rename(columns={'armenianfluently': 'armenian', 'slovenianfluently': 'slovenian', 'sardinianfluently': 'sardinian'})

In [131]:
lang = speaks_df.columns.tolist()

for l in lang:
    if l.endswith('fluently'):
        lang.remove(l)

for l in lang:
    if l.endswith('poorly'):
        lang.remove(l)    
        
for l in lang:
    if l.endswith('okay'):
        lang.remove(l)

In [132]:
languages = lang

languages_fluently = []
for language in languages:
    languages_fluently.append(language+'fluently')
    
languages_okay = []
for language in languages:
    languages_okay.append(language+'okay')
    
languages_poorly = []
for language in languages:
    languages_poorly.append(language+'poorly')

In [133]:
for language in languages:
    if language in speaks_df.columns:
        speaks_df.loc[speaks_df[language] == 1, language] = 3
        
for language, language_fluent in zip(languages, languages_fluently):
    if language_fluent in speaks_df.columns:
        speaks_df.loc[speaks_df[language_fluent] == 1, language] = 3
        speaks_df = speaks_df.drop(columns=language_fluent)

for language, language_okay in zip(languages, languages_okay):
    if language_okay in speaks_df.columns:
        speaks_df.loc[speaks_df[language_okay] == 1, language] = 2
        #speaks_df.loc[speaks_df['slovenian'] == 1, 'slovenian'] = 2
        speaks_df = speaks_df.drop(columns=language_okay)
        
for language, language_poorly in zip(languages, languages_poorly):
    if language_poorly in speaks_df.columns:
        speaks_df.loc[speaks_df[language_poorly] == 1, language] = 1
        speaks_df = speaks_df.drop(columns=language_poorly)

In [134]:
speaks_df.head(2)

Unnamed: 0,afrikaans,albanian,ancientgreek,arabic,armenian,basque,belarusan,bengali,breton,bulgarian,...,tagalog,tamil,thai,tibetan,turkish,ukrainian,urdu,vietnamese,welsh,yiddish
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [135]:
# Join back to ok_cupid_df
ok_cupid_df = pd.concat([ok_cupid_df, speaks_df], axis=1)
ok_cupid_df.drop(columns='speaks', inplace=True)
ok_cupid_df.head(2)

Unnamed: 0,age,height,income,last_online,essay0,essay1,essay2,essay3,essay4,essay5,...,tagalog,tamil,thai,tibetan,turkish,ukrainian,urdu,vietnamese,welsh,yiddish
0,22,75.0,-1,2012-06-28-20-30,1251,253,114,124,381,33,...,0,0,0,0,0,0,0,0,0,0
1,35,70.0,80000,2012-06-29-21-41,661,52,96,0,249,276,...,0,0,0,0,0,0,0,0,0,0


In [136]:
ok_cupid_df.drop(columns='income', inplace=True)

In [137]:
ok_cupid_df['last_online']

0        2012-06-28-20-30
1        2012-06-29-21-41
2        2012-06-27-09-10
3        2012-06-28-14-22
4        2012-06-27-21-26
               ...       
59938    2012-06-12-21-47
59939    2012-06-29-11-01
59940    2012-06-27-23-37
59941    2012-06-23-13-01
59942    2012-06-29-00-42
Name: last_online, Length: 59943, dtype: object

In [138]:
ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0].astype('int')
ok_cupid_df['last_online_month'] = ok_cupid_df['last_online'].str.split("-", expand=True)[1].astype('int')
last_online_datetime = (ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]).astype('datetime64')
ok_cupid_df['last_online_weekday'] = last_online_datetime.dt.weekday
ok_cupid_df['last_online_weekday'] = np.where(ok_cupid_df['last_online_weekday'] < 5, 1, 0)
ok_cupid_df['last_online_weekday']

0        1
1        1
2        1
3        1
4        1
        ..
59938    1
59939    1
59940    1
59941    0
59942    1
Name: last_online_weekday, Length: 59943, dtype: int64

In [139]:
#ok_cupid_df['last_online_year'] = ok_cupid_df['last_online'].str.split("-", expand=True)[0]
ok_cupid_df['last_online'].str.split("-", expand=True)[0]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[1]+'-'+ok_cupid_df['last_online'].str.split("-", expand=True)[2]

0        2012-06-28
1        2012-06-29
2        2012-06-27
3        2012-06-28
4        2012-06-27
            ...    
59938    2012-06-12
59939    2012-06-29
59940    2012-06-27
59941    2012-06-23
59942    2012-06-29
Length: 59943, dtype: object

In [140]:
ok_cupid_df.drop(columns='last_online', inplace=True)
ok_cupid_df

Unnamed: 0,age,height,essay0,essay1,essay2,essay3,essay4,essay5,essay6,essay7,...,tibetan,turkish,ukrainian,urdu,vietnamese,welsh,yiddish,last_online_year,last_online_month,last_online_weekday
0,22,75.0,1251,253,114,124,381,33,27,78,...,0,0,0,0,0,0,0,2012,6,1
1,35,70.0,661,52,96,0,249,276,0,0,...,0,0,0,0,0,0,0,2012,6,1
2,38,68.0,1392,557,460,258,2274,56,0,59,...,0,0,0,0,0,0,0,2012,6,1
3,23,71.0,41,41,100,33,178,0,26,0,...,0,0,0,0,0,0,0,2012,6,1
4,29,66.0,210,26,89,39,289,0,0,0,...,0,0,0,0,0,0,0,2012,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59938,59,62.0,269,326,112,64,525,42,16,84,...,0,0,0,0,0,0,0,2012,6,1
59939,24,72.0,538,75,74,19,1047,63,438,45,...,0,0,0,0,0,0,0,2012,6,1
59940,42,71.0,963,75,116,71,299,89,40,47,...,0,0,0,0,0,0,0,2012,6,1
59941,27,73.0,218,220,9,277,862,61,41,55,...,0,0,0,0,0,0,0,2012,6,0


In [141]:
#ok_cupid_df.columns.tolist()

In [142]:
ok_cupid_df.isna().sum().sum()

0

In [143]:
ok_cupid_df.to_csv (r'data/okcupid_profiles_clean.csv', index = False, header=True)