In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('profiles.csv')
df.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,essay0,essay1,essay2,essay3,...,location,offspring,orientation,pets,religion,sex,sign,smokes,speaks,status
0,22,a little extra,strictly anything,socially,never,working on college/university,about me:<br />\n<br />\ni would love to think...,currently working as an international agent fo...,making people laugh.<br />\nranting about a go...,"the way i look. i am a six foot half asian, ha...",...,"south san francisco, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism and very serious about it,m,gemini,sometimes,english,single
1,35,average,mostly other,often,sometimes,working on space camp,i am a chef: this is what that means.<br />\n1...,dedicating everyday to being an unbelievable b...,being silly. having ridiculous amonts of fun w...,,...,"oakland, california","doesn&rsquo;t have kids, but might want them",straight,likes dogs and likes cats,agnosticism but not too serious about it,m,cancer,no,"english (fluently), spanish (poorly), french (...",single
2,38,thin,anything,socially,,graduated from masters program,"i'm not ashamed of much, but writing public te...","i make nerdy software for musicians, artists, ...",improvising in different contexts. alternating...,my large jaw and large glasses are the physica...,...,"san francisco, california",,straight,has cats,,m,pisces but it doesn&rsquo;t matter,no,"english, french, c++",available
3,23,thin,vegetarian,socially,,working on college/university,i work in a library and go to school. . .,reading things written by old dead people,playing synthesizers and organizing books acco...,socially awkward but i do my best,...,"berkeley, california",doesn&rsquo;t want kids,straight,likes cats,,m,pisces,no,"english, german (poorly)",single
4,29,athletic,,socially,never,graduated from college/university,hey how's it going? currently vague on the pro...,work work work work + play,creating imagery to look at:<br />\nhttp://bag...,i smile a lot and my inquisitive nature,...,"san francisco, california",,straight,likes dogs and likes cats,,m,aquarius,no,english,single


In [3]:
print(len(df)) #number of profiles in dataset

59946


In [4]:
df.columns

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')

In [5]:
average_age = df['age'].mean()
average_age #average age of a user

32.3402895939679

In [6]:
df['status'].value_counts()

single            55697
seeing someone     2064
available          1865
married             310
unknown              10
Name: status, dtype: int64

In [7]:
df['sex'].value_counts()

m    35829
f    24117
Name: sex, dtype: int64

In [8]:
df['orientation'].value_counts()

straight    51606
gay          5573
bisexual     2767
Name: orientation, dtype: int64

In [9]:
import re #deleting html marks
CLEAN1 = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') 
df['essay0'] = df['essay0'].str.replace(CLEAN1, '', regex=True) 
df['essay0'].head()

0    about me:\n\ni would love to think that i was ...
1    i am a chef: this is what that means.\n1. i am...
2    i'm not ashamed of much, but writing public te...
3            i work in a library and go to school. . .
4    hey how's it going? currently vague on the pro...
Name: essay0, dtype: object

In [10]:
df['essay0'] = df['essay0'].astype(str)

In [11]:
CLEAN2 = re.compile(r'\n[0-9]?') #deleting '\nx' marks
df['essay0'] = df['essay0'].str.replace(CLEAN1, '', regex=True)
df['essay0'].head()

0    about me:\n\ni would love to think that i was ...
1    i am a chef: this is what that means.\n1. i am...
2    i'm not ashamed of much, but writing public te...
3            i work in a library and go to school. . .
4    hey how's it going? currently vague on the pro...
Name: essay0, dtype: object

now i just have to do this with all users description

In [12]:
for n in range(2,10):
    number = str(n)
    name = 'essay' + number
    df[name] = df[name].str.replace(CLEAN1, '', regex=True)
    df[name] = df[name].str.replace(CLEAN2, '', regex=True) 

### 1. Predicting sex based on description of the profile

In [13]:
for n in range(1,10):
    number = str(n)
    name = 'essay' + number
    df[name] = df[name].fillna(' ')

In [14]:
df['sex'].isnull().sum() #checking for null values in sex column

0

In [33]:
sex = df['sex']

In [34]:
df['profile_description'] = df['essay0'] + df['essay1'] + df['essay2'] + df['essay3'] + df['essay4'] + df['essay5'] + df['essay6']  + df['essay7'] + df['essay8'] + df['essay9']
CLEAN3 = re.compile('\n[a-z]?') 
df['profile_description'] = df['profile_description'].str.replace(CLEAN3, '', regex=True) 

In [35]:
profiles_description = df['profile_description'] #x of machine learning model

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(profiles_description)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(profiles_description, sex ,random_state=42)

In [37]:
train_counts = vectorizer.transform(X_train)
test_counts = vectorizer.transform(X_test)

In [38]:
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(train_counts,y_train)
classifier.score(test_counts,y_test)

0.7467805431373857

## status

In [39]:
df['status'].isnull().sum()

0

In [40]:
status = df['status']
X_train, X_test, y_train, y_test = train_test_split(profiles_description, status ,random_state=42)
train_counts = vectorizer.transform(X_train)
test_counts = vectorizer.transform(X_test)

In [41]:
classifier_status = MultinomialNB()
classifier_status.fit(train_counts, y_train)

MultinomialNB()

In [42]:
classifier_status.score(test_counts, y_test)

0.9244678721558685

In [43]:
text = ['I have wife']
text = vectorizer.transform(text)
classifier_status.predict(text)

array(['single'], dtype='<U14')

## sign

In [46]:
df['sign'].isnull().sum()

11056

ValueError: Input contains NaN