In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load train.csv, using its 'Unnamed: 0' column as the row index.
TRAIN_CSV = '../Assets/A/train.csv'
df = pd.read_csv(TRAIN_CSV, index_col='Unnamed: 0')

In [3]:
# Display the column labels of the raw frame (bare expression -> cell output).
df.columns

Index([u'age', u'body_type', u'diet', u'drinks', u'drugs', u'education',
       u'essay0', u'essay1', u'essay2', u'essay3', u'essay4', u'essay5',
       u'essay6', u'essay7', u'essay8', u'essay9', u'ethnicity', u'height',
       u'income', u'job', u'last_online', u'location', u'offspring',
       u'orientation', u'pets', u'religion', u'sex', u'sign', u'smokes',
       u'speaks', u'status'],
      dtype='object')

In [4]:
# Start an empty frame that shares df's row index; the encoded columns
# are added to it one at a time in the cells below.
df2 = pd.DataFrame(index=df.index)

In [5]:
# Age is carried over unchanged.
df2['age'] = df['age']

In [6]:
# Diet


# ignore anything and other
# just look for vegetarian, vegan, kosher, halal

def diet_encoder(diet):
    """Collapse a free-form diet answer into one coarse category.

    The first keyword found in `diet`, checked in the order
    vegan, vegetarian, kosher, halal, is returned. Anything else,
    including a missing value (NaN/None), is reported as 'anything'.

    Parameters
    ----------
    diet : str or missing value
        Raw entry from the `diet` column.

    Returns
    -------
    str
        One of 'vegan', 'vegetarian', 'kosher', 'halal', 'anything'.
    """
    try:
        for keyword in ('vegan', 'vegetarian', 'kosher', 'halal'):
            if diet.find(keyword) >= 0:
                return keyword
    except (AttributeError, TypeError):
        # Non-string input (e.g. a float NaN) has no usable .find().
        # Only those errors are treated as "missing"; anything else
        # (KeyboardInterrupt, genuine bugs) is allowed to propagate.
        pass
    return 'anything'

    
# Encode every diet answer with diet_encoder.
df2['diet'] = df['diet'].apply(diet_encoder)

In [7]:
# drinking

def drinks_encoder(drinks):
    """Bucket a drinking answer into a coarse category.

    'socially' is passed through, 'rarely' / 'not at all' become
    'not much', any other string becomes 'heavily', and a non-string
    value (such as NaN) becomes 'no report'.
    """
    if drinks == 'socially':
        return drinks
    if drinks == 'rarely' or drinks == 'not at all':
        return 'not much'
    # Exact type comparison: only plain str values count as an answer.
    return 'heavily' if type(drinks) == str else 'no report'
    
# Encode every drinking answer with drinks_encoder.
df2['drinks'] = df['drinks'].apply(drinks_encoder)

In [8]:
def drug_encoder(drugs):
    """Map a drug-use answer to 'non-user', 'user' or 'no report'.

    'never' -> 'non-user'; 'sometimes' or 'often' -> 'user';
    every other value (including NaN) -> 'no report'.
    """
    if drugs == 'never':
        return 'non-user'
    reports_use = drugs == 'often' or drugs == 'sometimes'
    return 'user' if reports_use else 'no report'
    
# Encode every drug-use answer with drug_encoder.
df2['drugs'] = df['drugs'].apply(drug_encoder)

In [9]:
def ed_encoder(ed):
    """Reduce an education answer to a coarse category.

    An answer counts as 'college grad' when it is exactly
    'graduated from college/university' or mentions law school,
    med school, or a masters / ph.d program (every answer containing
    the word "program" is treated as a graduate program).
    Answers mentioning 'space camp' are facetious and are kept in
    their own bucket so they can be excluded later.

    Parameters
    ----------
    ed : str or missing value
        Raw entry from the `education` column.

    Returns
    -------
    str
        'college grad', 'space camp educated', 'not college grad',
        or 'no report' when `ed` is not a string (e.g. NaN).
    """
    try:
        if ed == 'graduated from college/university' or any(
                ed.find(marker) >= 0 for marker in ('law', 'med', 'program')):
            return 'college grad'
        if ed.find('space camp') >= 0:
            return 'space camp educated'
        return 'not college grad'
    except (AttributeError, TypeError):
        # Only "this is not a string" is treated as a missing answer;
        # any other exception is a real problem and must surface.
        return 'no report'

# Encode every education answer with ed_encoder.
df2['education'] = df['education'].apply(ed_encoder)

In [10]:
# TODO: ethnicity is skipped for now -- come back to it?

In [11]:
# Income is carried over unchanged.
df2['income'] = df['income']

In [12]:
def job_encoder(job, top_jobs=None):
    """Keep `job` if it is one of the retained categories, else 'other'.

    Parameters
    ----------
    job : object
        Raw entry from the `job` column (a string, or NaN when missing).
    top_jobs : collection of str, optional
        Categories to keep. When omitted, they are taken from the
        module-level `df` as positions 1 through 10 of
        `df.job.value_counts()`. That lookup is done once and cached on
        the function, instead of being recomputed for every row.
        Re-running this definition clears the cache.

    Returns
    -------
    The unchanged `job` when it is a retained category, otherwise 'other'.
    """
    if top_jobs is None:
        top_jobs = getattr(job_encoder, '_top_jobs', None)
        if top_jobs is None:
            # .iloc[1:11] is positional and therefore skips the single most
            # frequent value.
            # NOTE(review): presumably the most frequent value is 'other'
            # itself, which would make this "the 10 most popular real
            # categories" -- confirm against the data.
            top_jobs = frozenset(df.job.value_counts().iloc[1:11].index)
            job_encoder._top_jobs = top_jobs
    return job if job in top_jobs else 'other'
    
# Encode every job answer with job_encoder.
df2['job'] = df['job'].apply(job_encoder)


In [13]:
# Orientation is carried over unchanged.
df2['orientation'] = df['orientation']

In [14]:
# skip pets because of overlap between dog people and cat people

In [15]:
def religion_encoder(rel):
    """Return the first space-delimited word of a religion answer.

    Parameters
    ----------
    rel : str or missing value
        Raw entry from the `religion` column.

    Returns
    -------
    str
        Everything before the first space in `rel`, or 'no report'
        when `rel` is not a string (e.g. NaN).
    """
    try:
        return rel.split(' ', 1)[0]
    except (AttributeError, TypeError):
        # Only "this is not a string" counts as a missing answer;
        # unrelated exceptions are not swallowed.
        return "no report"

    
# Encode every religion answer with religion_encoder.
df2['religion'] = df['religion'].apply(religion_encoder)

In [16]:
# Collect, in order of first appearance, the distinct phrases that follow
# the first 'and ' in the observed religion answers.
levels = []
for religion in df.religion.value_counts().index:
    parts = religion.split('and ', 1)
    if len(parts) < 2:
        # No 'and ' in this answer, so there is nothing to collect.
        continue
    level = parts[1]
    if level not in levels:
        levels.append(level)

def level_encoder(rel, known_levels=None):
    """Return the first known level phrase contained in a religion answer.

    Parameters
    ----------
    rel : str or missing value
        Raw entry from the `religion` column.
    known_levels : iterable of str, optional
        Candidate phrases, tried in order. Defaults to the module-level
        `levels` list.

    Returns
    -------
    str
        The first phrase of `known_levels` found in `rel` after its first
        character, or 'no report' when `rel` is not a string or contains
        none of the phrases. (Previously the no-match case fell off the
        end of the function and yielded None.)
    """
    if known_levels is None:
        known_levels = levels
    try:
        for level in known_levels:
            # "> 0" (not ">= 0") is kept on purpose: a match at position 0
            # is not accepted, which also rejects an empty phrase.
            if rel.find(level) > 0:
                return level
    except (AttributeError, TypeError):
        # Non-string input such as NaN: treat as missing, but do not
        # swallow unrelated exceptions.
        pass
    return 'no report'

# Derive the religiousness column from the raw religion answers.
df2['religiousness'] = df['religion'].apply(level_encoder)

In [17]:
# Sex is carried over unchanged.
df2['sex'] = df['sex']

In [18]:
def smoking_encoder(smokes):
    """Binarise a smoking answer: 'no' stays 'no', everything else is 'yes'.

    Any reported smoking at all, and any missing answer, is recorded as
    'yes', on the stated assumption that smokers are more likely to skip
    the question or to downplay how much they smoke.
    """
    return 'no' if smokes == 'no' else 'yes'
    
# Encode every smoking answer with smoking_encoder.
df2['smokes'] = df['smokes'].apply(smoking_encoder)

In [19]:
# Persist the encoded frame for Tableau.
ENCODED_CATEGORIES_CSV = '../Assets/Tableau/encoded_categories.csv'
df2.to_csv(ENCODED_CATEGORIES_CSV)

In [20]:
# Save one count table per unordered pair of df2 columns.
# Each table lists every (category of feature1, category of feature2)
# combination together with the number of profiles that have both,
# including combinations that occur zero times.
#
# The pairs are generated from df2.columns itself rather than from a
# hard-coded column count, so no feature is silently left out.
#
# Note: a missing value (NaN/None) never compares equal to anything, so a
# combination involving it is listed with a count of 0.
pair_columns = list(df2.columns)

for i, feature1 in enumerate(pair_columns):
    for feature2 in pair_columns[i + 1:]:
        feature_list = []

        for cat1 in df2[feature1].unique():
            # Hoisted: this mask does not depend on cat2.
            has_cat1 = df2[feature1] == cat1
            for cat2 in df2[feature2].unique():
                # A single combined mask replaces chained boolean indexing.
                count = int((has_cat1 & (df2[feature2] == cat2)).sum())
                feature_list.append([cat1, cat2, count])

        # Build the table for this pair and write it straight to disk;
        # df3 is overwritten on the next iteration.
        df3 = pd.DataFrame(feature_list, columns=[feature1, feature2, 'count'])
        df3.to_csv('../Assets/Tableau/Comparisons/%s_%s.csv' % (feature1, feature2))




In [21]:
# Compute TF-IDF scores over the essays, keeping a 20-term vocabulary
# (max_features=20) with English stop words removed.

from sklearn.feature_extraction.text import TfidfVectorizer

# Read one_long_essay.csv so the essays do not have to be compiled again.
okc = pd.read_csv('../Assets/A/one_long_essay.csv')

# Replace missing essays with empty strings before vectorising.
okc['essays'] = okc['essays'].replace(np.nan, '')

vec = TfidfVectorizer(max_features=20, stop_words='english', encoding='utf-8')
tf = vec.fit_transform(okc['essays'])

In [22]:
# Turn the sparse TF-IDF matrix into a dense frame, one column per term.
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vec, 'get_feature_names_out'):
    tfidf_terms = vec.get_feature_names_out()
else:
    tfidf_terms = vec.get_feature_names()

# Note: `tf` is rebound from the sparse matrix to a DataFrame here, so this
# cell cannot be re-run without first re-running the vectoriser cell.
tf = pd.DataFrame(tf.toarray(), columns=tfidf_terms)

# NOTE(review): concat(axis=1) aligns rows by index label. df2 carries the
# index read from train.csv ('Unnamed: 0'), while `tf` gets a fresh 0..n-1
# index -- confirm the two line up, otherwise rows will be mismatched or
# padded with NaN.
df3 = pd.concat([df2, tf], axis=1)

In [23]:
# Persist the encoded categories together with the TF-IDF columns.
ENCODED_TFIDF_CSV = '../Assets/Tableau/encoded_tfidf.csv'
df3.to_csv(ENCODED_TFIDF_CSV)