# Data Filter
### The purpose of this notebook is four-fold:
1) Filter data to only the relevant rows

2) Delete the unnecessary columns

3) Suitably edit the text to allow for topic modeling

4) Create new variables to assist with demographic comparisons of topics


In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
# For Data Cleaning
from bs4 import BeautifulSoup
from split_utils import *
from text_complexity_utils import get_npoly, get_flesch

#General Imports
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the raw profiles and keep only the subset under study:
# single, straight men.
df = pd.read_csv('../profiles.csv/profiles.csv')
df = df.loc[
    df['sex'].eq("m")
    & df['orientation'].eq("straight")
    & df['status'].eq("single")
]

In [3]:
# Sanity check: (rows, columns) left after the sex/orientation/status filter.
df.shape

(29163, 31)

In [4]:
# Some of the essays have just a link in the text. BeautifulSoup sees that and gets
# the wrong idea, raising a UserWarning. This line hides those warnings.
# NOTE(review): the import cell already calls warnings.filterwarnings('ignore'),
# which silences every warning, so this narrower filter only matters if that
# blanket call is removed.
warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
def clean(text):
    """
    Normalise one essay: strip HTML markup, lowercase, and tidy whitespace.

    Parameters
    ----------
    text : str or null
        Raw essay text, possibly containing HTML.

    Returns
    -------
    str or float
        The cleaned text, or ``np.nan`` when the input is null or when nothing
        but whitespace remains after cleaning.
    """
    if pd.isnull(text):
        return np.nan

    t = BeautifulSoup(text, 'lxml').get_text().lower()

    # Turn every run of whitespace (newlines, tabs, carriage returns, repeated
    # spaces) into a single space. Deleting newlines/tabs outright would glue
    # the last word of one line onto the first word of the next.
    t = ' '.join(t.split())

    # Remove URL fragments.
    # NOTE(review): '\nnan' can never match here because newlines have already
    # been normalised away (this was also true before); it is kept only so the
    # list of removed fragments is unchanged.
    bad_words = ['http', 'www', '\nnan']
    for b in bad_words:
        t = t.replace(b, '')

    # Removing fragments can leave only whitespace behind; treat that as empty.
    t = t.strip()

    return t if t else np.nan

# Clear out all HTML and unnecessary characters from essay0, row by row.
# progress_apply is the tqdm-enabled version of apply (registered by tqdm.pandas()).
df['essay0'] = df['essay0'].progress_apply(clean)

100%|██████████████████████████████████████████████████████████████████████████| 29163/29163 [00:12<00:00, 2302.66it/s]


In [5]:
# Keep only the profiles that have a value for every required field, then
# discard the columns that are not used in the rest of the notebook.
must_haves = ['body_type', 'height', 'education', 'ethnicity', 'sex', 'essay0']
df = (
    df.dropna(subset=must_haves)
      .drop(columns=[
          'essay1', 'essay2', 'essay3', 'essay4', 'essay5',
          'essay6', 'essay7', 'essay8', 'essay9',
          'income', 'job', 'last_online', 'location', 'offspring',
          'orientation', 'pets', 'religion', 'sex', 'sign',
          'smokes', 'speaks', 'status',
          'diet', 'drinks', 'drugs',
      ])
)

In [6]:
# Sanity check: (rows, columns) left after dropping incomplete rows and unused columns.
df.shape

(20576, 6)

### Fix Conjoined Words

### CREATING NEW COLUMNS


Many of the functions in this section are adapted, with specific modifications, from:
https://github.com/UM-CSS/CSSLabs-NLP/blob/master/1_Data_munging.ipynb

In [7]:
def recode(text, dictionary, default=np.nan):
    """Recode a value into a category using exact string matches.

    `dictionary` maps each category label to a list of strings. `text` is
    converted with str() and compared to every listed string; the label of
    the first list containing an equal string is returned (labels are tried
    in the dictionary's iteration order). If nothing matches, `default` is
    returned.
    """
    needle = str(text)
    for label, candidates in dictionary.items():
        if any(candidate == needle for candidate in candidates):
            return label
    return default

def recode_fuzzy(text, dictionary, default=np.nan):
    """Recode a value into a category using substring (partial) matches.

    `dictionary` maps each category label to a list of fragments. `text` is
    converted with str(); the label of the first list holding a fragment that
    occurs anywhere inside it is returned (labels are tried in the
    dictionary's iteration order). If nothing matches, `default` is returned.
    """
    haystack = str(text)
    for label, fragments in dictionary.items():
        if any(fragment in haystack for fragment in fragments):
            return label
    return default

In [8]:
# Education: lookup from a coarse level to the raw `education` strings that
# belong to it. The strings are compared exactly, so a raw value that is not
# listed here receives the caller's default category.
ed_levels = {
    'High School or less': [
        'dropped out of high school',
        'working on high school',
        'graduated from high school',
        'working on college/university',
        'two-year college',
        'dropped out of college/university',
        'high school',
    ],
    'More than High School': [
        'graduated from college/university',
        'working on masters program',
        'working on ph.d program',
        'college/university',
        'working on law school',
        'dropped out of masters program',
        'dropped out of ph.d program',
        'dropped out of law school',
        'dropped out of med school',
        'graduated from masters program',
        'graduated from ph.d program',
        'graduated from law school',
        'graduated from med school',
        'masters program',
        'ph.d program',
        'law school',
        'med school',
    ],
}

#body type
# NOTE(review): the profiles data appears to spell this category 'curvy';
# with exact matching the original 'curvey' entry would never hit, so 'curvy'
# is added. 'curvey' is kept so no previously matching value is lost.
bodies = {
    'fit': ['fit', 'athletic', 'jacked'],
    'not_fit': ['average', 'thin', 'skinny', 'curvy', 'curvey', 'a little extra',
                'full figured', 'overweight', 'rather not say', 'used up'],
}

In [9]:
# Collapse the raw education and body-type answers into coarse groups;
# any answer not listed in the lookup is labelled 'unknown'.
df['edu'] = df['education'].apply(
    lambda raw: recode(raw, ed_levels, default='unknown')
)
df['fit'] = df['body_type'].apply(
    lambda raw: recode(raw, bodies, default='unknown')
)

In [10]:
# race/ethnicity for exact matching
ethn = {
    'White': ['white', 'middle eastern', 'middle eastern, white'],
    'Asian': ['asian', 'indian', 'asian, pacific islander'],
    'Black': ['black'],
}

# race/ethnicity for fuzzy matching
# (key order matters: the first key with a matching fragment wins)
ethn2 = {
    'Latinx': ['latin'],
    'multiple': [','],
    np.nan: ['nan'],
}

In [11]:
def census_2010_ethnicity(t):
    """Map a raw ethnicity answer to a single race/ethnicity category.

    The exact-match lookup `ethn` is tried first. Only when it yields no
    category is the substring lookup `ethn2` consulted. Answers matched by
    neither are labelled 'other'.
    """
    answer = str(t)
    exact = recode(answer, ethn, default='other')
    if exact != 'other':
        return exact
    return recode_fuzzy(answer, ethn2, default='other')

# Derive the race/ethnicity category for every profile from its raw ethnicity answer.
df['race_ethnicity'] = df.ethnicity.apply(census_2010_ethnicity)

In [12]:
def height_check(inches, threshold=69):
    """Classify a height as 'short' or 'not_short'.

    Parameters
    ----------
    inches : number
        Height to classify.
    threshold : number, default 69
        Largest height still labelled 'short' (the comparison is inclusive).

    Returns
    -------
    str
        'short' when ``inches <= threshold``, otherwise 'not_short'. A NaN
        height compares False and therefore yields 'not_short'.
    """
    return 'short' if inches <= threshold else 'not_short'
# Make sure height is numeric before it is compared with the cutoff,
# then label every profile as 'short' or 'not_short'.
df['height'] = pd.to_numeric(df['height'])
df['height_group'] = df.height.apply(height_check)

In [13]:
# Now drop the original variables; only their recoded versions are needed from here on.
# Reassign instead of using inplace=True, matching the other drop in this notebook.
df = df.drop(columns=['body_type', 'ethnicity', 'height', 'education'])

In [14]:
# Checkpoint: save the filtered, recoded frame before the text-editing steps below.
# NOTE(review): no index argument is passed, so the row index is presumably
# written as an extra leading column — confirm that is wanted.
df.to_csv('profiles_filtered.csv')

In [15]:
# Sanity check: (rows, columns) after adding the recoded columns and dropping the originals.
df.shape

(20576, 6)

## TEXT EDITING

In [None]:
# First, fix conjoined words in the essay.
# WARNING: this step is very slow. An earlier estimate of "up to 10 minutes" does
# not match the recorded run below, which was at 17.60 s/it after 272 of 20576
# rows (more than an hour elapsed, roughly 99 hours projected).
df['essay0'] = df['essay0'].progress_apply(split_incorrect)

  1%|▉                                                                        | 272/20576 [1:07:23<99:14:37, 17.60s/it]

In [None]:
# Text-complexity feature computed per essay by get_npoly (from text_complexity_utils).
# presumably a count of polysyllabic ("long") words — TODO confirm against the helper.
df['long_words'] = df['essay0'].progress_apply(get_npoly)

In [None]:
# Text-complexity feature computed per essay by get_flesch (from text_complexity_utils).
# presumably a Flesch readability score — TODO confirm against the helper.
df['flesch'] = df['essay0'].progress_apply(get_flesch)

In [None]:
# Save the final frame, including the edited essay text and the complexity features.
df.to_csv('compressed_okcupid.csv')