In [62]:
import numpy as np
import math
import pandas as pd
import webcolors
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('wordnet')
import json

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\raych\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [63]:
def featureSelect(init_data):
    """Keep only the columns and rows that are useful for gender mining.

    Parameters
    ----------
    init_data : pandas.DataFrame
        Raw data; must contain every column listed below plus ``gender``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns and without rows whose
        ``gender`` is missing, ``'brand'`` or ``'unknown'``.  The original
        row index is preserved (it is not reset here).
    """
    # Crowd-sourcing bookkeeping (unit id, golden flags, judgement metadata)
    # and timestamps/ids that are not relevant for data mining.
    bookkeeping_columns = ['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
                           '_last_judgment_at', 'gender:confidence', 'profile_yn',
                           'profile_yn:confidence', 'created', 'gender_gold',
                           'profile_yn_gold', 'tweet_created', 'tweet_id']
    # tweet_coord: too many missing values.
    # retweet_count: only 2.6% of male & female accounts combined have a value > 0.
    # profileimage, tweet_location, user_timezone: not used for now.
    unused_columns = ['tweet_coord', 'retweet_count', 'profileimage',
                      'tweet_location', 'user_timezone']

    trimmed = init_data.drop(columns=bookkeeping_columns + unused_columns)

    # Rows with a missing, unknown or brand gender are removed
    # (this loses around 1/3 of the data).
    labelled = trimmed.dropna(subset=['gender'])
    is_person = ~labelled['gender'].isin(['brand', 'unknown'])
    return labelled[is_person]


In [64]:
def colorReplacement(init_data):
    """Replace the hexadecimal sidebar/link colours with RGB fractions.

    The ``sidebar_color`` and ``link_color`` columns are each replaced by
    three float columns holding the red/green/blue share as a decimal in
    [0, 1]: ``sred``, ``sgreen``, ``sblue`` and ``lred``, ``lgreen``,
    ``lblue``.

    Any value that is not a six-digit hexadecimal string (``"0"``, a
    missing value, a truncated or otherwise malformed entry) is mapped to
    ``(0.0, 0.0, 0.0)`` instead of raising.

    Parameters
    ----------
    init_data : pandas.DataFrame
        Must contain ``sidebar_color`` and ``link_color``.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the six RGB columns appended
        and the two hex columns removed.
    """
    six_hex_digits = re.compile(r'[0-9A-Fa-f]{6}')

    def rgb_fractions(hex_value):
        # Guard first: NaN/None have no len(), and webcolors raises
        # ValueError on strings that are not valid hex colours.
        if not isinstance(hex_value, str) or six_hex_digits.fullmatch(hex_value) is None:
            return (0.0, 0.0, 0.0)
        percents = webcolors.hex_to_rgb_percent("#" + hex_value.lower())
        # webcolors returns strings such as '50%'; turn them into 0.5.
        return tuple(float(share.rstrip('%')) / 100.0 for share in percents)

    side_rgb = [rgb_fractions(value) for value in init_data['sidebar_color']]
    link_rgb = [rgb_fractions(value) for value in init_data['link_color']]

    # The colour frames carry a default 0..n-1 index, so the data is
    # re-indexed the same way before joining to keep rows aligned.
    init_data = init_data.reset_index(drop=True)
    side_colors = pd.DataFrame(side_rgb, columns=['sred', 'sgreen', 'sblue'])
    link_colors = pd.DataFrame(link_rgb, columns=['lred', 'lgreen', 'lblue'])
    init_data = init_data.join(side_colors).join(link_colors)
    return init_data.drop(['sidebar_color', 'link_color'], axis=1)


In [65]:
def textProcess(init_data, checkifTestSet):
    """Clean the ``description`` and ``text`` columns of every record.

    Each value is stripped of @mentions, non-letter characters and links,
    lower-cased, tokenised on whitespace, and every token is both
    lemmatised (WordNet) and stemmed (Porter).  Tokens whose lemma / stem
    appears in the stop-word list are discarded.

    Parameters
    ----------
    init_data : pandas.DataFrame
        Must contain ``description`` and ``text``; ``gender`` is required
        when ``checkifTestSet`` is false.
    checkifTestSet : bool
        False for the training data: the gendered vocabularies are rebuilt
        from this frame, stored in the module-level globals and written to
        JSON.  True for any other data: the globals and the JSON files are
        left untouched so the vocabularies learned from the training data
        stay in effect.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``description`` and ``text`` are replaced by
        ``Lemma_description``, ``Lemma_text``, ``stem_description`` and
        ``stem_text`` (space-joined cleaned tokens).

    Side effects
    ------------
    Reads ``longstopwordList.txt`` from the working directory.  For the
    training data it also prints a progress message, assigns the six
    global word sets and writes six ``*.json`` word-list files.
    """
    global lemma_intersect, Lemma_femaleWords, Lemma_maleWords
    global stem_intersect, Stem_femaleWords, Stem_maleWords

    build_vocabulary = not checkifTestSet
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    # Remove symbols from the stop-word list
    # (stop words obtained from https://www.ranks.nl/stopwords).
    with open("longstopwordList.txt") as stopword_file:
        stopwordsSet = {re.sub(r'[^\w]', '', word)
                        for word in stopword_file.read().splitlines()}

    # Raw string: same regex as before, without invalid-escape warnings.
    pattern = re.compile(r"(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)|^RT|http.+?")

    def clean_tokens(raw_value):
        """Return (lemmas, stems) of one raw string, stop words removed."""
        tokens = pattern.sub('', raw_value).lower().split()
        lemmas = [lemmatizer.lemmatize(token) for token in tokens]
        stems = [stemmer.stem(token) for token in tokens]
        return ([word for word in lemmas if word not in stopwordsSet],
                [word for word in stems if word not in stopwordsSet])

    # Missing values become empty strings; done on local Series so the
    # caller's frame is not modified.
    descriptions = init_data['description'].fillna('')
    texts = init_data['text'].fillna('')
    genders = init_data['gender'].tolist() if build_vocabulary else None

    lemma_descriptions, lemma_texts = [], []
    stem_descriptions, stem_texts = [], []
    lemma_female, lemma_male = set(), set()
    stem_female, stem_male = set(), set()

    # Positional iteration: works whatever the frame's index looks like.
    for position, (raw_description, raw_text) in enumerate(zip(descriptions, texts)):
        lemma_d, stem_d = clean_tokens(raw_description)
        lemma_t, stem_t = clean_tokens(raw_text)

        lemma_descriptions.append(' '.join(lemma_d))
        lemma_texts.append(' '.join(lemma_t))
        stem_descriptions.append(' '.join(stem_d))
        stem_texts.append(' '.join(stem_t))

        if build_vocabulary:
            # Every record that is not labelled 'female' counts as male.
            if genders[position] == 'female':
                lemma_female.update(lemma_d, lemma_t)
                stem_female.update(stem_d, stem_t)
            else:
                lemma_male.update(lemma_d, lemma_t)
                stem_male.update(stem_d, stem_t)

    # Lists are assigned by position, so no index alignment is involved.
    result = init_data.drop(['description', 'text'], axis=1).assign(
        Lemma_description=lemma_descriptions,
        Lemma_text=lemma_texts,
        stem_description=stem_descriptions,
        stem_text=stem_texts,
    )

    if build_vocabulary:
        print("Generating word lists")
        # Words used by both genders go to the intersection set and are
        # removed from the gender-specific sets.
        lemma_intersect = lemma_female & lemma_male
        Lemma_femaleWords = lemma_female - lemma_intersect
        Lemma_maleWords = lemma_male - lemma_intersect

        stem_intersect = stem_female & stem_male
        Stem_femaleWords = stem_female - stem_intersect
        Stem_maleWords = stem_male - stem_intersect

        word_list_files = (
            ('LemmaFemaleWordList.json', Lemma_femaleWords),
            ('LemmaMaleWordList.json', Lemma_maleWords),
            ('StemFemaleWordList.json', Stem_femaleWords),
            ('StemMaleWordList.json', Stem_maleWords),
            ('LemmaIntersectionWord.json', lemma_intersect),
            ('StemIntersectionWord.json', stem_intersect),
        )
        for file_name, words in word_list_files:
            with open(file_name, 'w') as fp:
                # Sorted so the files are identical from run to run.
                json.dump(sorted(words), fp, indent=4)
    return result

In [66]:
def generateGenderTextCount(init_data, word_sets=None):
    """Count male-only, female-only and shared words for every record.

    Six integer columns are added: ``LMalecount``, ``LFemalecount``,
    ``Lintersectcount`` (from the lemmatised text) and ``SMalecount``,
    ``SFemalecount``, ``Sintersectcount`` (from the stemmed text).  For each
    record the words of the description and the text are pooled; a word is
    checked against the shared set first, then the female set, then the
    male set, and counted at most once per occurrence.

    Parameters
    ----------
    init_data : pandas.DataFrame
        Must contain ``Lemma_description``, ``Lemma_text``,
        ``stem_description`` and ``stem_text`` holding space-separated words.
    word_sets : dict, optional
        Vocabularies to count against, keyed ``'lemma_intersect'``,
        ``'Lemma_femaleWords'``, ``'Lemma_maleWords'``, ``'stem_intersect'``,
        ``'Stem_femaleWords'`` and ``'Stem_maleWords'``.  When omitted, the
        module-level globals of the same names are used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the six count columns appended; the index of
        ``init_data`` is kept as is.
    """
    if word_sets is None:
        word_sets = {
            'lemma_intersect': lemma_intersect,
            'Lemma_femaleWords': Lemma_femaleWords,
            'Lemma_maleWords': Lemma_maleWords,
            'stem_intersect': stem_intersect,
            'Stem_femaleWords': Stem_femaleWords,
            'Stem_maleWords': Stem_maleWords,
        }

    def count_matches(descriptions, texts, shared, female, male):
        """Return per-record (male, female, shared) count lists."""
        male_counts, female_counts, shared_counts = [], [], []
        # Positional iteration: independent of the frame's index labels.
        for description, text in zip(descriptions, texts):
            n_male = n_female = n_shared = 0
            for word in description.split() + text.split():
                if word in shared:
                    n_shared += 1
                elif word in female:
                    n_female += 1
                elif word in male:
                    n_male += 1
            male_counts.append(n_male)
            female_counts.append(n_female)
            shared_counts.append(n_shared)
        return male_counts, female_counts, shared_counts

    lemma_male, lemma_female, lemma_shared = count_matches(
        init_data['Lemma_description'], init_data['Lemma_text'],
        word_sets['lemma_intersect'], word_sets['Lemma_femaleWords'],
        word_sets['Lemma_maleWords'])
    stem_male, stem_female, stem_shared = count_matches(
        init_data['stem_description'], init_data['stem_text'],
        word_sets['stem_intersect'], word_sets['Stem_femaleWords'],
        word_sets['Stem_maleWords'])

    # Lists are assigned by position, so rows stay matched for any index.
    return init_data.assign(
        LMalecount=lemma_male,
        LFemalecount=lemma_female,
        Lintersectcount=lemma_shared,
        SMalecount=stem_male,
        SFemalecount=stem_female,
        Sintersectcount=stem_shared,
    )

In [67]:
def nameProcessing(init_data):
    """Add the number of vowels and consonants found in each ``name``.

    The name is stripped of @mentions, non-letter characters and links,
    lower-cased, and its remaining letters are counted.  Two integer
    columns are appended: ``vowel`` (letters in "aeiou") and ``constant``
    (all other letters, i.e. the consonant count; the column name is kept
    as is for compatibility with existing output files).

    Last letter of female & male names obtained from
    https://home.uchicago.edu/~jsfalk/misc/baby_names/

    Parameters
    ----------
    init_data : pandas.DataFrame
        Must contain a ``name`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``vowel`` and ``constant`` appended; the index of
        ``init_data`` is kept as is.
    """
    # Raw string: same regex as before, without invalid-escape warnings.
    pattern = re.compile(r"(@[A-Za-z]+)|([^A-Za-z \t])|(\w+:\/\/\S+)|^RT|http.+?")
    vowels = set("aeiou")

    vowel_counts = []
    consonant_counts = []
    for name in init_data['name']:
        # A missing name (NaN/None) is treated as empty instead of crashing re.
        raw_name = name if isinstance(name, str) else ''
        # The pattern keeps spaces and tabs; only real letters are counted.
        letters = [char for char in pattern.sub('', raw_name).lower() if char.isalpha()]
        n_vowels = sum(char in vowels for char in letters)
        vowel_counts.append(n_vowels)
        consonant_counts.append(len(letters) - n_vowels)

    # Lists are assigned by position, so rows stay matched for any index.
    return init_data.assign(vowel=vowel_counts, constant=consonant_counts)

In [68]:
def replacements(init_data):
    """Log-transform ``fav_number`` and ``tweet_count``.

    The natural log is applied to reduce the range of the two numeric
    attributes and limit skew.  Zeros are replaced by 1 beforehand so the
    log is defined; such entries therefore still end up as 0.

    Parameters
    ----------
    init_data : pandas.DataFrame
        Must contain ``fav_number`` and ``tweet_count``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``init_data`` with the two columns transformed; the
        input frame is left unchanged.
    """
    transformed = init_data.copy()
    for column in ('fav_number', 'tweet_count'):
        # Assign the result back explicitly: an in-place replace on a column
        # selected from the frame is not guaranteed to reach the frame.
        transformed[column] = np.log(transformed[column].replace(0, 1))
    return transformed



In [69]:
# Load the training data, clean it and generate the gendered word sets.
# (read_csv already takes the column names from the header row, so no
# renaming is needed afterwards.)
training_data = pd.read_csv("gender-classifier-DFE-791531.csv", engine="python")

# Module-level vocabularies: textProcess() assigns them through its `global`
# declarations and generateGenderTextCount() reads them.
lemma_intersect = set()
Lemma_femaleWords = set()
Lemma_maleWords = set()
stem_intersect = set()
Stem_femaleWords = set()
Stem_maleWords = set()

training_data = featureSelect(training_data)
training_data = colorReplacement(training_data)
training_data = textProcess(training_data, False)   # False: build the word sets from this data
training_data = generateGenderTextCount(training_data)
training_data = nameProcessing(training_data)
training_data = replacements(training_data)

# Write all the new data back to csv
training_data.to_csv('clean_data.csv', encoding='utf-8', index=False)

Generating word lists


In [70]:
# Load & clean the real dataset.
# Modify this line to the name of the file being tested.
# (read_csv already takes the column names from the header row, so no
# renaming is needed afterwards.)
test_set = pd.read_csv('unclean_sample.csv', engine="python")
test_set = featureSelect(test_set)
test_set = colorReplacement(test_set)
test_set = textProcess(test_set, True)   # True: this is not the training data
test_set = generateGenderTextCount(test_set)
test_set = nameProcessing(test_set)
test_set = replacements(test_set)
test_set.to_csv('realdata.csv', encoding='utf-8', index=False)


In [71]:
# Per-gender mean of every numeric feature in the cleaned training data.
# numeric_only=True skips the string columns that are still in the frame
# (gender, name, the cleaned text columns), which cannot be averaged.
female_data = training_data[training_data.gender == 'female']
fmean = female_data.mean(numeric_only=True)
male_data = training_data[training_data.gender == 'male']
mmean = male_data.mean(numeric_only=True)
print(fmean)
print(mmean)

fav_number          6.637782
tweet_count         8.881898
sred                0.620678
sgreen              0.666835
sblue               0.689925
lred                0.314044
lgreen              0.426762
lblue               0.566201
LMalecount          0.000000
LFemalecount        2.172687
Lintersectcount     9.214030
SMalecount          0.000000
SFemalecount        1.914328
Sintersectcount    10.102090
vowel               4.064030
constant            6.528060
dtype: float64
fav_number          6.214485
tweet_count         8.837328
sred                0.625360
sgreen              0.687410
sblue               0.711276
lred                0.192417
lgreen              0.440350
lblue               0.568305
LMalecount          2.718760
LFemalecount        0.000000
Lintersectcount    10.312561
SMalecount          2.371650
SFemalecount        0.000000
Sintersectcount    11.278495
vowel               3.551179
constant            6.501614
dtype: float64


In [72]:
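# Generate a sample unclean file to test the functions above.
# NOTE(review): `unclean2` (the first 10000 rows) is built, but the full `unclean`
# frame is what gets written below; confirm which one was intended.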
# unclean = pd.read_csv("gender-classifier-DFE-791531.csv",engine="python");
# unclean2 = unclean.head(10000)
# unclean.to_csv('unclean_sample.csv', encoding='utf-8', index=False)

In [73]:
# Everything below is commented-out scratch code from the initial exploratory analysis.
# training_data = pd.read_csv("gender-classifier-DFE-791531.csv",engine="python");
# column_names = training_data.iloc[0]
# training_data.rename(index=str, columns=column_names)
# #check na values
# training_data.isnull().sum()
# drop_attributes = ['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
#                    '_last_judgment_at','gender:confidence', 'profile_yn',
#                    'profile_yn:confidence', 'created','gender_gold','profile_yn_gold',
#                    'tweet_created', 'tweet_id']
# training_data = training_data.drop(drop_attributes,axis=1)
# #Remove attributes with too many miss values - tweet_coord
# training_data = training_data.drop(['tweet_coord'],axis=1)
# #Remove rows with missing gender, unknown gender , or is brand gender - lose around 1/3 of the data
# training_data = training_data.dropna(subset=['gender'])
# training_data = training_data[training_data.gender != 'brand']
# training_data = training_data[training_data.gender != 'unknown']

# # #drop retweet count because only 2.6% of male & female accounts combined have value > 0
# # training_data = training_data.drop(['retweet_count'],axis=1)
# training_data.shape

In [74]:
# print(training_data['fav_number'].max())
# print(training_data['tweet_count'].max())