In [2]:
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import words as nltk_words
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline , FeatureUnion
from sklearn.metrics import *
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
import re
import sys
import string
from time import time
import numpy as np
import itertools
from sklearn import metrics
# Relative path of the CSV file that the loading cell passes to pd.read_csv.
DATASET_PATH = 'gender-classifier-DFE-791531.csv'
# English stop-word set built once from the NLTK corpus.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available locally - confirm.
stop = set(stopwords.words('english'))
# Single TweetTokenizer instance shared by the cleaning helpers and the Keras feature-extraction cells.
tokenizer = TweetTokenizer()



In [3]:
# Read the raw export with latin1 decoding and wrap it in a DataFrame.
data = pd.read_csv(DATASET_PATH, index_col=False, encoding='latin1')
df = pd.DataFrame(data)

# Throw away the bookkeeping / metadata columns that no later cell reads.
df.drop(
    columns=[
        '_golden', '_unit_state', '_trusted_judgments', '_last_judgment_at',
        'profile_yn', 'profile_yn:confidence', 'created', 'fav_number',
        'gender_gold', 'profile_yn_gold', 'profileimage', 'retweet_count',
        'tweet_coord', 'tweet_count', 'tweet_created', 'tweet_id',
        'tweet_location', 'user_timezone',
    ],
    inplace=True,
)

# Show the first ten remaining rows as the cell output.
df.head(10)

Unnamed: 0,_unit_id,gender,gender:confidence,description,link_color,name,sidebar_color,text
0,815719226,male,1.0,i sing my own rhythm.,08C2C2,sheezy0,FFFFFF,Robbie E Responds To Critics After Win Against...
1,815719227,male,1.0,I'm the author of novels filled with family dr...,0084B4,DavdBurnett,C0DEED,â°ï¢Ö¿It felt like they were my friends and ...
2,815719228,male,0.6625,louis whining and squealing and all,ABB8C2,lwtprettylaugh,C0DEED,i absolutely adore when louis starts the songs...
3,815719229,male,1.0,"Mobile guy. 49ers, Shazam, Google, Kleiner Pe...",0084B4,douggarland,C0DEED,Hi @JordanSpieth - Looking at the url - do you...
4,815719230,female,1.0,Ricky Wilson The Best FRONTMAN/Kaiser Chiefs T...,3B94D9,WilfordGemma,0,Watching Neighbours on Sky+ catching up with t...
5,815719231,female,1.0,you don't know me.,F5ABB5,monroevicious,0,"Ive seen people on the train with lamps, chair..."
6,815719232,brand,1.0,"A global marketplace for images, videos and mu...",298AAE,Shutterstock,0,@BpackEngineer Thank you for your patience whi...
7,815719233,male,1.0,The secret of getting ahead is getting started.,0000FF,RobinMeske,C0DEED,Gala Bingo clubs bought for ×Â£241m: The UK's...
8,815719234,female,1.0,Pll Fan // Crazy about MCD // Ramen is bae,9266CC,pigzilla_,0,@_Aphmau_ the pic defines all mcd fangirls/fan...
9,815719235,female,1.0,"Renaissance art historian, University of Notti...",9266CC,GabrieleNeher,FFFFFF,@Evielady just how lovely is the tree this yea...


In [4]:
def fix_nan_description(description):
    """Normalise a profile description, mapping missing values to ''.

    Parameters
    ----------
    description : object
        A cell value from the 'description' column. Missing cells arrive as
        non-string values (for example a float NaN).

    Returns
    -------
    str
        '' when the value is not a string or when it is the literal text
        'nan' (ignoring surrounding whitespace); otherwise the description
        unchanged.
    """
    # Any non-text value counts as "no description". isinstance covers plain
    # floats, float subclasses and None, where an exact type check would let
    # some of them fall through to .strip() and raise AttributeError.
    if not isinstance(description, str):
        return ''
    # A stringified missing value ('nan', possibly padded) is also empty.
    if description.strip() == 'nan':
        return ''
    return description

#for example:
fix_nan_description('nan ')

''

In [5]:
def stem(text):
    """Tokenize ``text`` with the shared tokenizer and Porter-stem each token.

    Returns the stemmed tokens joined by single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in tokenizer.tokenize(text))

#for example:
stem('regardless of context, their affection seemed non-existant')

def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is split with the shared tweet tokenizer, every token whose
    lowercase form is in the module-level ``stop`` set is discarded, and the
    remaining tokens are joined back with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet or description text.

    Returns
    -------
    str
        The text without stop words.
    """
    # Reuse the stop-word set built once at the top of the notebook instead of
    # re-reading the NLTK corpus on every call (this runs once per row).
    # NLTK's English list is lowercase, so compare case-insensitively;
    # otherwise "The" / "And" survive whenever lowercasing is switched off.
    kept = [token for token in tokenizer.tokenize(text) if token.lower() not in stop]

    return ' '.join(kept)

#for example:
remove_stop_words('regardless of context, their affection seemed non-existant')

def remove_punctuation(text):
    """Remove stand-alone punctuation tokens from ``text``.

    The text is split with the shared tweet tokenizer; tokens that are exactly
    one character from ``string.punctuation`` are dropped and the rest are
    joined with single spaces. Multi-character tokens are kept as they are.

    Parameters
    ----------
    text : str
        Raw tweet or description text.

    Returns
    -------
    str
        The text without single-character punctuation tokens.
    """
    # Membership must be tested against the individual characters. Testing a
    # token against the string itself is a substring check, which also drops
    # any multi-character token that happens to be a contiguous run of
    # string.punctuation (e.g. '/:' or '()').
    punctuation_chars = frozenset(string.punctuation)
    tokens = [token for token in tokenizer.tokenize(text) if token not in punctuation_chars]
    return ' '.join(tokens)


def remove_url(text):
    """Delete every ``http:`` / ``https:`` link from ``text``.

    A link runs from the scheme up to the next whitespace character; the
    surrounding whitespace is left in place.
    """
    link_pattern = re.compile(r'https?:\S+')
    return link_pattern.sub('', text)

#for example:
remove_url('for more info on http, enter http://www.http.com (lo beshabat)')

def remove_hashtags(text,mode):
    """Strip hashtags from ``text`` according to ``mode``.

    mode == 'only_symbol'       -> delete every '#' character, keep the word.
    mode == 'entire_expression' -> delete '#' together with the non-space
                                   characters that follow it.
    any other mode              -> return ``text`` unchanged.
    """
    if mode == 'only_symbol':
        return text.replace('#','')
    if mode == 'entire_expression':
        hashtag_pattern = re.compile(r'#\S+')
        return hashtag_pattern.sub('', text)
    return text

#for example:
remove_hashtags('what did you think about the # of voters this year? use #VotersUnite to let us know.','entire_expression')

remove_hashtags('#fridaymadness #nature #fun #outdoor #originalperson','only_symbol')

def remove_ats(text,mode):
    """Strip @-mentions from ``text`` according to ``mode``.

    mode == 'only_symbol'       -> convert ``text`` to str and delete every
                                   '@' character, keeping the handle.
    mode == 'entire_expression' -> delete '@' together with the non-space
                                   characters that follow it.
    any other mode              -> return ``text`` unchanged.
    """
    if mode == 'only_symbol':
        as_text = str(text)
        return as_text.replace('@','')
    if mode == 'entire_expression':
        mention_pattern = re.compile(r'@\S+')
        return mention_pattern.sub('', text)
    return text

#for example:
remove_ats('@ameliaearhart vs @WrightBrothers in an all exclusive fly-off','entire_expression')

remove_ats('@ameliaearhart vs @WrightBrothers in an all exclusive fly-off','only_symbol')

def remove_multiple_spaces(text):
    """Collapse each run of space characters into one space and trim the ends.

    Only the space character is collapsed; the final ``strip()`` removes any
    leading or trailing whitespace.
    """
    space_run = re.compile(r' {2,}')
    collapsed = space_run.sub(' ', text)
    return collapsed.strip()

#for example:
remove_multiple_spaces(' @ameliaearhart       vs   @WrightBrothers in an all   exclusive fly-off ')

'@ameliaearhart vs @WrightBrothers in an all exclusive fly-off'

In [6]:
def preprocess(df, min_confidence=0.9, should_lower=False, should_stem=False, should_remove_stop_words=False, should_remove_url=False,
                            should_remove_punctuation=False,should_remove_hashtags='none',should_remove_ats='none'):
    """Filter the raw records and clean their 'description' and 'text' columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'gender', 'gender:confidence',
        'description' and 'text'. It is not modified.
    min_confidence : float
        Rows whose 'gender:confidence' is below this value are dropped.
    should_lower, should_stem, should_remove_stop_words, should_remove_url,
    should_remove_punctuation : bool
        Switch the corresponding cleaning step on.
    should_remove_hashtags, should_remove_ats : str
        Mode forwarded to ``remove_hashtags`` / ``remove_ats``; 'none' skips
        the step.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with cleaned 'description' and
        'text' columns plus a 'text_desc' column (description + ' ' + text).
    """
    # Build the complete row filter against `df` itself so the mask and the
    # frame it indexes always share one index (a mask computed on `df` and
    # applied to an already-filtered frame has to be silently reindexed).
    # Real missing values are rejected as well as the literal string 'nan':
    # a float NaN compares unequal to 'nan' and would otherwise slip through.
    keep_rows = (
        df['gender'].notna() & (df['gender'] != 'nan') & (df['gender'] != 'unknown')
        & df['text'].notna() & (df['text'] != 'nan')
        & (df['gender:confidence'] >= min_confidence)
    )

    # Work on an explicit copy: the column assignments below must not write
    # into (or warn about) a slice of the caller's frame.
    cleaner_df = df.loc[keep_rows].copy()

    cleaner_df['description'] = [fix_nan_description(desc) for desc in cleaner_df['description']]

    #removing spam tweets: identical tweet texts are kept only once
    cleaner_df = cleaner_df.drop_duplicates(subset='text', keep='first')

    # Collect the enabled cleaning steps in their fixed order; each one is a
    # str -> str function applied to both text columns.
    steps = []
    if should_lower:
        steps.append(lambda value: value.lower())
    if should_remove_punctuation:
        steps.append(remove_punctuation)
    if should_remove_url:
        steps.append(remove_url)
    if should_remove_hashtags!='none':
        steps.append(lambda value: remove_hashtags(value,should_remove_hashtags))
    if should_remove_ats!='none':
        steps.append(lambda value: remove_ats(value,should_remove_ats))
    if should_remove_stop_words:
        steps.append(remove_stop_words)
    if should_stem:
        steps.append(stem)
    # Always last: earlier removals leave runs of spaces behind.
    steps.append(remove_multiple_spaces)

    for step in steps:
        for column in ('description', 'text'):
            cleaner_df[column] = [step(value) for value in cleaner_df[column]]

    cleaner_df['text_desc'] = [desc+' '+text for desc,text in zip(cleaner_df['description'],cleaner_df['text'])]

    return cleaner_df

In [7]:
# Run the cleaning pipeline with its default settings and report how many rows survive.
clean_df = preprocess(df)
print('Number of records: {}'.format(clean_df['text'].shape[0]))

Number of records: 13165


  import sys


In [8]:
import keras
import keras.preprocessing.text as kpt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

# Integer class id for every gender label used by the network.
gender_to_int = {'female':0,'male':1,'brand':2}

#same split as with the above model: the first 90% of the rows (by position) are the training set
x_train = clean_df['text_desc'].iloc[:int(0.9*len(clean_df))]

# Labels for the same rows, first as class ids, then one-hot encoded over the 3 classes.
y_train = [gender_to_int[gender] for gender in clean_df['gender'].iloc[:int(0.9*len(clean_df))]]
y_train = keras.utils.to_categorical(y_train, 3)


Using Theano backend.


ModuleNotFoundError: No module named 'theano'

In [None]:
N_WORDS = 10000

# The keras tokenizer is configured to do close to nothing in terms of tokenization:
# no character filtering, split on single spaces, vocabulary capped at N_WORDS.
keras_tokenizer = kpt.Tokenizer(num_words=N_WORDS, filters='', lower=True, split=' ')

# Pre-tokenize with the tweet tokenizer and re-join, so a plain split on ' ' recovers the tokens.
texts_for_keras_tokenizer = [' '.join(tokenizer.tokenize(text)) for text in x_train]

keras_tokenizer.fit_on_texts(texts_for_keras_tokenizer)

# Binary bag-of-words matrix built from the training texts.
x_train = keras_tokenizer.texts_to_matrix(texts_for_keras_tokenizer, mode='binary')


#todo optimize this
model = Sequential([
    # takes N_WORDS inputs (the BOW representation of any tweet) and outputs 512 values
    Dense(512, input_shape=(N_WORDS,), activation='relu'),
    Dropout(0.5),
    Dense(256, activation='relu'),
    Dropout(0.5),
    # a distribution over all 3 genders (probability of the input belonging to each of them)
    Dense(3, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',  optimizer='adam',  metrics=['accuracy'])

#we are using the x,y we constructed earlier,
model.fit(x_train, y_train,  batch_size=32,  epochs=10,  verbose=1,  validation_split=0.1,  shuffle=True)


In [None]:
# The last 10% of the rows (by position) form the held-out test set.
x_test = clean_df['text_desc'].iloc[int(0.9*len(clean_df)):]

# Test labels stay as plain class ids.
y_test = [gender_to_int[gender] for gender in clean_df['gender'].iloc[int(0.9*len(clean_df)):]]

# Same pre-tokenization as for training, then encode with the already-fitted keras tokenizer.
texts_for_keras_tokenizer = [' '.join(tokenizer.tokenize(text)) for text in x_test]
x_test = keras_tokenizer.texts_to_matrix(texts_for_keras_tokenizer, mode='binary')

keras_pred = model.predict(x_test)

In [None]:
# Reverse lookup from class id to gender label.
int_to_gender = dict(enumerate(['female', 'male', 'brand']))

# The predicted class of a row is the index of its largest output value.
actual_pred = [np.argmax(values) for values in keras_pred]

#todo write things
print(classification_report(y_test, actual_pred))
print('Accuracy score: {}'.format(accuracy_score(y_test, actual_pred)))

In [None]:
def preprocessing_combinations(df_training, df_test):
    """Preprocess the train and test frames under every on/off mix of the 7 cleaning steps.

    For each of the 2**7 flag combinations both frames are passed through
    ``preprocess`` with the same options. When enabled, hashtag removal uses
    mode 'only_symbol' and @-mention removal uses mode 'entire_expression'.

    Returns a list of ``(training_frame, test_frame, tag)`` tuples, where
    ``tag`` is a multi-line string describing the options that were used.
    """
    #this will be a list of tuples.
    res = []

    # Every possible arrangement of False/True among the cleaning steps.
    for flags in itertools.product([False, True], repeat=7):
        (is_lower, is_stemming, is_remove_stop_words, is_remove_url,
         is_remove_punctuation, strip_hash_symbol, strip_mentions) = flags

        is_remove_hashtags = 'only_symbol' if strip_hash_symbol else 'none'
        is_remove_ats = 'entire_expression' if strip_mentions else 'none'

        # One option set, applied identically to both frames.
        options = dict(
            should_lower=is_lower,
            should_stem=is_stemming,
            should_remove_stop_words=is_remove_stop_words,
            should_remove_url=is_remove_url,
            should_remove_punctuation=is_remove_punctuation,
            should_remove_hashtags=is_remove_hashtags,
            should_remove_ats=is_remove_ats,
        )
        current_training = preprocess(df_training, **options)
        current_test = preprocess(df_test, **options)

        current_tag = ''.join([
            '\nlower : ' + str(is_lower),
            '\nstemming : ' + str(is_stemming),
            '\nremove stop words : '+str(is_remove_stop_words),
            '\nremove urls : '+str(is_remove_url),
            '\nremove punctuation : '+str(is_remove_punctuation),
            '\nremove hashtags : '+str(is_remove_hashtags),
            '\nremove ats : '+str(is_remove_ats),
        ])

        res.append((current_training, current_test, current_tag))

    return res

In [None]:
# Positional 90/10 split of the frame that so far was only cleaned from spam and
# irrelevant records; the combinations below clean it further.
test_df = clean_df.iloc[int(0.9*len(clean_df)):]
training_df = clean_df.iloc[:int(0.9*len(clean_df))]

combinations = preprocessing_combinations(training_df,test_df)


In [None]:
# Train and evaluate one bag-of-words network per preprocessing combination.
# Each result list receives (score, details) tuples, where `details` is the
# full textual report of that run; the ranking cell below sorts on the score.
i=0
dl_exact_accuracies = []
dl_precisions = []
dl_recalls = []
dl_f_measures = []
N_WORDS = 10000
gender_to_int = {'female':0,'male':1,'brand':2}
int_to_gender = {0: 'female',1:'male',2:'brand'}
# 1-based counter so the progress line reads "1/N" ... "N/N".
for i, combination in enumerate(combinations, start=1):
    x_train_df, x_test_df, tag = combination

    # Training labels are one-hot encoded; test labels stay as class ids for sklearn.
    y_train = [gender_to_int[gender] for gender in x_train_df['gender']]
    y_train = keras.utils.to_categorical(y_train, 3)
    y_test = [gender_to_int[gender] for gender in x_test_df['gender']]

    print('Training with combination number '+str(i)+'/'+str(len(combinations)))

    details = '\n\nPreprocessing parameters:\n'+tag

    # Start the clock before feature extraction so the reported
    # "Feature extraction + training time" covers both stages.
    t0 = time()

    # The keras tokenizer should do close to nothing in terms of tokenization.
    # lower=False is required for that: the Tokenizer lowercases by default,
    # which would make the 'lower' preprocessing flag under test meaningless.
    keras_tokenizer = kpt.Tokenizer(filters='', lower=False, split=' ', num_words=N_WORDS)

    texts_for_keras_tokenizer = [' '.join(tokenizer.tokenize(text)) for text in x_train_df['text_desc']]
    keras_tokenizer.fit_on_texts(texts_for_keras_tokenizer)
    x_train = keras_tokenizer.texts_to_matrix(texts_for_keras_tokenizer, mode='binary')

    #todo optimize this
    model = Sequential()#initialize the NN
    #a layer that gets N_WORDS inputs (the BOW representation of any tweet) and outputs 512 values
    model.add(Dense(512, input_shape=(N_WORDS,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    # a distribution over all 3 genders (probability of the input belonging to each of them)
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy',  optimizer='adam',  metrics=['accuracy'])

    model.fit(x_train, y_train,  batch_size=32,  epochs=7,  verbose=1,  validation_split=0.1,  shuffle=True)
    train_time = time() - t0
    details+="\nFeature extraction + training time: %0.3fs" % train_time

    # Test time covers encoding the test texts and predicting.
    t0 = time()
    texts_for_keras_tokenizer = [' '.join(tokenizer.tokenize(text)) for text in x_test_df['text_desc']]
    x_test = keras_tokenizer.texts_to_matrix(texts_for_keras_tokenizer, mode='binary')

    keras_pred = model.predict(x_test)
    # The predicted class is the index of the largest output value.
    actual_pred = [np.argmax(values) for values in keras_pred]

    test_time = time() - t0
    details+="\nTest time:  %0.3fs" % test_time

    exact_accuracy = metrics.accuracy_score(y_test, actual_pred)
    details+="\n\nAccuracy:   %0.3f" % exact_accuracy

    # Precision / recall / F1 under each averaging scheme, reported in a
    # fixed order; the per-metric values are kept to rank runs by their best one.
    precision_values = []
    recall_values = []
    f_measure_values = []
    for heading, average in (('\nMicro-averaged metrics: ', 'micro'),
                             ('\nMacro-averaged metrics: ', 'macro'),
                             ('\nWeighted-average metrics: ', 'weighted')):
        details+=heading

        precision = metrics.precision_score(y_test, actual_pred, average=average)
        details+="\nPrecision:   %0.3f" % precision
        precision_values.append(precision)

        recall = metrics.recall_score(y_test, actual_pred, average=average)
        details+="\nRecall:   %0.3f" % recall
        recall_values.append(recall)

        f_measure = metrics.f1_score(y_test, actual_pred, average=average)
        details+="\nF_measure:   %0.3f" % f_measure
        f_measure_values.append(f_measure)

    dl_exact_accuracies.append((exact_accuracy,details))
    dl_precisions.append((max(precision_values),details))
    dl_recalls.append((max(recall_values),details))
    dl_f_measures.append((max(f_measure_values),details))

In [None]:
def _score(entry):
    # Each entry is a (score, details) tuple; rank on the score only.
    return entry[0]

# Ascending sort, then keep the last ten entries: the ten highest scores, best one last.
sorted_accuracies = sorted(dl_exact_accuracies, key=_score)[-10:]
sorted_precisions = sorted(dl_precisions, key=_score)[-10:]
sorted_recalls = sorted(dl_recalls, key=_score)[-10:]
sorted_f_measures = sorted(dl_f_measures, key=_score)[-10:]

# Print one banner per metric followed by the report of each selected run.
for heading, ranked in (('Top 10 accuracies: ', sorted_accuracies),
                        ('Top 10 precisions: ', sorted_precisions),
                        ('Top 10 recalls: ', sorted_recalls),
                        ('Top 10 f-measures: ', sorted_f_measures)):
    print('-'*8 + heading +'-'*8)
    for value,details in ranked:
        print(details)
