In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn import model_selection, preprocessing, linear_model, metrics, naive_bayes, svm
from sklearn import decomposition, ensemble

from sklearn.decomposition import TruncatedSVD, PCA

import textblob, string, xgboost

from os import listdir
from os.path import isfile, join

from collections import Counter

import numpy as np

import matplotlib.pyplot as plt

from wordcloud import WordCloud
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import gensim
from gensim.models import Word2Vec

  from numpy.core.umath_tests import inner1d


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pvashisth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
pos_path = "C:\\Users\\pvashisth\\Desktop\\ai_course\\Git\\twitter-gender-classification\\src\\Data\\pos"
neg_path = "C:\\Users\\pvashisth\\Desktop\\ai_course\\Git\\twitter-gender-classification\\src\\Data\\neg"

In [3]:

data = [] # text feature
target = [] #labels to predict ml result

In [4]:
pos_data = [] # male
neg_data = [] # female

In [5]:
for file in listdir(pos_path):
    with open(join(pos_path, file), 'r', encoding='utf-8') as r:
        d = r.read()
        data.append(d)
        target.append(1)
        
        pos_data.append(d)

In [6]:
len(data), len(target), len(pos_data)

(9762, 9762, 9762)

In [7]:
for file in listdir(neg_path):
    with open(join(neg_path, file), 'r', encoding='utf-8') as r:
        d = r.read()
        data.append(d)
        target.append(0)
        
        neg_data.append(d)

In [8]:
len(data), len(target), len(neg_data)

(21021, 21021, 11259)

In [9]:
Counter(target)

Counter({1: 9762, 0: 11259})

# Machine learning - Base line (TFIDF)

In [77]:
X_train, X_test, Y_train, Y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [103]:
len(X_train)

16816

In [78]:
vect = TfidfVectorizer(stop_words='english')

In [79]:
tokens = vect.fit_transform(X_train)

In [80]:
tokens.shape

(16816, 12778)

## Logistic Regression (LR)

In [81]:
log_model = LogisticRegression(solver='sag', max_iter=1000)

In [82]:
log_model.fit(tokens, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False)

In [83]:
test_tokens = vect.transform(X_test)

In [105]:
test_tokens.shape

(4205, 12778)

In [104]:
len(X_test)

4205

In [84]:
predictions = log_model.predict(test_tokens)

In [85]:
# precision but we have taken the accuracy measures becuase dataset is balanced
precision_recall_fscore_support(Y_test, predictions)

(array([0.55049355, 0.51304901]),
 array([0.65462754, 0.40502513]),
 array([0.59806146, 0.45268183]),
 array([2215, 1990], dtype=int64))

In [86]:
accuracy_score(Y_test, predictions)

0.5365041617122474

## Multi Layer Perceptron (MLP) - Neural Network

In [20]:
mlp_model = MLPClassifier(max_iter=100)

In [21]:
mlp_model.fit(tokens, Y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [22]:
predictions = mlp_model.predict(test_tokens)

In [23]:
precision_recall_fscore_support(Y_test, predictions)

(array([0.51427212, 0.46077033]),
 array([0.48803612, 0.48693467]),
 array([0.50081075, 0.47349133]),
 array([2215, 1990], dtype=int64))

In [24]:
accuracy_score(Y_test, predictions)

0.4875148632580262

## Support Vector Machine (SVM)

In [25]:
svm_model = SVC()

In [26]:
svm_model.fit(tokens, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [27]:
# x_test = test_tokens, x_train = tokens

predictions = svm_model.predict(test_tokens)

In [28]:
precision_recall_fscore_support(Y_test, predictions)

  'precision', 'predicted', average, warn_for)


(array([0.52675386, 0.        ]),
 array([1., 0.]),
 array([0.69003115, 0.        ]),
 array([2215, 1990], dtype=int64))

In [29]:
accuracy_score(Y_test, predictions)

0.5267538644470868

In [30]:
# reusable code from Analytics Vidya
#https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, batch_size=32):
    # fit the training dataset on the classifier
    if is_neural_net == False:
        classifier.fit(feature_vector_train, label)
    else:
        classifier.fit(feature_vector_train, label, batch_size=batch_size)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = [int(round(p[0])) for p in predictions]
    
    return metrics.accuracy_score(Y_test, predictions)

## Naive Bayes 

In [31]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), tokens, Y_train, test_tokens)
print("NB, WordLevel TF-IDF: ", accuracy)

NB, WordLevel TF-IDF:  0.5384066587395957


## Random Forest

In [32]:
# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), tokens, Y_train, test_tokens)
print("RF, WordLevel TF-IDF: ", accuracy)

RF, WordLevel TF-IDF:  0.47705112960761


## XG Boost

In [33]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), tokens.tocsc(), Y_train, test_tokens.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

Xgb, WordLevel TF-IDF:  0.549346016646849


  if diff:


# Word Embeddings - W2vec

In [34]:
wv = gensim.models.KeyedVectors.load_word2vec_format("C:\\Users\\pvashisth\\Downloads\\GoogleNewsVectors\\GoogleNews-vectors-negative300.bin", binary=True)
wv.init_sims(replace=True)

In [35]:
len(wv.syn0norm[wv.vocab["men"].index])

  """Entry point for launching an IPython kernel.


300

In [87]:
len(wv.vocab)

314538

In [88]:
## kd nuggets ref
#https://www.kdnuggets.com/2018/11/multi-class-text-classification-model-comparison-selection.html/2

def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        #raise Exception("All words in a sentence not present in word2vec vocabulary.")
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_sentences(wv, sentences):
    print("Total sentences: {0}".format(len(sentences)))
    
    avgd_vectors = []
    for sentence in sentences:
        avgd_vector = word_averaging(wv, sentence)
        avgd_vectors.append(avgd_vector)
        
    print("Total converted: {0}".format(len(avgd_vectors)))
    return np.vstack(avgd_vectors)

In [89]:
#https://www.kdnuggets.com/2018/11/multi-class-text-classification-model-comparison-selection.html/2

def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [90]:
X_train_tokenized = []
X_test_tokenized = []

In [91]:
for xtr in X_train:
    X_train_tokenized.append(w2v_tokenize_text(xtr))

for xte in X_test:
    X_test_tokenized.append(w2v_tokenize_text(xte))

len(X_train_tokenized), len(X_test_tokenized), len(X_train), len(X_test)

(16816, 4205, 16816, 4205)

In [92]:
X_train_tokenized[0:50]

[['Klopp',
  '``',
  'It',
  "'s",
  'good',
  'to',
  'be',
  'back',
  'asked',
  'for',
  'pretzels',
  'for',
  'dinner',
  'because',
  'you',
  'ca',
  "n't",
  'really',
  'get',
  'them',
  'in',
  'England',
  "''"],
 ['Melbourne',
  'Showtime',
  "'ve",
  'never',
  'been',
  'so',
  'exhausted',
  'before',
  'but',
  "'m",
  'still',
  'determined',
  'to',
  'be',
  'as',
  'fun',
  'and',
  'entertaining',
  'as',
  'p…'],
 ['honestly',
  'find',
  'this',
  'fine',
  'when',
  'someone',
  'does',
  'it',
  'it',
  'not',
  'like',
  'you',
  'can',
  'spend',
  'everyday',
  'talking',
  'to',
  'everyone..',
  'you',
  'need',
  'to',
  'kn…'],
 ['The',
  'only',
  'reason',
  "'ve",
  'been',
  'going',
  'out',
  'with',
  'this',
  'guy',
  'all',
  'summer',
  'is',
  'because',
  'have',
  'no',
  'idea',
  'how',
  'to',
  'operate',
  'my',
  'gas',
  'grill'],
 ['Thanks',
  'also',
  'to',
  'Dr',
  'amp',
  'for',
  'looking',
  'at',
  'the',
  'use',
  'of',

In [93]:
X_train_avg = word_averaging_sentences(wv, X_train_tokenized)
X_test_avg = word_averaging_sentences(wv, X_test_tokenized)

Total sentences: 16816


  # This is added back by InteractiveShellApp.init_path()


Total converted: 16816
Total sentences: 4205
Total converted: 4205


In [94]:
## Logistic Regression (LR)

In [95]:
logreg = LogisticRegression(solver='sag', max_iter=1000)
logreg = logreg.fit(X_train_avg, Y_train)
y_pred = logreg.predict(X_test_avg)
print('accuracy %s' % accuracy_score(y_pred, Y_test))
#print(classification_report(Y_test, y_pred, target_names=['female', 'male']))

accuracy 0.5443519619500594


In [None]:
## Support Vector Machine (SVM)

In [44]:
svm_model = SVC()

svm_model.fit(X_train_avg, Y_train)

predictions = svm_model.predict(X_test_avg)

precision_recall_fscore_support(Y_test, predictions)

accuracy_score(Y_test, predictions)

  'precision', 'predicted', average, warn_for)


0.5267538644470868

In [None]:
# 

In [49]:
# reusable code from Analytics Vidya
#https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, batch_size=32):
    # fit the training dataset on the classifier
    if is_neural_net == False:
        classifier.fit(feature_vector_train, label)
    else:
        classifier.fit(feature_vector_train, label, batch_size=batch_size)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = [int(round(p[0])) for p in predictions]
    
    return metrics.accuracy_score(Y_test, predictions)

In [None]:
# Random Forest (RF)

In [50]:

# RF on Word Level on Word embedding (W2Vec)
accuracy = train_model(ensemble.RandomForestClassifier(), X_train_avg, Y_train, X_test_avg)
print("RF, WordLevel TF-IDF: ", accuracy)


RF, WordLevel TF-IDF:  0.47728894173602854


In [None]:
# XG Boost

In [51]:
# Extereme Gradient Boosting on Word embedding (W2Vec)
accuracy = train_model(xgboost.XGBClassifier(), X_train_avg, Y_train, X_test_avg)
print("Xgb, WordLevel TF-IDF: ", accuracy)

Xgb, WordLevel TF-IDF:  0.5538644470868014


  if diff:


# Word Embeddings - Glove

In [52]:
wv = gensim.models.KeyedVectors.load_word2vec_format("C:\\Users\\pvashisth\\Downloads\\glove-twitter-200.gz", binary=True, encoding='latin-1')
wv.init_sims(replace=True)

  self.vectors[i, :] /= sqrt((self.vectors[i, :] ** 2).sum(-1))
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [55]:
len(wv.vocab)

314538

In [56]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        #raise Exception("All words in a sentence not present in word2vec vocabulary.")
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_sentences(wv, sentences):
    print("Total sentences: {0}".format(len(sentences)))
    
    avgd_vectors = []
    for sentence in sentences:
        avgd_vector = word_averaging(wv, sentence)
        avgd_vectors.append(avgd_vector)
        
    print("Total converted: {0}".format(len(avgd_vectors)))
    return np.vstack(avgd_vectors)

In [57]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [64]:
len(wv.syn0norm[wv.vocab["man"].index])

  """Entry point for launching an IPython kernel.


200

In [65]:
X_train_tokenized = []
X_test_tokenized = []

In [66]:
for xtr in X_train:
    X_train_tokenized.append(w2v_tokenize_text(xtr))

for xte in X_test:
    X_test_tokenized.append(w2v_tokenize_text(xte))

len(X_train_tokenized), len(X_test_tokenized), len(X_train), len(X_test)

(16816, 4205, 16816, 4205)

In [67]:
X_train_tokenized[0:50]

[['Klopp',
  '``',
  'It',
  "'s",
  'good',
  'to',
  'be',
  'back',
  'asked',
  'for',
  'pretzels',
  'for',
  'dinner',
  'because',
  'you',
  'ca',
  "n't",
  'really',
  'get',
  'them',
  'in',
  'England',
  "''"],
 ['Melbourne',
  'Showtime',
  "'ve",
  'never',
  'been',
  'so',
  'exhausted',
  'before',
  'but',
  "'m",
  'still',
  'determined',
  'to',
  'be',
  'as',
  'fun',
  'and',
  'entertaining',
  'as',
  'p…'],
 ['honestly',
  'find',
  'this',
  'fine',
  'when',
  'someone',
  'does',
  'it',
  'it',
  'not',
  'like',
  'you',
  'can',
  'spend',
  'everyday',
  'talking',
  'to',
  'everyone..',
  'you',
  'need',
  'to',
  'kn…'],
 ['The',
  'only',
  'reason',
  "'ve",
  'been',
  'going',
  'out',
  'with',
  'this',
  'guy',
  'all',
  'summer',
  'is',
  'because',
  'have',
  'no',
  'idea',
  'how',
  'to',
  'operate',
  'my',
  'gas',
  'grill'],
 ['Thanks',
  'also',
  'to',
  'Dr',
  'amp',
  'for',
  'looking',
  'at',
  'the',
  'use',
  'of',

In [68]:
X_train_avg = word_averaging_sentences(wv, X_train_tokenized)
X_test_avg = word_averaging_sentences(wv, X_test_tokenized)

Total sentences: 16816


  


Total converted: 16816
Total sentences: 4205
Total converted: 4205


In [None]:
## Logistic Regression (LR)

In [69]:
logreg = LogisticRegression(solver='sag', max_iter=1000)
logreg = logreg.fit(X_train_avg, Y_train)
y_pred = logreg.predict(X_test_avg)
print('accuracy %s' % accuracy_score(y_pred, Y_test))
#print(classification_report(Y_test, y_pred, target_names=['female', 'male']))

accuracy 0.5443519619500594


In [None]:
## Support Vector Machine (SVM)

In [70]:
svm_model = SVC()

svm_model.fit(X_train_avg, Y_train)

predictions = svm_model.predict(X_test_avg)

precision_recall_fscore_support(Y_test, predictions)

accuracy_score(Y_test, predictions)

  'precision', 'predicted', average, warn_for)


0.5267538644470868

In [None]:
### Reusable code from Analytics Vidya

In [1]:
#https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, batch_size=32):
    # fit the training dataset on the classifier
    if is_neural_net == False:
        classifier.fit(feature_vector_train, label)
    else:
        classifier.fit(feature_vector_train, label, batch_size=batch_size)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = [int(round(p[0])) for p in predictions]
    
    return metrics.accuracy_score(Y_test, predictions)

In [None]:
### Random Forest on Word Level TF IDF Vectors

In [73]:
accuracy = train_model(ensemble.RandomForestClassifier(), X_train_avg, Y_train, X_test_avg)
print("RF, WordLevel TF-IDF: ", accuracy)


RF, WordLevel TF-IDF:  0.48466111771700354


In [74]:
# Extereme Gradient Boosting on Word Level TF IDF Vectors

In [75]:
accuracy = train_model(xgboost.XGBClassifier(), X_train_avg, Y_train, X_test_avg)
print("Xgb, WordLevel TF-IDF: ", accuracy)

Xgb, WordLevel TF-IDF:  0.5239001189060642


  if diff:


# END