# Requirements

In [1]:
# Required libraries.
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import spacy
import string
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings("ignore")
from spacy import tokenizer
from spacy.lang.en import English
nlp = English()
tkz = tokenizer.Tokenizer(nlp.vocab)
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import re
from bs4 import BeautifulSoup
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import unicodedata
import pickle

# The Model(s)

In [2]:
# Location of the pickled DataFrame that holds the posts and their embeddings.
# NOTE(review): this is an absolute, machine-specific path -- point it at your
# own copy of the file before running the notebook on another machine.
EMBEDDING_PATH = '/Users/simonefacchiano/Desktop/Data Science/SL/Project/embedding_100k.pkl'

# NOTE(security): unpickling can execute arbitrary code; only load pickle
# files that come from a trusted source.
embedding = pd.read_pickle(EMBEDDING_PATH)

# Keep only the columns used below. The explicit .copy() makes `embedding` an
# independent frame, so later column assignments do not act on a slice of the
# loaded data (which is what triggers pandas' SettingWithCopyWarning).
embedding = embedding[['post', 'age', 'gender', 'new_length', 'embedding']].copy()

### Fix the labels for the model

In [3]:
# Notice: there is a slight class imbalance here, because we are working with the *first* 100k rows of the original dataset. Keep this in mind.

# Bucket the author's age into three ordinal classes. pd.cut uses right-closed
# intervals by default, so the mapping is:
#   (12, 18] -> 0, (18, 28] -> 1, (28, 50] -> 2
AGE_BINS = [12, 18, 28, 50]
AGE_LABELS = [0, 1, 2]

age_class = pd.cut(embedding["age"], bins=AGE_BINS, labels=AGE_LABELS)

# An age outside (12, 50] (or a missing age) falls in no bin and becomes NaN,
# which would make the integer cast below fail with an opaque error. Fail
# early with an explicit message instead.
n_unbinned = int(age_class.isna().sum())
if n_unbinned:
    raise ValueError(
        f"{n_unbinned} rows have a missing age or an age outside (12, 50]; "
        "adjust AGE_BINS before building 'age_class'."
    )

# assign() returns a new frame with the extra column appended, instead of
# writing into a frame that may itself be a slice of the loaded data.
embedding = embedding.assign(age_class=age_class.astype("int"))

In [4]:
# Encode the gender label as a binary target: 'female' -> 0, 'male' -> 1.
# Any value that is not one of the two keys is mapped to NaN by Series.map.
embedding = embedding.assign(
    gender=embedding['gender'].map({'male': 1, 'female': 0})
)

In [5]:
# Keep only the columns listed here, arranged in this order so that the
# frame is easier to read when displayed.
embedding = embedding.loc[:, ['post', 'embedding', 'new_length', 'gender', 'age_class']]

In [6]:
# Preview the first five rows of the prepared frame.
embedding.head(n=5)

Unnamed: 0,post,embedding,new_length,gender,age_class
0,ooh shiny new commenting,"[-0.18770814, -0.13998552, 0.019073976, 0.0225...",4,0,0
1,today parade suked wasnt bad band year battle ...,"[-0.110725485, 0.043310776, 0.020091565, -0.06...",23,1,0
2,know anymore concerned everyday want bold face...,"[-0.08226248, -0.03889151, 0.029810807, 0.0900...",38,0,1
3,roof sunset posted paul,"[-0.046943568, 0.13227944, 0.02199171, 0.00461...",4,1,1
4,god love nanny absolutely greatest woman earth...,"[-0.04256986, -0.072457716, 0.005980614, 0.014...",279,0,1


## Predicting the Gender of the author

In [7]:
# Features and labels for the gender task: X holds one embedding per post,
# y holds the matching encoded gender label.
X, y = (embedding[column].tolist() for column in ('embedding', 'gender'))

In [8]:
# Create the train & validation sets.
# Every fine-tuning decision is driven by the validation set.
# NOTE(review): no separate test set is created in this cell; the final
# assessment of the model needs a held-out test split created elsewhere.

from sklearn.model_selection import train_test_split

# Stratify on y so both splits keep the same class proportions, and fix the
# seed so the split (and every score computed from it) is reproducible after
# a kernel restart.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f'The train set has {len(X_train)} rows, while the validation set contains {len(X_val)} rows: 80% vs 20%.')

The train set has 80000 rows, while the validation set contains 20000 rows: 80% vs 20%.


In [9]:
# Baseline: a plain Logistic Regression with default hyper-parameters,
# fitted on the training split.

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LogisticRegression().fit(X_train, y_train)
lr

LogisticRegression()

In [10]:
from sklearn import metrics

# Predict the labels of the validation split with the fitted baseline.
predicted = lr.predict(X_val)

# Report accuracy, precision and recall on the validation split, one per line.
print(
    f'Logistic Regression Accuracy : {metrics.accuracy_score(y_val, predicted)}',
    f'Logistic Regression Precision: {metrics.precision_score(y_val, predicted)}',
    f'Logistic Regression Recall   : {metrics.recall_score(y_val, predicted)}',
    sep='\n',
)

####

Logistic Regression Accuracy : 0.645
Logistic Regression Precision: 0.6502995188058529
Logistic Regression Recall   : 0.6517075091034347


## Comparing different models

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import StratifiedKFold, cross_validate

# Candidate classifiers to compare, each created with a fixed seed.
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42),
    LinearSVC(random_state=42),
    LogisticRegression(random_state=42),
]

# Shuffled 5-fold cross-validation. With StratifiedKFold, the folds are made
# by preserving the percentage of samples for each class.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Metrics computed on every fold. Their order matches the order of the
# numeric placeholders in the report line printed below.
scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro']

# Cross-validate each model on the training split and print the mean of each
# metric over the folds, as a percentage.
for model in tqdm(models):
    model_name = model.__class__.__name__
    result = cross_validate(model, X_train, y_train, cv=kf, scoring=scoring)
    mean_percentages = tuple(
        result[f'test_{metric}'].mean() * 100 for metric in scoring
    )
    print("%s: Mean Accuracy = %.2f%%; Mean F1-macro = %.2f%%; Mean recall-macro = %.2f%%; Mean precision-macro = %.2f%%"
          % ((model_name,) + mean_percentages))

 33%|███▎      | 1/3 [03:15<06:31, 195.59s/it]

RandomForestClassifier: Mean Accuracy = 61.73%; Mean F1-macro = 61.65%; Mean recall-macro = 61.67%; Mean precision-macro = 61.74%


 67%|██████▋   | 2/3 [03:42<01:36, 96.22s/it] 

LinearSVC: Mean Accuracy = 64.03%; Mean F1-macro = 64.02%; Mean recall-macro = 64.03%; Mean precision-macro = 64.03%


100%|██████████| 3/3 [04:06<00:00, 82.03s/it]

LogisticRegression: Mean Accuracy = 63.95%; Mean F1-macro = 63.95%; Mean recall-macro = 63.95%; Mean precision-macro = 63.95%



