# SVM

By 

A. Ntoumi & A. Steger (University of Groningen, Language Technology Project 2019-20)

In [None]:
## Importing libraries

In [2]:
import pandas as pd
import os, re, string
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from pprint import pprint

from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, hamming_loss, precision_score, recall_score
from sklearn import svm


In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stemmer = SnowballStemmer("dutch")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stegerakos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stegerakos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Reading the Authors data

In [4]:
# set column names, since they're not setted in the txt file
column_names = ["user_id", "dob", "gender", "polarity", "city", "country", "personality", "other"]

In [5]:
authors = pd.read_csv("csicorpus/List.CSI.AuthorData.1.4.0.BV.2016-02-08.txt", sep="\t", names=column_names)

In [6]:
authors.head()

Unnamed: 0,user_id,dob,gender,polarity,city,country,personality,other
0,60235486,1990,Female,,Antwerpen,Belgium,----,---
1,43931362,1991,Female,,Antwerpen,Belgium,----,---
2,11987873,1988-17-04,Male,Straight,Antwerpen,Belgium,93-30-53-32-22,---
3,98696422,1986-26-07,Male,Straight,Antwerpen,Belgium,90-47-48-22-37,I44-N50-T01-J11
4,36964375,1990,Female,,Antwerpen,Belgium,----,---


In [7]:
## unify the authors datafarame so that we got no duplicate
authors.drop_duplicates(subset='user_id', keep="first", inplace=True)

## Reading the essays data

In [8]:
def extract_user_id(filename):
    """ 
    This is a function that extracts the user_id from a given filename 
    
    Parameters :
    - filename (string) : # 59260694_Essay_2014-06-01.txt
    
    Returns:
    - user_id (string) : 5926069
    """
    return filename.split("_")[0]

In [9]:
extract_user_id("59260694_Essay_2014-06-01.txt")

'59260694'

### Building the essay dataframe

In [10]:
# set the essays path
path = "./csicorpus/essays/"

In [11]:
# set a new empty data list 
data_list = []

#loop over the list of txt files to read their data
for filename in os.listdir(path):
    if filename != ".ipynb_checkpoints":
        # read the txt file inside
        data = open(path+filename, "r").read()
        data_list.append(
            {
                'user_id' : extract_user_id(filename),
                'filename': filename,
                'essay': data
            }
        )

In [12]:
# Build the final dataframe
essays_df = pd.DataFrame(data_list)

In [13]:
# check the built-in dataframe
essays_df.head()

Unnamed: 0,essay,filename,user_id
0,Het orgaandonorschap: een dualistische visie\n...,92419385_Essay_2015-01-06.txt,92419385
1,"Iedereen automatisch geregistreerd als donor, ...",89353110_Essay_2015-01-06.txt,89353110
2,Gezond leven: het middel om langer te leven?\n...,30423159_Essay_2012.txt,30423159
3,﻿Inleiding\nWe kunnen er niet omheen dat het E...,95257163_Paper_2014-06-01.txt,95257163
4,Enkele jaren geleden was men er zeker van dat ...,34322552_Essay_2013-06-01.txt,34322552


In [14]:
# get some insight from the resulted dataframe
essays_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 3 columns):
essay       517 non-null object
filename    517 non-null object
user_id     517 non-null object
dtypes: object(3)
memory usage: 12.2+ KB


As we cen see from the information above, we have exactly around 517 observations, without null values or missed ones.

# Join all the two dataframes

Now that we have extracted all the two dataframes, we proceed to join them by `user_id`, so that for each `user_id` we will keep only their `essays` and `personality`.

### Join between essays and reviews

In [15]:
# convert the user_id type from int to object, so that we can join them between the same columns type 
authors.user_id = authors.user_id.astype(str)

In [18]:
# join Authors and df_inner
df = pd.merge(essays_df[["user_id", "essay"]], authors[["user_id", "personality"]], on='user_id', how='inner')

In [19]:
df.head()

Unnamed: 0,user_id,essay,personality
0,98634123,Voetbal is internationaal een zeer geliefde sp...,24-30-53-74-80
1,98634123,Kan een kind met cochleair implantaat zijn taa...,24-30-53-74-80
2,98634123,Om meer vrouwen aan de top van het bedrijfslev...,24-30-53-74-80
3,81842160,"""Iedereen automatisch geregistreerd als donor,...",41-79-53-17-84
4,81842160,Opgelet: mama en papa lezen mee\nTegenwoordig ...,41-79-53-17-84


## Build the final raw dataframe

In [20]:
# Adding two new columns to the existing dataframe. 
# by default splitting is done on the basis of single space
df[['Openness','Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']] = df.personality.str.split(pat="-", expand=True)

In [21]:
df.head(6)

Unnamed: 0,user_id,essay,personality,Openness,Conscientiousness,Extroversion,Agreeableness,Neuroticism
0,98634123,Voetbal is internationaal een zeer geliefde sp...,24-30-53-74-80,24.0,30.0,53.0,74.0,80.0
1,98634123,Kan een kind met cochleair implantaat zijn taa...,24-30-53-74-80,24.0,30.0,53.0,74.0,80.0
2,98634123,Om meer vrouwen aan de top van het bedrijfslev...,24-30-53-74-80,24.0,30.0,53.0,74.0,80.0
3,81842160,"""Iedereen automatisch geregistreerd als donor,...",41-79-53-17-84,41.0,79.0,53.0,17.0,84.0
4,81842160,Opgelet: mama en papa lezen mee\nTegenwoordig ...,41-79-53-17-84,41.0,79.0,53.0,17.0,84.0
5,59260694,Beperkt de vrijheid ons of maken beperkingen o...,----,,,,,


In [22]:
## remove rows where personality is empty
df = df[df.personality != "----" ]

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 0 to 474
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   user_id            470 non-null    object
 1   essay              470 non-null    object
 2   personality        470 non-null    object
 3   Openness           470 non-null    object
 4   Conscientiousness  470 non-null    object
 5   Extroversion       470 non-null    object
 6   Agreeableness      470 non-null    object
 7   Neuroticism        470 non-null    object
dtypes: object(8)
memory usage: 33.0+ KB


Now that we have the final datafarame within `essays` and the user personality scores, let's move forward to do feature engineering and Data modeling.

## Text data preprocessing

In [24]:
# get stopword list
stoplist = stopwords.words('dutch') 
# get list of punctuations
punctuations = string.punctuation + "’¶•@°©®™"

In [25]:
def preprocess_text(text):
    """
    This function preprocesses a given raw text by removing the urls, mentions,
    punctuations, stop words, numbers, emojis etc.
    
    @param text string
    @return text string
    """
        
    # string to lowercase
    txt = text.lower()
    
    # keep only ascii characters
    txt = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", txt)
    
    # punctuation removal and map it to space
    translator = str.maketrans(punctuations, " "*len(punctuations))
    s = txt.translate(translator)
    
    # remove digits 
    no_digits = ''.join([i for i in s if not i.isdigit()])
    cleaner = " ".join(no_digits.split())
    
    # tokenize words and removing stop words 
    word_tokens = word_tokenize(cleaner)
    filtered_sentence = [w for w in word_tokens if not w in stoplist]
    filtered_sentence = " ".join(filtered_sentence)
    
    # a stemming word block
    filtered_sentence = [stemmer.stem(word) for word in word_tokenize(filtered_sentence)]
    filtered_sentence = " ".join(filtered_sentence)
    
    return filtered_sentence

In [26]:
# make sure we are dealing with textual data
df.essay = df.essay.astype(str)
# apply the preprocessing function to clean up the text data
df["clean_essay"] = df.essay.apply(preprocess_text)

In [27]:
## Checking data after text preprocessing
df.sample(5)

Unnamed: 0,user_id,essay,personality,Openness,Conscientiousness,Extroversion,Agreeableness,Neuroticism,clean_essay
125,22966612,1.\tINLEIDING\nDe geschiedschrijving van de st...,84-79-22-69-43,84,79,22,69,43,inleid geschiedschrijv stedelijk nederland lan...
334,99317183,VROUW BREEKT NIET DOOR GLAZEN PLAFOND\n\n Rece...,35-97-97-38-4,35,97,97,38,4,vrouw breekt glaz plafond recent onderzoek ku ...
403,89263341,"Toon je groot hart, word donor (Kato De Maerte...",76-13-55-3-43,76,13,55,3,43,ton grot hart word donor kato maertelaer stel ...
469,35362991,"Geluk maak je zelf, dus stop met dat geklaag.\...",65-10-3-3-84,65,10,3,3,84,geluk mak stop geklag jar gaf vlaming gelukk a...
390,90392198,Biofundamentalisme: een angstkreet\nBoodschapp...,84-74-42-57-80,84,74,42,57,80,biofundamentalism angstkret boodschapp doe nat...


## Features engineering

### Label encoding of the `personality` features

In this section we will encode the personality features to a binary 0 and 1, based on a standard threshold of 0.5

In [28]:
# set a list of personalities
personalities = ['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']
# loop over them and replace values
for column in personalities:
    # convert them to int first
    df[column] = df[column].astype(int)
    # replace values less than 50 with 0
    df.loc[df[column] < 50, column] = 0
    # replace values over than 50 with 1
    df.loc[df[column] >= 50, column] = 1

In [29]:
# check the sample data again
df.sample(5)

Unnamed: 0,user_id,essay,personality,Openness,Conscientiousness,Extroversion,Agreeableness,Neuroticism,clean_essay
325,83325638,werkloosheidsuitkeringen stoppen is een onmen...,35-17-42-69-0,0,0,0,1,0,werkloosheidsuitker stopp onmens straf nva bep...
212,30158895,Geluk en de Belgen\n\nTegenwooordig zullen mee...,53-52-74-57-90,1,1,1,1,1,geluk belg tegenwooord zull mens bekenn ongelu...
448,78258514,De laatste jaren is het voor de gemiddelde men...,47-3-12-22-7,0,0,0,0,0,laatst jar gemiddeld men sted makkelijker gewo...
145,62557275,Werkloosheidsuitkeringen moeten beperkt worden...,59-6-79-38-55,1,0,1,0,1,werkloosheidsuitker moet beperkt tijd laatst m...
411,21002950,Werkloosheidsuitkeringen moeten beperkt worden...,84-47-70-8-80,1,0,1,0,1,werkloosheidsuitker moet beperkt tijd werkloos...


In [31]:
### Saving the data for next time use cases (LSTM)
df.to_csv("./data/clean_data.csv", index=False)

## Split data in train & test

In this section we will split the data into training and test set, to build our SVM model that will predict the participant's personality

In [31]:
# split the dataframe into train and test
training, test = train_test_split(df, test_size=0.20, random_state=42)

In [32]:
print("We have {} obseravtion in training set, and {} for test set".format(len(training), len(test)))

We have 376 obseravtion in training set, and 94 for test set


In [33]:
# preparing the training splits
train_x = [x for x in training.clean_essay]

train_y_EXT = [x for x in training.Extroversion]
train_y_NEU = [x for x in training.Neuroticism]
train_y_AGR = [x for x in training.Agreeableness]
train_y_CON = [x for x in training.Conscientiousness]
train_y_OPN = [x for x in training.Openness]

# preparing the test splits
test_x = [x for x in test.clean_essay]

test_y_EXT = [x for x in test.Extroversion]
test_y_NEU = [x for x in test.Neuroticism]
test_y_AGR = [x for x in test.Agreeableness]
test_y_CON = [x for x in test.Conscientiousness]
test_y_OPN = [x for x in test.Openness]

## TF-IDF vectorizer

In [34]:
# instantiate a new tfidf vectorizer object
tfidf__vectorizer = TfidfVectorizer()

# transform our corpus to a tfidf matrix
train_x_vectors = tfidf__vectorizer.fit_transform(train_x)
test_x_vectors = tfidf__vectorizer.transform(test_x)

First we need to save the tf-idf vocabulary so we can use it next time with any given document, without needing to load the data at each time and transform it to a tf-idf matrix so we can map our text to it.

In [36]:
# Save the tfidf vocabulary into a pickle file, so wan use it next time
# to execute only once, after it's saved you can load it directly
#pickle.dump(tfidf__vectorizer.vocabulary_, open("./tfidf_vocab/data_vocab.pkl", "wb"))

# SVM with TF-IDF

Before training our model, first we need to do some hyperparameter tuning in order to extract the best parameters to maximize the accuracy

In [38]:
svm_params = {
    'C': [1,2,3,4,5,6,7,8,9,10,11,12], 
    'kernel': ['linear','rbf']
    }

### 1. Extroversion classification

In [40]:
%%time

# get an instance of the SVM model
EXT_model = svm.SVC()

# Making models with hyper parameters sets
CV_svc = GridSearchCV(EXT_model, param_grid=svm_params, n_jobs=-1, cv=5, scoring="f1_micro")

# fitting the model
CV_svc.fit(train_x_vectors, train_y_EXT)

CPU times: user 1 s, sys: 292 ms, total: 1.3 s
Wall time: 4.26 s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [41]:
# The best hyperparameter set
print("Best Hyper Parameters is : {}".format(CV_svc.best_params_))
print("Best f1-score is : {} ".format(CV_svc.best_score_))

Best Hyper Parameters is : {'C': 2, 'kernel': 'rbf'}
Best f1-score is : 0.5583859649122808 


### Training the SVM model based on these best Hyperparameters

In [42]:
print("Training Extraversion EXT model with SVM and tf-idf ...")

clf_svm_EXT = CV_svc.best_estimator_
clf_svm_EXT.fit(train_x_vectors, train_y_EXT)

print("Extroversion detection accuracy score: ", clf_svm_EXT.score(test_x_vectors, test_y_EXT))

Training Extraversion EXT model with SVM and tf-idf ...
Extroversion detection accuracy score:  0.5319148936170213


### Micro-average quality metrics

In [43]:
# get the predictions over the test observations
y_pred = clf_svm_EXT.predict(test_x_vectors)

print("F1-micro : {:.4f}".format(f1_score(test_y_EXT, y_pred, average='micro')))
print("Hamming loss : {:.4f}".format(hamming_loss(test_y_EXT,y_pred)))
print("Precision : {:.4f}".format(precision_score(test_y_EXT, y_pred, average='micro')))
print("Recall : {:.4f}".format(recall_score(test_y_EXT, y_pred, average='micro')))

F1-micro : 0.5319
Hamming loss : 0.4681
Precision : 0.5319
Recall : 0.5319


### Classification report

In [44]:
# Classification report of the model
print(classification_report(test_y_EXT, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.21      0.29        42
           1       0.55      0.79      0.65        52

    accuracy                           0.53        94
   macro avg       0.50      0.50      0.47        94
weighted avg       0.51      0.53      0.49        94



In [45]:
### Saving the model for next time predictions
model_path = "./models/EXT_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_EXT, file)

### 2. Neuroticism classification

In [46]:
%%time

# get an instance of the SVM model
NEU_model = svm.SVC()

# Making models with hyperparameter set
CV_svc = GridSearchCV(NEU_model, param_grid=svm_params, n_jobs=-1, cv=5, scoring="f1_micro")

# fitting the model
CV_svc.fit(train_x_vectors, train_y_NEU)

CPU times: user 1.02 s, sys: 268 ms, total: 1.28 s
Wall time: 4.26 s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [47]:
# The best hyperparameter set
print("Best Hyper Parameters is : {}".format(CV_svc.best_params_))
print("Best f1-score is : {} ".format(CV_svc.best_score_))

Best Hyper Parameters is : {'C': 5, 'kernel': 'linear'}
Best f1-score is : 0.5025964912280702 


### Training the SVM model based on these best Hyperparameters

In [48]:
print("Training Neuroticism NEU model with SVM and tf-idf ...")

clf_svm_NEU = CV_svc.best_estimator_
clf_svm_NEU.fit(train_x_vectors, train_y_NEU)

print("Neuroticism detection accuracy score: ", clf_svm_NEU.score(test_x_vectors, test_y_NEU))

Training Neuroticism NEU model with SVM and tf-idf ...
Neuroticism detection accuracy score:  0.5106382978723404


### Micro-average quality metrics

In [49]:
# get the predictions over the test observations
y_pred = clf_svm_NEU.predict(test_x_vectors)

print("F1-micro : {:.4f}".format(f1_score(test_y_NEU, y_pred, average='micro')))
print("Hamming loss : {:.4f}".format(hamming_loss(test_y_NEU,y_pred)))
print("Precision : {:.4f}".format(precision_score(test_y_NEU, y_pred, average='micro')))
print("Recall : {:.4f}".format(recall_score(test_y_NEU, y_pred, average='micro')))

F1-micro : 0.5106
Hamming loss : 0.4894
Precision : 0.5106
Recall : 0.5106


### Classification report

In [50]:
# Classification report of the model
print(classification_report(test_y_NEU, y_pred))

              precision    recall  f1-score   support

           0       0.45      0.43      0.44        42
           1       0.56      0.58      0.57        52

    accuracy                           0.51        94
   macro avg       0.50      0.50      0.50        94
weighted avg       0.51      0.51      0.51        94



In [51]:
### Saving the model for next time predictions
model_path = "./models/NEU_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_NEU, file)

### 3. Openness classification

In [52]:
%%time

# get an instance of the SVM model
OPN_model = svm.SVC()

# Making models with hyperparameter set
CV_svc = GridSearchCV(OPN_model, param_grid=svm_params, n_jobs=-1, cv=5, scoring="f1_micro")

# fitting the model
CV_svc.fit(train_x_vectors, train_y_OPN)

CPU times: user 1.01 s, sys: 284 ms, total: 1.29 s
Wall time: 4.25 s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [53]:
# The best hyperparameter set
print("Best Hyper Parameters is : {}".format(CV_svc.best_params_))
print("Best f1-score is : {} ".format(CV_svc.best_score_))

Best Hyper Parameters is : {'C': 3, 'kernel': 'rbf'}
Best f1-score is : 0.5747017543859648 


### Training the SVM model based on these best Hyperparameters

In [54]:
print("Training Openness OPN model with SVM and tf-idf ...")

clf_svm_OPN = CV_svc.best_estimator_
clf_svm_OPN.fit(train_x_vectors, train_y_OPN)

print("Openness detection accuracy score: ", clf_svm_OPN.score(test_x_vectors, test_y_OPN))

Training Openness OPN model with SVM and tf-idf ...
Openness detection accuracy score:  0.48936170212765956


### Micro-average quality metrics

In [55]:
# get the predictions over the test observations
y_pred = clf_svm_OPN.predict(test_x_vectors)

print("F1-micro : {:.4f}".format(f1_score(test_y_OPN, y_pred, average='micro')))
print("Hamming loss : {:.4f}".format(hamming_loss(test_y_OPN,y_pred)))
print("Precision : {:.4f}".format(precision_score(test_y_OPN, y_pred, average='micro')))
print("Recall : {:.4f}".format(recall_score(test_y_OPN, y_pred, average='micro')))

F1-micro : 0.4894
Hamming loss : 0.5106
Precision : 0.4894
Recall : 0.4894


### Classification report

In [56]:
# Classification report of the model
print(classification_report(test_y_OPN, y_pred))

              precision    recall  f1-score   support

           0       0.43      0.45      0.44        42
           1       0.54      0.52      0.53        52

    accuracy                           0.49        94
   macro avg       0.49      0.49      0.49        94
weighted avg       0.49      0.49      0.49        94



In [57]:
### Saving the model for next time predictions
model_path = "./models/OPN_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_OPN, file)

### 4. Conscientiousness classification

In [58]:
%%time

# get an instance of the SVM model
CON_model = svm.SVC()

# Making models with hyperparameter set
CV_svc = GridSearchCV(CON_model, param_grid=svm_params, n_jobs=-1, cv=5, scoring="f1_micro")

# fitting the model
CV_svc.fit(train_x_vectors, train_y_CON)

CPU times: user 1.02 s, sys: 284 ms, total: 1.3 s
Wall time: 4.27 s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [59]:
# The best hyperparameter set
print("Best Hyper Parameters is : {}".format(CV_svc.best_params_))
print("Best f1-score is : {} ".format(CV_svc.best_score_))

Best Hyper Parameters is : {'C': 2, 'kernel': 'rbf'}
Best f1-score is : 0.5505263157894736 


### Training the SVM model based on these best Hyperparameters

In [60]:
print("Training Conscientiousness CON model with SVM and tf-idf ...")

clf_svm_CON = CV_svc.best_estimator_
clf_svm_CON.fit(train_x_vectors, train_y_CON)

print("Conscientiousness detection accuracy score: ", clf_svm_CON.score(test_x_vectors, test_y_CON))

Training Conscientiousness CON model with SVM and tf-idf ...
Conscientiousness detection accuracy score:  0.5212765957446809


### Micro-average quality metrics

In [61]:
# get the predictions over the test observations
y_pred = clf_svm_CON.predict(test_x_vectors)

print("F1-micro : {:.4f}".format(f1_score(test_y_CON, y_pred, average='micro')))
print("Hamming loss : {:.4f}".format(hamming_loss(test_y_CON,y_pred)))
print("Precision : {:.4f}".format(precision_score(test_y_CON, y_pred, average='micro')))
print("Recall : {:.4f}".format(recall_score(test_y_CON, y_pred, average='micro')))

F1-micro : 0.5213
Hamming loss : 0.4787
Precision : 0.5213
Recall : 0.5213


### Classification report

In [62]:
# Classification report of the model
print(classification_report(test_y_CON, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.50      0.54        52
           1       0.47      0.55      0.51        42

    accuracy                           0.52        94
   macro avg       0.52      0.52      0.52        94
weighted avg       0.53      0.52      0.52        94



In [63]:
### Saving the model for next time predictions
model_path = "./models/CON_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_CON, file)

### 5. Agreeableness classification

In [64]:
%%time

# get an instance of the SVM model
AGR_model = svm.SVC()

# Making models with hyperparameter set
CV_svc = GridSearchCV(AGR_model, param_grid=svm_params, n_jobs=-1, cv=5, scoring="f1_micro")

# fitting the model
CV_svc.fit(train_x_vectors, train_y_AGR)

CPU times: user 972 ms, sys: 344 ms, total: 1.32 s
Wall time: 4.27 s


GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [65]:
# The best hyperparameter set
print("Best Hyper Parameters is : {}".format(CV_svc.best_params_))
print("Best f1-score is : {} ".format(CV_svc.best_score_))

Best Hyper Parameters is : {'C': 1, 'kernel': 'rbf'}
Best f1-score is : 0.547859649122807 


### Training the SVM model based on these best Hyperparameters

In [66]:
print("Training Agreeableness AGR model with SVM and tf-idf ...")

clf_svm_AGR = CV_svc.best_estimator_
clf_svm_AGR.fit(train_x_vectors, train_y_AGR)

print("Agreeableness detection accuracy score: ", clf_svm_AGR.score(test_x_vectors, test_y_AGR))

Training Agreeableness AGR model with SVM and tf-idf ...
Agreeableness detection accuracy score:  0.4787234042553192


### Micro-average quality metrics

In [67]:
# get the predictions over the test observations
y_pred = clf_svm_AGR.predict(test_x_vectors)

print("F1-micro : {:.4f}".format(f1_score(test_y_AGR, y_pred, average='micro')))
print("Hamming loss : {:.4f}".format(hamming_loss(test_y_AGR,y_pred)))
print("Precision : {:.4f}".format(precision_score(test_y_AGR, y_pred, average='micro')))
print("Recall : {:.4f}".format(recall_score(test_y_AGR, y_pred, average='micro')))

F1-micro : 0.4787
Hamming loss : 0.5213
Precision : 0.4787
Recall : 0.4787


### Classification report

In [68]:
# Classification report of the model
print(classification_report(test_y_AGR, y_pred))

              precision    recall  f1-score   support

           0       0.56      0.69      0.62        58
           1       0.22      0.14      0.17        36

    accuracy                           0.48        94
   macro avg       0.39      0.41      0.39        94
weighted avg       0.43      0.48      0.45        94



In [69]:
### Saving the model for next time predictions
model_path = "./models/AGR_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_AGR, file)

## Combining the 5 built-in models

After saving the tf-idf vocabulary, we can use it next time with any given document, without needing to load the data at each time and transform it to a tf-idf matrix so we can fit our text to it.

In [70]:
# Load the vocabulary later
loaded_doc_vec = TfidfVectorizer(decode_error="replace", vocabulary=pickle.load(open("./tfidf_vocab/data_vocab.pkl", "rb")))

Now that we have saved and loaded our vocabulary, we need to build a function taht will take as parameter a raw text to preprocess it and map it to the built-in vocabulary so we can call our trained models to extract the right personalities from it.

In [71]:
def predict_five_personality_traits(essay):
    # set the constant path variables 
    path = "./models/"
    suffix = "_svm_model.pkl"
    
    # first let us load the built-in SVM models
    with open(path+"AGR"+suffix, 'rb') as file:  
        AGR_model = pickle.load(file)
    
    with open(path+"CON"+suffix, 'rb') as file:  
        CON_model = pickle.load(file)
    
    with open(path+"EXT"+suffix, 'rb') as file:  
        EXT_model = pickle.load(file)
    
    with open(path+"NEU"+suffix, 'rb') as file:  
        NEU_model = pickle.load(file)
    
    with open(path+"OPN"+suffix, 'rb') as file:  
        OPN_model = pickle.load(file)
    
    # second, we need to build the feature matrix
    transformer = TfidfTransformer()
    features = transformer.fit_transform(loaded_doc_vec.fit_transform([preprocess_text(essay)]))
    
    # now that we have the features, we will get the predictions of the 5 models
    AGR_predictions = int(AGR_model.predict(features.toarray()))
    CON_prediction = int(CON_model.predict(features.toarray()))
    EXT_prediction = int(EXT_model.predict(features.toarray()))
    NEU_prediction = int(NEU_model.predict(features.toarray()))
    OPN_prediction = int(OPN_model.predict(features.toarray()))
    
    # build the final dictionary with prediction
    final_prediction = {
        "Openness": OPN_prediction,
        "Conscientiousness": CON_prediction,
        "Extroversion": EXT_prediction,
        "Agreeableness": AGR_predictions,
        "Neuroticism": NEU_prediction
    }
    
    return final_prediction

In [78]:
# let us get a text as a sample to test on
test_text = df.essay[22]
# run predictions over that text
res = predict_five_personality_traits(test_text)

pprint(res)

{'Agreeableness': 1,
 'Conscientiousness': 0,
 'Extroversion': 1,
 'Neuroticism': 1,
 'Openness': 1}


### Above we proposed a possible solution for Big Five Personality detection in a Dutch corpus