# BUILDING SVM MODELS TO DETECT BIG FIVE PERSONALITY TRAITS IN A DUTCH CORPUS

## First, we start with importing the necessary libraries

In [7]:
import pandas as pd
import os, re, string
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from pprint import pprint

# the below line is the major difference to file "csi_svm_kp"
from nltk.stem.snowball import SnowballStemmer

from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score, hamming_loss, precision_score, recall_score
from sklearn import svm

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

stemmer = SnowballStemmer("dutch")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stegerakos/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/stegerakos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Inspecting the data

In [9]:
# We will assign names to the columns, since the data came without these
col = ["Participant", "Born", "Gender", "LGBT", "Residence", "Country", "BigFive", "MBTI"]

In [10]:
subjects = pd.read_csv("csicorpus/List.CSI.AuthorData.1.4.0.BV.2016-02-08.txt", sep = "\t", names = col)

In [11]:
subjects.head(5)

Unnamed: 0,Participant,Born,Gender,LGBT,Residence,Country,BigFive,MBTI
0,60235486,1990,Female,,Antwerpen,Belgium,----,---
1,43931362,1991,Female,,Antwerpen,Belgium,----,---
2,11987873,1988-17-04,Male,Straight,Antwerpen,Belgium,93-30-53-32-22,---
3,98696422,1986-26-07,Male,Straight,Antwerpen,Belgium,90-47-48-22-37,I44-N50-T01-J11
4,36964375,1990,Female,,Antwerpen,Belgium,----,---


In [12]:
# We eliminate duplicates
subjects.drop_duplicates(subset='Participant', keep='first', inplace=True)

In [13]:
len(subjects)

668

In [14]:
# We devise a function that extracts user ID from a file name
def user_id(filename):
    return filename.split("_")[0]

user_id("59260694_Essay_2014-06-01.txt")

'59260694'

In [15]:
# Fixing the path
path = "./csicorpus/essays/" 
# Setting up a new empty data list here, into which we will place our txt files
data_list = [] 

In [16]:
# A for-loop reads the data of the txt files

for filename in os.listdir(path):
    if filename != ".ipynb_checkpoints":
        data = open(path+filename, "r").read()
        data_list.append(
            {
            'Participant': user_id(filename),
            'filename': filename,
            'Essay': data
            }
        )

In [17]:
essays = pd.DataFrame(data_list)

essays.head()

Unnamed: 0,Participant,filename,Essay
0,92419385,92419385_Essay_2015-01-06.txt,Het orgaandonorschap: een dualistische visie\n...
1,89353110,89353110_Essay_2015-01-06.txt,"Iedereen automatisch geregistreerd als donor, ..."
2,30423159,30423159_Essay_2012.txt,Gezond leven: het middel om langer te leven?\n...
3,95257163,95257163_Paper_2014-06-01.txt,﻿Inleiding\nWe kunnen er niet omheen dat het E...
4,34322552,34322552_Essay_2013-06-01.txt,Enkele jaren geleden was men er zeker van dat ...


In [18]:
essays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Participant  517 non-null    object
 1   filename     517 non-null    object
 2   Essay        517 non-null    object
dtypes: object(3)
memory usage: 12.2+ KB


Above we can see that we succesfully extracted the essay data from the files into a dataframe

## Cleaning our data

First, we will have to merge our two dataframes: `subjects` and `essays` before going on to the preprocessing task.

In [19]:
# Making sure 'Participants' are strings
subjects.Participant = subjects.Participant.astype(str) 

In [20]:
data = pd.merge(essays[["Participant", "Essay"]], subjects[["Participant", "BigFive"]])

In [21]:
data.head()

Unnamed: 0,Participant,Essay,BigFive
0,92419385,Het orgaandonorschap: een dualistische visie\n...,84-58-95-6-27
1,89353110,"Iedereen automatisch geregistreerd als donor, ...",1-41-70-8-80
2,30423159,Gezond leven: het middel om langer te leven?\n...,1-86-4-50-27
3,30423159,Werken tot 65 jaar zorgt voor meer psychische ...,1-86-4-50-27
4,30423159,In 2012 is er een wet doorgekomen die het drag...,1-86-4-50-27


In [22]:
len(data)

475

In [23]:
data[['Openness', 'Conscientiousness', 'Extroversion', 'Agreeableness', 'Neuroticism']] = data.BigFive.str.split(pat= "-", expand = True)

In [24]:
# Eliminating NULL values
data = data[data.BigFive != "----"]

In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 470 entries, 0 to 474
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Participant        470 non-null    object
 1   Essay              470 non-null    object
 2   BigFive            470 non-null    object
 3   Openness           470 non-null    object
 4   Conscientiousness  470 non-null    object
 5   Extroversion       470 non-null    object
 6   Agreeableness      470 non-null    object
 7   Neuroticism        470 non-null    object
dtypes: object(8)
memory usage: 33.0+ KB


The preprocessing will be defined as a single function:

In [26]:
def preprocess(text):
    
    stoplist = stopwords.words('dutch') # define stop words
    punctuations = string.punctuation + "’¶•@°©®™" # define punctuations
    
    txt = text.lower() # puts text in lowercase
    txt = re.sub(r"[^a-zA-ZÀ-ÿ]", " ", txt)
    
    translator = str.maketrans(punctuations, " "*len(punctuations))
    s = txt.translate(translator)  # s is a variable that contains the data freed from punctuation
    
    no_digits= ''.join([i for i in s if not i.isdigit()])
    cleaned = " ".join(no_digits.split())
    
    word_tokens = word_tokenize(cleaned)
    filtered_sentence = [w for w in word_tokens if not w in stoplist]
    filtered_sentence = " ".join(filtered_sentence)
    
    filtered_sentence = [stemmer.stem(word) for word in word_tokenize(filtered_sentence)]
    filtered_sentence = " ".join(filtered_sentence)
    
    return filtered_sentence

In [27]:
data.Essay = data.Essay.astype(str)

In [28]:
data["clean_essay"] = data.Essay.apply(preprocess)

In [29]:
len(data)

470

In [30]:
# defining the cut-off points for our binary SVM to predict: if a certain personality 
# trait is 50 or higher, it will be assigned a 1, otherwise it will get a 0 if the value
# is below


ocean = ["Openness", "Conscientiousness", "Extroversion", "Agreeableness", "Neuroticism"]

for column in ocean:
    data[column] = data[column].astype(int)
    data.loc[data[column] < 50, column] = 0
    data.loc[data[column] >= 50, column] = 1

In [31]:
data.head(5)

Unnamed: 0,Participant,Essay,BigFive,Openness,Conscientiousness,Extroversion,Agreeableness,Neuroticism,clean_essay
0,92419385,Het orgaandonorschap: een dualistische visie\n...,84-58-95-6-27,1,1,1,0,0,orgaandonorschap dualistisch visie donker hang...
1,89353110,"Iedereen automatisch geregistreerd als donor, ...",1-41-70-8-80,0,0,1,0,1,iederen automatisch geregistreerd donor liever...
2,30423159,Gezond leven: het middel om langer te leven?\n...,1-86-4-50-27,0,1,0,1,0,gezond lev middel langer lev tijdschrift zoal ...
3,30423159,Werken tot 65 jaar zorgt voor meer psychische ...,1-86-4-50-27,0,1,0,1,0,werk jar zorgt psychisch problem over beslist ...
4,30423159,In 2012 is er een wet doorgekomen die het drag...,1-86-4-50-27,0,1,0,1,0,wet doorgekom drag levensbeschouw teken verbie...


In [32]:
# saving the data into a csv file so that we can reuse it in building the LSTM
data.to_csv("./data/clean_data_ltp.csv")

## Data splitting

We will be splitting data into 80% train and 20% test sets:

In [33]:
train, test = train_test_split(data, test_size=0.20, random_state= 42) # any good reason why 20%

In [34]:
print("Training set: {} observations; Test set: {} observations".format(len(train), len(test)))

Training set: 376 observations; Test set: 94 observations


In [35]:
train_x = [x for x in train.clean_essay]

train_y_OPN = [x for x in train.Openness]
train_y_CON = [x for x in train.Conscientiousness]
train_y_EXT = [x for x in train.Extroversion]
train_y_AGR = [x for x in train.Agreeableness]
train_y_NEU = [x for x in train.Neuroticism]

test_x = [x for x in test.clean_essay]

test_y_OPN = [x for x in test.Openness]
test_y_CON = [x for x in test.Conscientiousness]
test_y_EXT = [x for x in test.Extroversion]
test_y_AGR = [x for x in test.Agreeableness]
test_y_NEU = [x for x in test.Neuroticism]

## Building an SVM with a TF-IDF vectorizer

In [36]:
tfidf = TfidfVectorizer()

train_x_vec = tfidf.fit_transform(train_x)
test_x_vec = tfidf.transform(test_x)

In [37]:
# Saving our TF-IDF vocabulary, so that we can call it later
pickle.dump(tfidf.vocabulary_, open("./tfidf_vocab/svm_vocab.pkl", "wb"))

In [38]:
# Using GridSearchCV, we will conduct hyperparameter search to 
# determine the best parameters. Below we define the grid we are
# searching in:

grid = {
    'C': [1,2,3,4,5,6,7,8,9,10,11,12],
    'kernel': ['linear', 'rbf']
}

### Openness

In [39]:
# First we will conduct the hyperparameter search

OPN_model = svm.SVC()

CV_svc = GridSearchCV(OPN_model, param_grid= grid, n_jobs =-1, scoring="f1_micro")
CV_svc.fit(train_x_vec, train_y_OPN)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [40]:
print("Best hyperparameters for Openness: {}".format(CV_svc.best_params_))
print("Best F1-score for Openness: {}".format(CV_svc.best_score_))

Best hyperparameters for Openness: {'C': 3, 'kernel': 'rbf'}
Best F1-score for Openness: 0.5477543859649123


In [41]:
# Next, we will train the SVMs on the gained values

clf_svm_OPN = CV_svc.best_estimator_
clf_svm_OPN.fit(train_x_vec, train_y_OPN)

print("Accuracy score in detecting Open personality trait: ", clf_svm_OPN.score(test_x_vec, test_y_OPN))

Accuracy score in detecting Open personality trait:  0.5638297872340425


In [42]:
# Checking predictions over test data

y_pred = clf_svm_OPN.predict(test_x_vec)

print("F1-micro: {:.4f}".format(f1_score(test_y_OPN, y_pred, average = "micro")))
print("Precision: {:.4f}".format(precision_score(test_y_OPN, y_pred)))
print("Recall: {:.4f}".format(recall_score(test_y_OPN, y_pred, average = "micro")))

F1-micro: 0.5638
Precision: 0.5000
Recall: 0.5638


In [43]:
# Displaying the classification report:

print(classification_report(test_y_OPN, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.49      0.56        53
           1       0.50      0.66      0.57        41

    accuracy                           0.56        94
   macro avg       0.57      0.57      0.56        94
weighted avg       0.58      0.56      0.56        94



In [44]:
# Saving the model to use in our pipeline

model_path = "./models/OPN_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_OPN, file)

Below the same process will be conducted over each of the remaining personality traits:

### Conscientiousness

In [45]:
CON_model = svm.SVC()

CV_svc = GridSearchCV(CON_model, param_grid=grid, n_jobs=-1, scoring="f1_micro")
CV_svc.fit(train_x_vec, train_y_CON)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [46]:
print("Best hyperparameters for Conscientiousness: {}".format(CV_svc.best_params_))
print("Best F1-score for Conscientiousness: {}".format(CV_svc.best_score_))

Best hyperparameters for Conscientiousness: {'C': 1, 'kernel': 'rbf'}
Best F1-score for Conscientiousness: 0.5401754385964913


In [47]:
clf_svm_CON = CV_svc.best_estimator_
clf_svm_CON.fit(train_x_vec, train_y_CON)

print("Accuracy score in detecting Conscientious personality trait: ", clf_svm_OPN.score(test_x_vec, test_y_CON))

Accuracy score in detecting Conscientious personality trait:  0.4787234042553192


In [48]:
y_pred = clf_svm_CON.predict(test_x_vec)

print("F1-micro: {:.4f}".format(f1_score(test_y_CON, y_pred, average = "micro")))
print("Precision: {:.4f}".format(precision_score(test_y_CON, y_pred)))
print("Recall: {:.4f}".format(recall_score(test_y_CON, y_pred, average = "micro")))

F1-micro: 0.5532
Precision: 0.6667
Recall: 0.5532


In [49]:
print(classification_report(test_y_CON, y_pred))

              precision    recall  f1-score   support

           0       0.52      0.84      0.64        45
           1       0.67      0.29      0.40        49

    accuracy                           0.55        94
   macro avg       0.59      0.57      0.52        94
weighted avg       0.60      0.55      0.52        94



In [50]:
model_path = "./models/CON_svm_model.pkl"  
with open(model_path, 'wb') as file:
    pickle.dump(clf_svm_CON, file)

### Extroversion

In [51]:
EXT_model = svm.SVC()

CV_svc = GridSearchCV(EXT_model, param_grid = grid, n_jobs= -1, scoring = "f1_micro")
CV_svc.fit(train_x_vec,train_y_EXT)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [52]:
print("Best hyperparameters for Extroversion: {}".format(CV_svc.best_params_))
print("Best F1-score for Extroversion: {}".format(CV_svc.best_score_))

Best hyperparameters for Extroversion: {'C': 3, 'kernel': 'linear'}
Best F1-score for Extroversion: 0.5744561403508772


In [53]:
clf_svm_EXT = CV_svc.best_estimator_
clf_svm_EXT.fit(train_x_vec, train_y_EXT)

print("Accuracy score of detecting Extroversion: ", clf_svm_EXT.score(test_x_vec, test_y_EXT))

Accuracy score of detecting Extroversion:  0.5212765957446809


In [54]:
y_pred = clf_svm_EXT.predict(test_x_vec)

print("F1-micro: {:.4f}".format(f1_score(test_y_EXT, y_pred, average = "micro")))
print("Precision: {:.4f}".format(precision_score(test_y_EXT, y_pred)))
print("Recall: {:.4f}".format(recall_score(test_y_EXT, y_pred, average = "micro")))

F1-micro: 0.5213
Precision: 0.5833
Recall: 0.5213


In [55]:
print(classification_report(test_y_EXT, y_pred))

              precision    recall  f1-score   support

           0       0.46      0.51      0.48        41
           1       0.58      0.53      0.55        53

    accuracy                           0.52        94
   macro avg       0.52      0.52      0.52        94
weighted avg       0.53      0.52      0.52        94



In [56]:
model_path = "./models/EXT_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_EXT, file)

### Agreeableness

In [57]:
AGR_model = svm.SVC()

CV_svc = GridSearchCV(AGR_model, param_grid = grid, n_jobs= -1, scoring = "f1_micro")
CV_svc.fit(train_x_vec,train_y_AGR)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [58]:
print("Best hyperparameters for Agreeableness: {}".format(CV_svc.best_params_))
print("Best F1-score for Agreeableness: {}".format(CV_svc.best_score_))

Best hyperparameters for Agreeableness: {'C': 3, 'kernel': 'rbf'}
Best F1-score for Agreeableness: 0.5718245614035088


In [59]:
clf_svm_AGR = CV_svc.best_estimator_
clf_svm_AGR.fit(train_x_vec, train_y_AGR)

print("Accuracy score of detecting Agreeableness: ", clf_svm_AGR.score(test_x_vec, test_y_AGR))

Accuracy score of detecting Agreeableness:  0.43617021276595747


In [60]:
y_pred = clf_svm_AGR.predict(test_x_vec)

print("F1-micro: {:.4f}".format(f1_score(test_y_AGR, y_pred, average = "micro")))
print("Precision: {:.4f}".format(precision_score(test_y_AGR, y_pred)))
print("Recall: {:.4f}".format(recall_score(test_y_AGR, y_pred, average = "micro")))

F1-micro: 0.4362
Precision: 0.3226
Recall: 0.4362


In [61]:
print(classification_report(test_y_AGR, y_pred))

              precision    recall  f1-score   support

           0       0.49      0.60      0.54        52
           1       0.32      0.24      0.27        42

    accuracy                           0.44        94
   macro avg       0.41      0.42      0.41        94
weighted avg       0.42      0.44      0.42        94



In [62]:
model_path = "./models/AGR_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_AGR, file)

### Neuroticism

In [63]:
NEU_model = svm.SVC()

CV_svc = GridSearchCV(NEU_model, param_grid = grid, n_jobs= -1, scoring = "f1_micro")
CV_svc.fit(train_x_vec,train_y_NEU)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                         'kernel': ['linear', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_micro', verbose=0)

In [64]:
print("Best hyperparameters for Neuroticism are: {}".format(CV_svc.best_params_))
print("Best F1-score for Neuroticism is: {}".format(CV_svc.best_score_))

Best hyperparameters for Neuroticism are: {'C': 1, 'kernel': 'rbf'}
Best F1-score for Neuroticism is: 0.51059649122807


In [65]:
clf_svm_NEU = CV_svc.best_estimator_
clf_svm_NEU.fit(train_x_vec, train_y_NEU)

print("Accuracy score of detecting Neuroticism: ", clf_svm_NEU.score(test_x_vec, test_y_NEU))

Accuracy score of detecting Neuroticism:  0.4574468085106383


In [66]:
y_pred = clf_svm_NEU.predict(test_x_vec)

print("F1-micro: {:.4f}".format(f1_score(test_y_NEU, y_pred, average = "micro")))
print("Precision: {:.4f}".format(precision_score(test_y_NEU, y_pred)))
print("Recall: {:.4f}".format(recall_score(test_y_NEU, y_pred, average = "micro")))

F1-micro: 0.4574
Precision: 0.5125
Recall: 0.4574


In [67]:
print(classification_report(test_y_NEU, y_pred))

              precision    recall  f1-score   support

           0       0.14      0.05      0.07        41
           1       0.51      0.77      0.62        53

    accuracy                           0.46        94
   macro avg       0.33      0.41      0.34        94
weighted avg       0.35      0.46      0.38        94



In [68]:
model_path = "./models/NEU_svm_model.pkl"  
with open(model_path, 'wb') as file:  
    pickle.dump(clf_svm_NEU, file)

## Composing an SVM prediction pipeline with all personalities

In [69]:
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=pickle.load(open("./tfidf_vocab/svm_vocab.pkl", "rb")))

In [70]:
def predict_big_five(essay): 
    path = "./models/"
    extension = "_svm_model.pkl"
    
    with open(path+"AGR"+extension, 'rb') as file:  
        AGR_model = pickle.load(file)
    
    with open(path+"CON"+extension, 'rb') as file:  
        CON_model = pickle.load(file)
    
    with open(path+"EXT"+extension, 'rb') as file:  
        EXT_model = pickle.load(file)
    
    with open(path+"NEU"+extension, 'rb') as file:  
        NEU_model = pickle.load(file)
    
    with open(path+"OPN"+extension, 'rb') as file:  
        OPN_model = pickle.load(file)
    
    # building a feature matrix:
    transformer = TfidfTransformer()
    features = transformer.fit_transform(loaded_vec.fit_transform([preprocess(essay)]))
    
    # getting all 5 predictions:
    
    AGR_prediction = int(AGR_model.predict(features.toarray()))
    CON_prediction = int(CON_model.predict(features.toarray()))
    EXT_prediction = int(EXT_model.predict(features.toarray()))
    NEU_prediction = int(NEU_model.predict(features.toarray()))
    OPN_prediction = int(OPN_model.predict(features.toarray()))
    
    # reporting on all 5 predictions in one report:
    final_prediction = {
        "Openness": OPN_prediction,
        "Conscientiousness": CON_prediction,
        "Extroversion": EXT_prediction,
        "Agreeableness": AGR_prediction,
        "Neuroticism": NEU_prediction
    }
    
    return final_prediction

In [71]:
test_essay = data.Essay[13]
result = predict_big_five(test_essay)

result

{'Openness': 1,
 'Conscientiousness': 0,
 'Extroversion': 1,
 'Agreeableness': 0,
 'Neuroticism': 1}

# Above we provided a possible solution for assembling a binary SVM pipeline to predict Big Five personality traits in the Dutch language.