In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [20]:
# Read the essays dataset, decoding the file's bytes as Latin-1.
df = pd.read_csv(
    'essays.csv',
    encoding='latin1',
)

# Display the first rows as a quick look at the loaded columns.
df.head()

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y


In [21]:
# Inputs are the essay texts; targets are the five trait label columns.
essay_texts = df['TEXT']
trait_labels = df[['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    essay_texts,
    trait_labels,
    test_size=0.2,
    random_state=42,
)


In [22]:
# Turn the raw essay text into TF-IDF features. The vectorizer is fitted on
# the training texts only and then reused, already fitted, on the test texts,
# so the test split does not influence the learned vocabulary or weights.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = (
    tfidf_vectorizer.fit_transform(X_train),  # evaluated first: fits, then transforms
    tfidf_vectorizer.transform(X_test),       # evaluated second: transform only
)

In [23]:
# Train one logistic regression model per personality-trait column.
# The trait names are read from y_train's columns rather than repeated as a
# literal list, so the set of trained models always matches the label columns
# that were selected when the data was split.
models = {}
for trait in y_train.columns:
    model = LogisticRegression()
    # Every trait shares the same TF-IDF training matrix; only the target differs.
    model.fit(X_train_tfidf, y_train[trait])
    models[trait] = model


In [24]:
# For every trained trait model, predict on the held-out TF-IDF features and
# keep the resulting text classification report, keyed by trait name.
results = {
    trait: classification_report(y_test[trait], model.predict(X_test_tfidf))
    for trait, model in models.items()
}

In [25]:
# Emit each trait's report beneath a heading line naming the trait.
for trait, result in results.items():
    print(f"Classification Report for {trait}:", result, sep="\n")

Classification Report for cEXT:
              precision    recall  f1-score   support

           n       0.50      0.48      0.49       227
           y       0.57      0.60      0.59       267

    accuracy                           0.54       494
   macro avg       0.54      0.54      0.54       494
weighted avg       0.54      0.54      0.54       494

Classification Report for cNEU:
              precision    recall  f1-score   support

           n       0.63      0.57      0.60       260
           y       0.57      0.63      0.60       234

    accuracy                           0.60       494
   macro avg       0.60      0.60      0.60       494
weighted avg       0.60      0.60      0.60       494

Classification Report for cAGR:
              precision    recall  f1-score   support

           n       0.49      0.38      0.42       220
           y       0.58      0.68      0.62       274

    accuracy                           0.54       494
   macro avg       0.53      0.5

In [26]:
# Compute the test-set accuracy of each trait's model.
# accuracy_score is provided by the notebook's import cell, keeping all
# imports in one place so the notebook's dependencies are visible up front.
accuracies = {}
for trait, model in models.items():
    y_pred = model.predict(X_test_tfidf)
    accuracies[trait] = accuracy_score(y_test[trait], y_pred)

# Print accuracies
for trait, acc in accuracies.items():
    print(f"Accuracy for {trait}: {acc}")

Accuracy for cEXT: 0.5445344129554656
Accuracy for cNEU: 0.5991902834008097
Accuracy for cAGR: 0.5445344129554656
Accuracy for cCON: 0.5546558704453441
Accuracy for cOPN: 0.5991902834008097
