In [1]:
import multiprocessing

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# NOTE(review): this rebinds the name `stopwords` from the nltk corpus object
# imported on the previous line to the value returned by `.words('english')`
# (presumably a list of English stop words). No later cell visible in this
# notebook reads it.
stopwords = stopwords.words('english')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# Load the labelled narratives produced by the dataset-creation step.
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickles that come from a trusted source.
dataset_path = "./00_Dataset Creation/dataset.pkl"
df = pd.read_pickle(dataset_path)

In [3]:
# Show the loaded frame; the recorded output lists a `narrative` text column
# and a `label` column with the values "unwell" / "well".
# NOTE(review): the narratives are sensitive personal accounts — consider
# clearing this cell's output before sharing the notebook.
df

Unnamed: 0,narrative,label
5,\nI (20 F) have been abused by different peopl...,unwell
6,I grew up with my dad laying on top of me when...,unwell
7,He would call me mommy and ask me to come wipe...,unwell
9,I never did anything when he said those things...,unwell
10,\n\nWhen I was in seventh grade I became depre...,unwell
...,...,...
2371,Taking my morning walk and having a cup of cof...,well
2372,"I ate my favorite meal, hot chicken.",well
2373,I was happy to have a taco Tuesday with my two...,well
2374,Buying a new TV.,well


In [4]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# partition identical from run to run.
train, test = model_selection.train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)

In [5]:
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is broken into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens shorter than two
    characters are dropped; every remaining token is lower-cased.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

def _tag_row(row):
    """Wrap one dataframe row as a TaggedDocument: tokenized narrative, tagged with its label."""
    return TaggedDocument(words=tokenize_text(row.narrative), tags=[row.label])


# Apply the same row -> TaggedDocument conversion to both splits.
train_tagged = train.apply(_tag_row, axis=1)
test_tagged = test.apply(_tag_row, axis=1)

In [6]:
# Spot-check one tagged training document (positional index 69 appears to be
# an arbitrary pick — TODO confirm) to see the token list and its label tag.
train_tagged.values[69]

TaggedDocument(words=["'ve", 'considered', 'cutting', 'them', 'off', 'and', 'declaring', 'myself', 'independent', 'but', 'there', 'are', 'several', 'problems', 'they', "'re", 'paying', 'for', 'my', 'tuition', 'and', 'my', 'phone', 'plan'], tags=['unwell'])

In [7]:
# CPU count reported by the OS; passed to Doc2Vec below as its worker count.
# `multiprocessing` is imported with the other imports in the first cell.
cores = multiprocessing.cpu_count()

In [8]:
# Doc2Vec hyper-parameters. Per the gensim documentation, dm=0 selects the
# distributed-bag-of-words (DBOW) training mode; vectors are 300-dimensional
# and one worker is used per CPU.
dbow_params = dict(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
model_dbow = Doc2Vec(**dbow_params)

# Build the vocabulary from the tagged training documents only.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 3326/3326 [00:00<00:00, 1043633.96it/s]


In [9]:
# Train the DBOW model in one call; gensim decays the learning rate across the
# requested epochs itself, so no manual alpha bookkeeping is required.
#
# The former `alpha -= 0.002` / `min_alpha = alpha` statements were left over
# from a manual epoch loop. They ran after training had already finished, so
# they could not influence it, but they did alter the alpha/min_alpha values
# stored on the model that is saved below. They have been removed.
#
# The corpus is shuffled once with a fixed seed so the document order is
# repeatable between runs (same seed as the train/test split).
train_corpus = utils.shuffle(list(train_tagged.values), random_state=42)
model_dbow.train(train_corpus, total_examples=len(train_corpus), epochs=400)
model_dbow.save("doc2vec_first.model")

100%|██████████| 3326/3326 [00:00<00:00, 983104.66it/s]


In [10]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer one feature vector per tagged document.

    Args:
        model: trained Doc2Vec-style object exposing
            ``infer_vector(words, epochs=...)``.
        tagged_docs: object whose ``.values`` is a sequence of documents, each
            with a ``.words`` token list and a ``.tags`` list; the first tag
            is used as the document's target label.
        epochs: number of inference passes per document. Defaults to 20, the
            value that was previously hard-coded.

    Returns:
        tuple: ``(targets, regressors)`` — two equally long tuples holding the
        first tag and the inferred vector of every document, in input order.
        Both are empty tuples when there are no documents.
    """
    docs = tagged_docs.values
    if len(docs) == 0:
        # zip(*[]) yields nothing to unpack, so handle the empty corpus here.
        return (), ()
    # `epochs` is the supported keyword; the older `steps` alias was removed
    # in gensim 4.
    targets, regressors = zip(
        *((doc.tags[0], model.infer_vector(doc.words, epochs=epochs)) for doc in docs)
    )
    return targets, regressors

In [11]:
# Turn both splits into (labels, inferred Doc2Vec vectors).
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

# With the default iteration budget the solver stopped at its limit before
# converging (this cell used to emit that warning), so allow more iterations.
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=1000)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
# Per-class precision / recall / F1 for the logistic-regression predictions
# on the held-out split.
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

              precision    recall  f1-score   support

      unwell       0.95      0.91      0.93       718
        well       0.92      0.95      0.93       708

    accuracy                           0.93      1426
   macro avg       0.93      0.93      0.93      1426
weighted avg       0.93      0.93      0.93      1426



In [13]:
# Linear-kernel support vector classifier fitted on the same Doc2Vec features.
svc_params = {"C": 1.0, "kernel": "linear", "degree": 3, "gamma": "auto"}
SVM = svm.SVC(**svc_params)
SVM.fit(X_train, y_train)

# Replaces the logistic-regression predictions previously held in y_pred.
y_pred = SVM.predict(X_test)

In [14]:
# Per-class precision / recall / F1 for the SVM predictions on the held-out
# split.
svm_report = classification_report(y_test, y_pred)
print(svm_report)

              precision    recall  f1-score   support

      unwell       0.93      0.93      0.93       718
        well       0.93      0.93      0.93       708

    accuracy                           0.93      1426
   macro avg       0.93      0.93      0.93      1426
weighted avg       0.93      0.93      0.93      1426



In [15]:
# Persist the fitted SVM at compression level 9. The call's return value (the
# list of written file names) is what this cell displays.
joblib.dump(value=SVM, filename='01_svm_model.pkl', compress=9)

['01_svm_model.pkl']