In [47]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [17]:
df = pd.read_pickle("00_unwell_well_dataset.pkl")

In [18]:
df.head()

Unnamed: 0,cleaned_text,category
5,f abused different people young age,unwell
6,grew dad laying top woke staed continued close...,unwell
7,would call mommy ask come wipe bathroom,unwell
9,never anything said things stayed away,unwell
10,seventh grade became depressed staed self harming,unwell


In [20]:
train, test = model_selection.train_test_split(df, test_size=0.3, random_state=42)

In [22]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r.cleaned_text), tags=[r.category]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r.cleaned_text), tags=[r.category]), axis=1)

In [33]:
train_tagged.values[69]

TaggedDocument(words=['ive', 'considered', 'cutting', 'declaring', 'independent', 'several', 'problems', 'theyre', 'paying', 'tuition', 'phone', 'plan'], tags=['unwell'])

In [34]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [37]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 3326/3326 [00:00<00:00, 890934.67it/s]


In [40]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 3326/3326 [00:00<00:00, 930389.16it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2181090.54it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2845829.27it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2862765.26it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2855148.40it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2760240.42it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2721469.98it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2833114.36it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2586254.19it/s]
100%|██████████| 3326/3326 [00:00<00:00, 1667892.77it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2693619.44it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2774513.74it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2847571.97it/s]
100%|██████████| 3326/3326 [00:00<00:00, 1570444.12it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2502287.91it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2582424.12it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2657191.45it/s]
100%|██████████| 3326/3326 [00:0

CPU times: user 5.09 s, sys: 1.26 s, total: 6.35 s
Wall time: 4.49 s


In [41]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [43]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [44]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      unwell       0.72      0.74      0.73       718
        well       0.73      0.71      0.72       708

    accuracy                           0.72      1426
   macro avg       0.72      0.72      0.72      1426
weighted avg       0.72      0.72      0.72      1426



In [45]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)

y_pred = SVM.predict(X_test)

In [46]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      unwell       0.80      0.62      0.70       718
        well       0.69      0.85      0.76       708

    accuracy                           0.73      1426
   macro avg       0.75      0.73      0.73      1426
weighted avg       0.75      0.73      0.73      1426



In [49]:
joblib.dump(SVM, '01_svm_model.pkl', compress=9)

['01_svm_model.pkl']