In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from pandarallel import pandarallel

from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

In [3]:
def get_behaviors_string(string):
    if not string: return ""
    
    string = re.sub(r'[^\w]', ' ', string)
    
    res = ""
    
    dic = get_behavior_breakdown(string)

    
    for key in dic:
        arr = dic[key]
        for i in arr:
            res += i + " "
        res += key + " "
        
        
    return res

In [4]:
# df = pd.read_pickle("./00_Dataset Creation/dataset.pkl")

In [5]:
# df.head()

In [6]:
def get_behaviors_string(string):
    if not string: return ""
    
    string = re.sub(r'[^\w]', ' ', string)
    
    res = ""
    
    dic = get_behavior_breakdown(string)

    
    for key in dic:
        arr = dic[key]
        for i in arr:
            res += i + " "
        res += key + " "
        
        
    return res

In [7]:
# df["behavioral"] = df.apply(lambda x: get_behaviors_string(x["narrative"]), axis=1)

In [8]:
# df.to_pickle("02_behavioral_dataset.pkl")

# Import (instead of training data again)

In [25]:
df = pd.read_pickle("./02_Behavioral/02_behavioral_dataset.pkl")

# Training

In [26]:
train, test = model_selection.train_test_split(df, test_size=0.3, random_state=42)

In [27]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r.behavioral), tags=[r.label]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r.behavioral), tags=[r.label]), axis=1)

In [28]:
train_tagged.values[69]

TaggedDocument(words=['considered', 'cutting', 'declaring', 'independent', 'are', 're', 'paying'], tags=['unwell'])

In [29]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [30]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 3326/3326 [00:00<00:00, 1192431.41it/s]


In [31]:
# %%time
# for epoch in range(30):
#     model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
#     model_dbow.alpha -= 0.002
#     model_dbow.min_alpha = model_dbow.alpha

model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1000)
model_dbow.alpha -= 0.002
model_dbow.min_alpha = model_dbow.alpha
model_dbow.save("doc2vec_second.model")

100%|██████████| 3326/3326 [00:00<00:00, 951067.30it/s]


In [32]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [33]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      unwell       0.84      0.80      0.82       718
        well       0.81      0.85      0.83       708

    accuracy                           0.82      1426
   macro avg       0.82      0.82      0.82      1426
weighted avg       0.83      0.82      0.82      1426



In [35]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)

y_pred = SVM.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      unwell       0.86      0.78      0.82       718
        well       0.80      0.87      0.83       708

    accuracy                           0.83      1426
   macro avg       0.83      0.83      0.83      1426
weighted avg       0.83      0.83      0.83      1426



In [37]:
joblib.dump(SVM, '02_svm_model.pkl', compress=9)

['02_svm_model.pkl']