In [35]:
import re

import pandas as pd
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import joblib

from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

#Text Analysis
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style as style
from textblob import TextBlob as tb
from IPython.display import Image as im
style.use('fivethirtyeight')

In [36]:
# SVM model imports
# Trained classifiers, one per feature view (doc / beh / emo).
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load artifacts that were produced by a trusted run.
model_doc = joblib.load("./Exported_Models/01_svm_model.pkl")
model_beh = joblib.load("./Exported_Models/02_svm_model.pkl")
model_emo = joblib.load("./Exported_Models/03_svm_model.pkl")

# Doc2Vec models used to embed the cleaned text (doc) and the behavior string (beh).
doc2vec_doc = Doc2Vec.load("./Exported_Models/doc2vec.model")
doc2vec_beh = Doc2Vec.load("./Exported_Models/behavioral.model")

# Exported test artifacts. `test` is the DataFrame scored below; X_test / y_test
# are presumably the matching feature/label split -- TODO confirm, they are not
# used in the cells visible here.
X_test = joblib.load("./Exported_Models/X_test.pkl")
y_test = joblib.load("./Exported_Models/y_test.pkl")
test = joblib.load("./Exported_Models/test.pkl")

doc ---> 1st model (leftmost)
beh ---> 2nd model (middle)
emo ---> 3rd model (rightmost)

In [13]:
# Execute the shared utilities notebook inside this kernel.
# get_behavior_breakdown and get_sentiment_breakdown, which the helpers below
# call, are not defined in this notebook -- presumably they come from here
# (TODO confirm). This cell must therefore run before those helpers are used.
%run './Utils/COMBINED.ipynb'

In [42]:
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is segmented into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens shorter than two
    characters are discarded.

    Returns:
        list: the remaining tokens, lower-cased, in the order they appear.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 2
    ]

def get_behaviors_string(string):
    """Flatten the behavior breakdown of ``string`` into one space-separated string.

    Every non-word character of ``string`` is replaced by a space, the result is
    handed to ``get_behavior_breakdown`` (defined outside this cell), and for
    each key of the returned mapping its items are emitted followed by the key
    itself. Every emitted piece, including the last one, is followed by a single
    space.

    Args:
        string: raw text. A falsy value (``None`` or ``""``) yields ``""``.

    Returns:
        str: ``"item item key item key "``-style text, or ``""`` when nothing
        is emitted.
    """
    if not string:
        return ""

    # Strip punctuation and other non-word characters before the breakdown.
    cleaned = re.sub(r'[^\w]', ' ', string)
    breakdown = get_behavior_breakdown(cleaned)

    # Collect the pieces and join once instead of growing a string with `+=`.
    pieces = []
    for key in breakdown:
        pieces.extend(breakdown[key])
        pieces.append(key)

    return "".join(piece + " " for piece in pieces)

def doc_infer_vector(string):
    """Embed ``string`` with the document-level Doc2Vec model.

    The text is tokenized with ``tokenize_text`` and the token list is passed to
    ``doc2vec_doc.infer_vector``; whatever that call returns is returned as-is.
    """
    tokens = tokenize_text(string)
    return doc2vec_doc.infer_vector(tokens)

def beh_infer_vector(string):
    """Embed the behavior words of ``string`` with the behavioral Doc2Vec model.

    ``string`` is first reduced to its behavior string by
    ``get_behaviors_string``, that string is tokenized with ``tokenize_text``,
    and the tokens are passed to ``doc2vec_beh.infer_vector``.
    """
    behavior_text = get_behaviors_string(string)
    behavior_tokens = tokenize_text(behavior_text)
    return doc2vec_beh.infer_vector(behavior_tokens)

# Fixed feature order of the emotion vector. Both "anticip" and "anticipation"
# are kept because the original column list contained both spellings.
_EMOTION_COLUMNS = (
    "negative", "positive", "fear", "anger", "trust", "sadness",
    "disgust", "anticip", "surprise", "joy", "anticipation",
)


def emo_infer_freq(string):
    """Turn the sentiment breakdown of ``string`` into a fixed-order list.

    ``get_sentiment_breakdown`` (defined outside this cell) is expected to
    return a mapping of emotion name to frequency. The values are laid out in
    the order of ``_EMOTION_COLUMNS``; an emotion that is absent, ``None`` or
    NaN contributes ``0.0``.

    The previous implementation used ``DataFrame.append``, which no longer
    exists in pandas 2.x, and built a one-row DataFrame per call. This version
    needs no pandas at all.

    Keys that are not in ``_EMOTION_COLUMNS`` are appended after the fixed
    columns in sorted order rather than dropped.
    NOTE(review): the downstream classifier presumably expects exactly the 11
    fixed features -- confirm that no other keys are ever returned.

    Args:
        string: text to analyse.

    Returns:
        list: one value per column, in column order.
    """
    breakdown = get_sentiment_breakdown(string)
    freq = {} if breakdown is None else dict(breakdown)

    extras = sorted(key for key in freq if key not in _EMOTION_COLUMNS)

    vector = []
    for column in list(_EMOTION_COLUMNS) + extras:
        value = freq.get(column)
        # None and NaN (the only value unequal to itself) both mean "missing".
        if value is None or value != value:
            value = 0.0
        vector.append(value)
    return vector

In [38]:
# Build the three per-row feature representations from the cleaned text:
# document embedding, behavior embedding and emotion-frequency list.
cleaned_text = test.cleaned_text
test["doc_vec"] = cleaned_text.apply(doc_infer_vector)
test["beh_vec"] = cleaned_text.apply(beh_infer_vector)
test["emo_vec"] = cleaned_text.apply(emo_infer_freq)

In [54]:
# Stack the per-row document vectors into one (n_rows, n_features) matrix and
# classify them in a single call instead of one predict() per row.
test["doc_label"] = model_doc.predict(np.vstack(test.doc_vec.tolist()))

In [55]:
# Stack the per-row behavior vectors into one (n_rows, n_features) matrix and
# classify them in a single call instead of one predict() per row.
test["beh_label"] = model_beh.predict(np.vstack(test.beh_vec.tolist()))

In [48]:
# emo_vec holds one plain list per row, so the column as a list of lists is
# already a 2-D sample matrix; classify it in a single batched call.
test["emo_label"] = model_emo.predict(test.emo_vec.tolist())

In [51]:
def finalize(a, b, c):
    """Return the label that occurs most often among ``a``, ``b`` and ``c``.

    Ties are resolved in favour of the earliest argument, so when all three
    labels differ the result is ``a``.
    """
    votes = [a, b, c]
    # max() keeps the first element with the highest count, which gives the
    # same "earliest wins" tie-break as a strict `>` comparison loop.
    return max(votes, key=votes.count)

In [62]:
# Spot-check the first two rows, including the feature and label columns added above.
test.head(2)

Unnamed: 0,narrative,label,cleaned_text,behavior,emotion,emotion_array,doc_vec,beh_vec,emo_vec,doc_label,beh_label,emo_label
3975,Over the past couple of weeks my anxiety was a...,unwell,past couple weeks anxiety low,,"{'fear': 0.2, 'anger': 0.2, 'anticip': 0.0, 't...","[0.2, 0.0, 0.2, 0.2, 0.0, 0.2, 0.0, 0.0, 0.0, ...","[-0.52901137, -0.1970458, -1.0356841, -0.00980...","[0.00016271167, 0.0007172979, 0.00034254458, 0...","[0.2, 0.0, 0.2, 0.2, 0.0, 0.2, 0.0, 0.0, 0.0, ...",unwell,well,unwell
2932,My last memory of him will be watching him smi...,unwell,last memory watching smirk cried,cried watching,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.83032066, 0.9040563, -1.0618615, 0.47434443...","[-0.05315947, -0.027775932, -0.07356696, -0.00...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",well,well,unwell


In [66]:
# Majority vote across the three per-model labels. Zipping the three columns
# avoids DataFrame.apply(axis=1), which materialises a Series for every row,
# and still yields exactly one vote per row in row order.
test["final"] = [
    finalize(doc_label, beh_label, emo_label)
    for doc_label, beh_label, emo_label in zip(
        test.doc_label, test.beh_label, test.emo_label
    )
]

In [69]:
# Score the majority-vote labels against the ground-truth `label` column.
# classification_report returns preformatted text, so print() is the right display here.
print(classification_report(test.label, test.final))

              precision    recall  f1-score   support

      unwell       0.94      0.79      0.85       718
        well       0.81      0.94      0.87       708

    accuracy                           0.87      1426
   macro avg       0.87      0.87      0.86      1426
weighted avg       0.88      0.87      0.86      1426

