In [25]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from nltk.corpus import stopwords
stopwords = stopwords.words('english')

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [26]:
# %run COMBINED.ipynb

# First Model

## Unwell Dataset

In [27]:
unwell = pd.read_excel("./00_unwell.xlsx")

In [28]:
def filter(mer, jerome, loyd):
    li = [mer, jerome, loyd]
    
    def most_frequent(List): 
        counter = 0
        num = List[0] 

        for i in List: 
            curr_frequency = List.count(i) 
            if(curr_frequency> counter): 
                counter = curr_frequency 
                num = i 

        return num
    
    return most_frequent(li)

In [29]:
unwell["category"] = unwell.apply(lambda x: filter(x.mer, x.jerome, x.loyd), axis=1)

In [30]:
unwell = unwell[unwell.category == "unwell"]

In [31]:
def clean_text(df, column_name):
    df['cleaned_text'] = df[column_name].fillna('')
    df['cleaned_text'] = df['cleaned_text'].str.lower()
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|rt|\d+', '')
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'^\s+|\s+$', '') 
    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w not in (stopwords)]))
    return df

In [32]:
unwell = clean_text(unwell, "sentence")

  df['cleaned_text'] = df['cleaned_text'].str.replace(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|rt|\d+', '')
  df['cleaned_text'] = df['cleaned_text'].str.replace(r'^\s+|\s+$', '')


In [33]:
unwell = unwell[["cleaned_text", "category"]]

In [34]:
unwell.head()

Unnamed: 0,cleaned_text,category
5,f abused different people young age,unwell
6,grew dad laying top woke staed continued close...,unwell
7,would call mommy ask come wipe bathroom,unwell
9,never anything said things stayed away,unwell
10,seventh grade became depressed staed self harming,unwell


## Well Dataset

In [35]:
well = pd.read_csv("./00_cleaned_hm.csv")

In [36]:
well = well[well.ground_truth_category.notnull()]

In [37]:
well["category"] = well.ground_truth_category.apply(lambda x: "well")

In [38]:
well = well[0:len(unwell)].copy()

In [39]:
well = well[["cleaned_hm", "category"]]

In [40]:
well = well.rename(columns={"cleaned_hm": "cleaned_text"})

In [41]:
well.head()

Unnamed: 0,cleaned_text,category
3,We had a serious talk with some friends of our...,well
5,I meditated last night.,well
24,My grandmother start to walk from the bed afte...,well
32,I picked my daughter up from the airport and w...,well
42,when i received flowers from my best friend,well


## Merging

In [42]:
data = unwell.append(well)

In [43]:
data.category.value_counts()

unwell    2376
well      2376
Name: category, dtype: int64

In [44]:
len(data)

4752

In [45]:
len(data['cleaned_text'])

4752

In [46]:
len(data['category'])

4752

In [47]:
data.to_csv("00_unwell_well_dataset.csv")

In [54]:
data.to_pickle("00_unwell_well_dataset.pkl")

In [53]:
data.cleaned_text.iloc[0]

'f abused different people young age'

## Doc2Vec

In [48]:
train, test = model_selection.train_test_split(data, test_size=0.3, random_state=42)

In [50]:
import nltk
from nltk.corpus import stopwords

def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            if len(word) < 2:
                continue
            tokens.append(word.lower())
    return tokens

train_tagged = train.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_text']), tags=[r.category]), axis=1)
test_tagged = test.apply(
    lambda r: TaggedDocument(words=tokenize_text(r['cleaned_text']), tags=[r.category]), axis=1)

In [101]:
train_tagged.values[69]

TaggedDocument(words=['ive', 'considered', 'cutting', 'declaring', 'independent', 'several', 'problems', 'theyre', 'paying', 'tuition', 'phone', 'plan'], tags=['unwell'])

In [102]:
import multiprocessing
cores = multiprocessing.cpu_count()

In [103]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)
model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])

100%|██████████| 3326/3326 [00:00<00:00, 1193757.92it/s]


In [106]:
from sklearn import utils

# %%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 3326/3326 [00:00<00:00, 842254.13it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2192746.79it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2377344.09it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2493343.18it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2511749.21it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2515825.99it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2525390.13it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2533185.96it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2445268.20it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2512201.53it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2527678.04it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2353678.94it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2422339.83it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2469071.70it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2394482.51it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2431205.14it/s]
100%|██████████| 3326/3326 [00:00<00:00, 2387924.53it/s]
100%|██████████| 3326/3326 [00:0

In [113]:
def vec_for_learning(model, tagged_docs):
    sents = tagged_docs.values
    targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])
    return targets, regressors

In [119]:
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train,y_train)

y_pred = SVM.predict(X_test)


from sklearn.metrics import accuracy_score, f1_score
print('Testing accuracy %s' % accuracy_score(y_test, y_pred))
print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Testing accuracy 0.7608695652173914
Testing F1 score: 0.7608423987214106


In [120]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      unwell       0.77      0.75      0.76       718
        well       0.75      0.77      0.76       708

    accuracy                           0.76      1426
   macro avg       0.76      0.76      0.76      1426
weighted avg       0.76      0.76      0.76      1426



## Train Test Split

In [83]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(data['cleaned_text'], data['category'], test_size=0.3, random_state=42)

In [84]:
Encoder = LabelEncoder()
Y_train = Encoder.fit_transform(Y_train)
Y_test = Encoder.fit_transform(Y_test)

In [51]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(data['cleaned_text'])
X_train_tfidf = Tfidf_vect.transform(X_train)
X_test_tfidf = Tfidf_vect.transform(X_test)

## Training

In [52]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(X_train_tfidf, Y_train)

predictions_NB = Naive.predict(X_test_tfidf)

print("Naive Bayes Accuracy Score: ", accuracy_score(predictions_NB, Y_test)*100)

Naive Bayes Accuracy Score:  98.87798036465638


In [53]:
print(classification_report(Y_test, predictions_NB))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       718
           1       0.98      1.00      0.99       708

    accuracy                           0.99      1426
   macro avg       0.99      0.99      0.99      1426
weighted avg       0.99      0.99      0.99      1426



In [54]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_tfidf,Y_train)

predictions_SVM = SVM.predict(X_test_tfidf)

print("SVM Accuracy Score: ",accuracy_score(predictions_SVM, Y_test)*100)

SVM Accuracy Score:  99.36886395511921


In [55]:
print(classification_report(Y_test, predictions_SVM))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       718
           1       1.00      0.99      0.99       708

    accuracy                           0.99      1426
   macro avg       0.99      0.99      0.99      1426
weighted avg       0.99      0.99      0.99      1426



# Second Model

In [56]:
def get_behaviors_string(string):
    res = ""
    
    dic = get_behavior_breakdown(string)

    
    for key in dic:
        arr = dic[key]
        for i in arr:
            res += i + " "
        res += key + " "
        
        
    return res

In [57]:
get_behaviors_string("I am not very sad and very angry.")

NameError: name 'get_behavior_breakdown' is not defined

## Third Model

In [None]:
dic = get_sentiment_breakdown("I am not very sad and very angry.")

In [None]:
df = pd.DataFrame(columns=['negative',
'positive',
'fear',
'anger',
'trust',
'sadness',
'disgust',
'anticip',
'joy',
'surprise'])

In [None]:
df = df.append(dic, ignore_index=True)

In [None]:
df

In [None]:
dic