In [238]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from nltk.corpus import stopwords
stopwords = stopwords.words('english')



# First Model

## Unwell Dataset

In [239]:
unwell = pd.read_excel("./00_unwell.xlsx")

In [240]:
def finalize(mer, jerome, loyd):
    
    li = [mer, jerome, loyd]
    
    def most_frequent(List): 
        counter = 0
        num = List[0] 

        for i in List: 
            curr_frequency = List.count(i) 
            if(curr_frequency> counter): 
                counter = curr_frequency 
                num = i 

        return num 

    return most_frequent(li)

In [241]:
unwell["category"] = unwell.apply(lambda x: finalize(x["mer"], x["jerome"], x["loyd"]), axis=1)

In [242]:
unwell = unwell[unwell.category == "unwell"]

In [243]:
def clean_text(df, column_name):
    df['cleaned_text'] = df[column_name].fillna('')
    df['cleaned_text'] = df['cleaned_text'].str.lower()
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|rt|\d+', '')
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'^\s+|\s+$', '') 
    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w not in (stopwords)]))
    return df

In [244]:
unwell = clean_text(unwell, "sentence")

  df['cleaned_text'] = df['cleaned_text'].str.replace(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|rt|\d+', '')
  df['cleaned_text'] = df['cleaned_text'].str.replace(r'^\s+|\s+$', '')


In [245]:
unwell = unwell[["cleaned_text", "category"]]

In [246]:
unwell.head()

Unnamed: 0,cleaned_text,category
5,f abused different people young age,unwell
6,grew dad laying top woke staed continued close...,unwell
7,would call mommy ask come wipe bathroom,unwell
9,never anything said things stayed away,unwell
10,seventh grade became depressed staed self harming,unwell


## Well Dataset

In [250]:
well = pd.read_csv("./00_cleaned_hm.csv")

In [251]:
well = well[well.ground_truth_category.notnull()]

In [252]:
well["category"] = well.ground_truth_category.apply(lambda x: "well")

In [253]:
well = well[0:len(unwell)].copy()

In [254]:
well = well[["cleaned_hm", "category"]]

In [255]:
well = well.rename(columns={"cleaned_hm": "cleaned_text"})

In [256]:
well.head()

Unnamed: 0,cleaned_text,category
3,We had a serious talk with some friends of our...,well
5,I meditated last night.,well
24,My grandmother start to walk from the bed afte...,well
32,I picked my daughter up from the airport and w...,well
42,when i received flowers from my best friend,well


## Merging

In [257]:
data = unwell.append(well)

In [258]:
data.head()

Unnamed: 0,cleaned_text,category
5,f abused different people young age,unwell
6,grew dad laying top woke staed continued close...,unwell
7,would call mommy ask come wipe bathroom,unwell
9,never anything said things stayed away,unwell
10,seventh grade became depressed staed self harming,unwell


In [259]:
len(data)

4752

## Train Test Split

In [260]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(data['cleaned_text'],data['category'],test_size=0.3)

In [261]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [262]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['cleaned_text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [263]:
# print(Tfidf_vect.vocabulary_)

In [264]:
# print(Train_X_Tfidf)

## Training

In [265]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)

predictions_NB = Naive.predict(Test_X_Tfidf)

print("Naive Bayes Accuracy Score: ", accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score:  93.12762973352034


In [266]:
print(classification_report(Test_Y, predictions_NB))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93       718
           1       0.90      0.97      0.93       708

    accuracy                           0.93      1426
   macro avg       0.93      0.93      0.93      1426
weighted avg       0.93      0.93      0.93      1426



In [267]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)

predictions_SVM = SVM.predict(Test_X_Tfidf)

print("SVM Accuracy Score: ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score:  92.63674614305751


In [268]:
print(classification_report(Test_Y, predictions_SVM))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93       718
           1       0.95      0.90      0.92       708

    accuracy                           0.93      1426
   macro avg       0.93      0.93      0.93      1426
weighted avg       0.93      0.93      0.93      1426



# Second Model