**Data preprocessing**

In [1]:
#Import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix

In [2]:
#Adding label to audio_text

label_data = pd.read_csv('../input/audio-data-label/Data_label.csv', encoding="UTF-8")
participant = list(label_data["Participant_ID"])
depressed = list(label_data["Depressed"])
label_data = {participant[i]:depressed[i] for i in range(len(participant))}

data = pd.read_csv('../input/model2-speech-to-text/audio_text.csv', encoding="UTF-8")
#Drop the rows with null columns
data = data.dropna()

for x, y in label_data.items():
  data.loc[data["Person"] == x,"Depressed"] = y

data.head()


Unnamed: 0,Person,Audio_text,Depressed
0,335,Doorbell right now. Okay sounds good. Yes. I'm...,1.0
1,424,So where's my little button. Are you bringing ...,0.0
2,399,Funny just move this i'm going to move this be...,0.0
3,331,Alright. Kraken. Thank you. Yes. Okay. Connect...,0.0
4,364,Therapy like this instead of fur. With a live ...,0.0


**Text analysis with NLTK and Vader Sentiment analyzer**

In [3]:
#Text cleaning
data['Clean_text'] = data['Audio_text'].str.lower()

#removing punctuations
data['Clean_text'] = data['Clean_text'].str.translate(str.maketrans('','',string.punctuation))

#Remove stopwords (words that are so commonly used that they carry very little useful information)
stop_words = stopwords.words('english')+['the', 'a', 'an', 'i', 'he', 'she', 'they', 'to', 'of', 'it', 'from']

def remove_stopwords(stopWords, review_txt):
    newtxt = ' '.join([word for word in review_txt.split() if word not in stopWords])
    return newtxt
data['Clean_text'] = [remove_stopwords(stop_words,x) for x in data['Clean_text']]
data.head()

Unnamed: 0,Person,Audio_text,Depressed,Clean_text
0,335,Doorbell right now. Okay sounds good. Yes. I'm...,1.0,doorbell right okay sounds good yes im okay im...
1,424,So where's my little button. Are you bringing ...,0.0,wheres little button bringing okay alright oka...
2,399,Funny just move this i'm going to move this be...,0.0,funny move im going move behind arm agree okay...
3,331,Alright. Kraken. Thank you. Yes. Okay. Connect...,0.0,alright kraken thank yes okay connecticut well...
4,364,Therapy like this instead of fur. With a live ...,0.0,therapy like instead fur live going put nfl te...


In [4]:
#Vader Sentiment analyzer

sentiment_model = SentimentIntensityAnalyzer()
negativity_score = []

for text in data['Clean_text']:
    #The compound value reflects the overall sentiment ranging from -1 being very negative and +1 being very positive.
    
    sentiment_results = sentiment_model.polarity_scores(text)   # {neg,neu,pos,compound}
    negativity_score.append(sentiment_results['neg'])
    
data['Negativity_score'] = negativity_score

data.head()

Unnamed: 0,Person,Audio_text,Depressed,Clean_text,Negativity_score
0,335,Doorbell right now. Okay sounds good. Yes. I'm...,1.0,doorbell right okay sounds good yes im okay im...,0.122
1,424,So where's my little button. Are you bringing ...,0.0,wheres little button bringing okay alright oka...,0.106
2,399,Funny just move this i'm going to move this be...,0.0,funny move im going move behind arm agree okay...,0.096
3,331,Alright. Kraken. Thank you. Yes. Okay. Connect...,0.0,alright kraken thank yes okay connecticut well...,0.128
4,364,Therapy like this instead of fur. With a live ...,0.0,therapy like instead fur live going put nfl te...,0.07


**LeXmo: The first Python package for classifying emotions in English texts**

In [5]:
pip install LeXmo

Collecting LeXmo
  Downloading LeXmo-0.1.4-py3-none-any.whl (6.9 kB)
Installing collected packages: LeXmo
Successfully installed LeXmo-0.1.4
[0mNote: you may need to restart the kernel to use updated packages.


In [6]:
from LeXmo import LeXmo

sadness_score = []
fear_score = []
anger_score = []

for text in data['Clean_text']:
    result = LeXmo.LeXmo(text)
    sadness_score.append(result['sadness'])
    fear_score.append(result['fear'])
    anger_score.append(result['anger'])
    
data['Sadness_score'] = sadness_score
data['Fear_score'] = fear_score
data['Anger_score'] = anger_score

data.head()    

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Person,Audio_text,Depressed,Clean_text,Negativity_score,Sadness_score,Fear_score,Anger_score
0,335,Doorbell right now. Okay sounds good. Yes. I'm...,1.0,doorbell right okay sounds good yes im okay im...,0.122,0.031148,0.016393,0.019672
1,424,So where's my little button. Are you bringing ...,0.0,wheres little button bringing okay alright oka...,0.106,0.032164,0.035088,0.019493
2,399,Funny just move this i'm going to move this be...,0.0,funny move im going move behind arm agree okay...,0.096,0.029361,0.027634,0.029361
3,331,Alright. Kraken. Thank you. Yes. Okay. Connect...,0.0,alright kraken thank yes okay connecticut well...,0.128,0.028846,0.032051,0.027244
4,364,Therapy like this instead of fur. With a live ...,0.0,therapy like instead fur live going put nfl te...,0.07,0.020652,0.017391,0.015761


**Model**

In [7]:
X =  data['Clean_text']
X.append(data['Negativity_score'])
X.append(data['Fear_score'])
X.append(data['Anger_score'])

Y = data['Depressed']

X_train, X_test, Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=20)


features = CountVectorizer()
X_train = features.fit_transform(X_train)
X_test = features.transform(X_test)


models=[]
models.append(('LR',LogisticRegression(solver='liblinear')))
models.append(('DTR',DecisionTreeClassifier()))
models.append(('KNN',KNeighborsClassifier()))
# models.append(('GNB',GaussianNB()))
models.append(('RFC',RandomForestClassifier(n_estimators = 50, criterion = 'entropy')))
best_accuracy = 0
Model = ""
for name,model in models:
    scores = cross_val_score(model,X_train,Y_train,cv=5)
    print(name,end='\n')
    print('Cross validation:',np.mean(scores))
    model.fit(X_train,Y_train)
    acc = model.score(X_test,Y_test)
    print('Test accuracy:',acc )
    Y_pred = model.predict(X_test)
    print('Report\n',classification_report(Y_test,Y_pred))
    print('confusion_matrix\n', confusion_matrix(Y_test,Y_pred))
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model
        Model = name
print('\n',Model)        
print('Best accuracy',best_accuracy)   
Y_pred = model.predict(X_test)
print('Report\n',classification_report(Y_test,Y_pred))
print('confusion_matrix\n', confusion_matrix(Y_test,Y_pred))

LR
Cross validation: 0.7384615384615384
Test accuracy: 0.6491228070175439
Report
               precision    recall  f1-score   support

         0.0       0.67      0.89      0.76        36
         1.0       0.56      0.24      0.33        21

    accuracy                           0.65        57
   macro avg       0.61      0.56      0.55        57
weighted avg       0.63      0.65      0.60        57

confusion_matrix
 [[32  4]
 [16  5]]
DTR
Cross validation: 0.7769230769230768
Test accuracy: 0.7192982456140351
Report
               precision    recall  f1-score   support

         0.0       0.74      0.86      0.79        36
         1.0       0.67      0.48      0.56        21

    accuracy                           0.72        57
   macro avg       0.70      0.67      0.68        57
weighted avg       0.71      0.72      0.71        57

confusion_matrix
 [[31  5]
 [11 10]]
KNN
Cross validation: 0.6769230769230768
Test accuracy: 0.6491228070175439
Report
               precision 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
