In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [6]:
from sklearn.metrics import confusion_matrix,classification_report

In [7]:
# Load the combined posts file; the path is relative to the notebook's working directory.
df=pd.read_csv(filepath_or_buffer='../Datasets/combined.csv')

In [8]:
# Peek at the first five rows to sanity-check the load.
df.iloc[:5]

Unnamed: 0,subreddit,selftext,title,name,is_sucide
0,SuicideWatch,We've been seeing a worrying increase in pro-s...,New wiki on how to avoid accidentally encourag...,t3_cz6nfd,1
1,SuicideWatch,"If you want to recognise an occasion, please d...",Reminder: Absolutely no activism of any kind i...,t3_d2370x,1
2,SuicideWatch,All I want is to be a normal person who can ju...,"Every time I feel optimistic, every time I thi...",t3_gt6nhf,1
3,SuicideWatch,Original post: https://www.reddit.com/r/Suicid...,Wife cheated update,t3_gtdvnt,1
4,SuicideWatch,Basically the title. Every time I or some one ...,How come I can't choose suicide as an option? ...,t3_gt6tdi,1


In [9]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1932 entries, 0 to 1931
Data columns (total 5 columns):
subreddit    1932 non-null object
selftext     1883 non-null object
title        1932 non-null object
name         1932 non-null object
is_sucide    1932 non-null int64
dtypes: int64(1), object(4)
memory usage: 75.6+ KB


In [10]:
# Number of missing values in each column.
df.isnull().sum()

subreddit     0
selftext     49
title         0
name          0
is_sucide     0
dtype: int64

In [11]:
# (row count, column count) of the loaded frame.
len(df.index),len(df.columns)

(1932, 5)

## Cleaning Text

In [12]:
# Replace missing post bodies with a placeholder string so every row can be
# passed to the text cleaner. The result is assigned back to the column instead
# of calling fillna(inplace=True) on the selected column: under pandas
# Copy-on-Write that chained in-place call modifies a temporary object and
# leaves df unchanged.
df['selftext']=df['selftext'].fillna('no text')

In [13]:
# Single lemmatizer instance, referenced as a global by clean_text.
wordnet=WordNetLemmatizer()

In [14]:
def clean_text(a):
    """Normalise a raw post into a space-joined string of lemmatized tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case the result, tokenize it with NLTK, drop English stopwords, and
    lemmatize the remaining tokens with the module-level ``wordnet`` lemmatizer.

    Parameters
    ----------
    a : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces (an empty
        string when no token survives).
    """
    # Build the stopword lookup once per call, as a set. Calling
    # stopwords.words('english') inside the comprehension would rebuild the
    # whole list for every token and scan it linearly.
    stop_words=set(stopwords.words('english'))
    letters_only=re.sub('[^a-zA-Z]',' ',a).lower()
    tokens=nltk.word_tokenize(letters_only)
    # Stopwords are filtered on the raw token, before lemmatization.
    kept=[wordnet.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(kept)

In [15]:
# Clean every post body. This overwrites the raw text in the same column, so
# re-running the cell would clean already-cleaned text.
df['selftext']=df['selftext'].map(clean_text)

In [16]:
# First five rows again, now showing the cleaned selftext column.
df.iloc[:5]

Unnamed: 0,subreddit,selftext,title,name,is_sucide
0,SuicideWatch,seeing worrying increase pro suicide content s...,New wiki on how to avoid accidentally encourag...,t3_cz6nfd,1
1,SuicideWatch,want recognise occasion please offering extra ...,Reminder: Absolutely no activism of any kind i...,t3_d2370x,1
2,SuicideWatch,want normal person thing need instead immense ...,"Every time I feel optimistic, every time I thi...",t3_gt6nhf,1
3,SuicideWatch,original post http www reddit com r suicidewat...,Wife cheated update,t3_gtdvnt,1
4,SuicideWatch,basically title every time one know talk suici...,How come I can't choose suicide as an option? ...,t3_gt6tdi,1


In [17]:
# Show one cleaned post in full (row label 25) to eyeball the cleaning result.
df.loc[25,'selftext']

'felt like one heavy day weight ignored couldnt distract way chest felt heavy limb help looked looking way hide end day people finally end fairly crowded place could see edge felt like jumping saw swear ignored though carefully observed place best part somebody offered sleep medicine spit first cup drank suddenly went near held fake doesnt want caught place anything dont want settle lie im awake limb hurt carried dream feel heaviness enveloping hold people id rather holding accidentally gave wound recently knife cooking ive wanting purpose help hope dream better tomorrow'

## Embedding With Bag Of Words and TFIDF 

In [18]:
# Bag-of-words vectorizer counting unigrams, bigrams and trigrams.
vect1=CountVectorizer(ngram_range=(1,3))

In [19]:
# TF-IDF vectorizer over the same unigram-to-trigram range.
vect2=TfidfVectorizer(ngram_range=(1,3))

In [20]:
# Learn the n-gram vocabulary from every cleaned post and build the count matrix.
# NOTE(review): this fit runs on the full corpus before the train/test split,
# so the vocabulary also reflects rows that later land in the test set.
# Consider fitting on the training rows only and transforming the rest.
mat1=vect1.fit_transform(df['selftext'])

In [21]:
# Learn vocabulary and IDF weights from every cleaned post and build the TF-IDF matrix.
# NOTE(review): this fit runs on the full corpus before the train/test split,
# so the IDF weights are influenced by rows that later land in the test set.
# Consider fitting on the training rows only and transforming the rest.
mat2=vect2.fit_transform(df['selftext'])

## Checking Accuracy with Bag of Words

In [22]:
# Hold out 20% of the bag-of-words rows for the final evaluation.
x_train,x_test,y_train,y_test=train_test_split(
    mat1,
    df['is_sucide'],
    test_size=0.2,
    random_state=42,
)

In [23]:
# Five seeded random 80/20 shuffles, shared by every model comparison that receives cv.
cv=ShuffleSplit(
    random_state=42,
    test_size=0.2,
    n_splits=5,
)

In [24]:
# Random forest on bag-of-words features. The forest is seeded so the
# cross-validation scores are reproducible from one run to the next.
cross_val_score(RandomForestClassifier(random_state=42),x_train,y_train,cv=cv,scoring='accuracy')

array([0.62135922, 0.65695793, 0.62135922, 0.60194175, 0.63106796])

In [25]:
# Multinomial naive Bayes on bag-of-words features, scored by accuracy on each shuffle.
cross_val_score(
    estimator=MultinomialNB(),
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)

array([0.69902913, 0.70226537, 0.65048544, 0.66666667, 0.64401294])

In [26]:
# Gradient-boosted trees on bag-of-words features, scored by accuracy on each shuffle.
cross_val_score(
    estimator=XGBClassifier(),
    X=x_train,
    y=y_train,
    scoring='accuracy',
    cv=cv,
)

array([0.64401294, 0.67961165, 0.62459547, 0.64401294, 0.66990291])

## Checking Accuracy With TFIDF

In [27]:
# Same seeded 80/20 split, applied to the TF-IDF matrix.
x_train1,x_test1,y_train1,y_test1=train_test_split(
    mat2,
    df['is_sucide'],
    test_size=0.2,
    random_state=42,
)

In [28]:
# Random forest on TF-IDF features. The forest is seeded so the
# cross-validation scores are reproducible from one run to the next.
cross_val_score(RandomForestClassifier(random_state=42),x_train1,y_train1,cv=cv,scoring='accuracy')

array([0.62783172, 0.66019417, 0.60841424, 0.60517799, 0.58899676])

In [29]:
# Multinomial naive Bayes on TF-IDF features, scored by accuracy on each shuffle.
cross_val_score(
    estimator=MultinomialNB(),
    X=x_train1,
    y=y_train1,
    scoring='accuracy',
    cv=cv,
)

array([0.71197411, 0.70550162, 0.63106796, 0.64724919, 0.58252427])

In [30]:
# Gradient-boosted trees on TF-IDF features, scored by accuracy on each shuffle.
cross_val_score(
    estimator=XGBClassifier(),
    X=x_train1,
    y=y_train1,
    scoring='accuracy',
    cv=cv,
)

array([0.66019417, 0.69255663, 0.5987055 , 0.66343042, 0.65372168])

### MultinomialNB with CountVectorizer gives the best cross-validation accuracy

In [33]:
# Final classifier: multinomial naive Bayes with default hyper-parameters.
model=MultinomialNB()

In [34]:
# Train on the bag-of-words training split.
model.fit(X=x_train,y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [35]:
# Predict labels for the held-out bag-of-words rows.
prediction=model.predict(X=x_test)

In [36]:
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test,y_pred=prediction)

array([[125,  63],
       [ 46, 153]])

In [37]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_true=y_test,y_pred=prediction))

              precision    recall  f1-score   support

           0       0.73      0.66      0.70       188
           1       0.71      0.77      0.74       199

    accuracy                           0.72       387
   macro avg       0.72      0.72      0.72       387
weighted avg       0.72      0.72      0.72       387



### Predicting With New Values

In [38]:
# Sample post expressing depressive thoughts; the expected prediction is class 0 ("Depression").
text1='''I don't know, it's like this unexplainable feeling. 
Like this physical feeling in my heart that pulsates throughout my whole being. 
Whenever I hear happy music or watch a cheesy movie where everybody's happy I just can't help but feel sad. 
Same thing if I see something from when I was a kid or something similar. 
I'm thinking the root of it may be I've felt like I've missed out on life or didn't make the most of it? 
I don't know. I'm just hoping it's not just me.'''

In [44]:
# vect1 was fitted on text that had been through clean_text, so the new text
# must get the same cleaning before it is vectorized. Otherwise its n-grams are
# built from un-normalised tokens (stopwords, unlemmatized forms) that do not
# line up with the learned vocabulary.
x=vect1.transform([clean_text(text1)])

In [45]:
# Label 0 is reported as "Depression"; any other label as "Sucide".
pr=model.predict(x)
print("Depression" if pr[0]==0 else "Sucide")

Depression


In [46]:
# Sample post expressing suicidal thoughts; the expected prediction is the non-zero class ("Sucide" in the printed output).
text2=''' It's been a year. An entire fucking year that I gave myself another chance. 
Or more like gave him another chance. Last year in October, I came close to killing myself. 
I had thought that nothing would ever get better and that there was absolutely no point living in a life that was FOR SURE never going to get better.
Yet, I gave God another chance. That morning, a pack of coyotes howled during the sunset as the last song in my playlist of songs I want played in my funeral had ended.
After hearing the coyotes, I thought it was a sign. A sign that God would make it better. 
'''

In [47]:
# vect1 was fitted on text that had been through clean_text, so the new text
# must get the same cleaning before it is vectorized. Otherwise its n-grams are
# built from un-normalised tokens (stopwords, unlemmatized forms) that do not
# line up with the learned vocabulary.
x=vect1.transform([clean_text(text2)])

In [48]:
# Label 0 is reported as "Depression"; any other label as "Sucide".
pr=model.predict(x)
print("Depression" if pr[0]==0 else "Sucide")

Sucide


## Saving The Model

In [49]:
import pickle

In [50]:
# Persist the fitted CountVectorizer so the same vocabulary can be reused later.
with open('count_vectorizer.pkl','wb') as out_file:
    out_file.write(pickle.dumps(vect1))

In [51]:
# Persist the trained naive Bayes classifier next to the vectorizer.
with open('multinomialNB.pkl','wb') as out_file:
    out_file.write(pickle.dumps(model))

### Conclusion:
* The model achieved a highest accuracy of 72%.
* It predicted the two sample texts correctly, but the model's accuracy needs to be improved.
* Accuracy could be improved by adding more training text and cleaning the CSV file more thoroughly.