In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [4]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier

In [53]:
from sklearn.metrics import confusion_matrix,classification_report

In [13]:
df=pd.read_csv('../Datasets/combined.csv')

In [14]:
df.head()

Unnamed: 0,subreddit,selftext,title,name,is_sucide
0,SuicideWatch,We've been seeing a worrying increase in pro-s...,New wiki on how to avoid accidentally encourag...,t3_cz6nfd,1
1,SuicideWatch,"If you want to recognise an occasion, please d...",Reminder: Absolutely no activism of any kind i...,t3_d2370x,1
2,SuicideWatch,,"sleep just isn't sleep anymore, it's an escape.",t3_gosr7c,1
3,SuicideWatch,"I just want to go to sleep, and never wake up",Fuck me,t3_gomrql,1
4,SuicideWatch,Seriously going through some existential stuff...,What is the point of living in this corrupt ce...,t3_gow0e9,1


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1954 entries, 0 to 1953
Data columns (total 5 columns):
subreddit    1954 non-null object
selftext     1871 non-null object
title        1954 non-null object
name         1954 non-null object
is_sucide    1954 non-null int64
dtypes: int64(1), object(4)
memory usage: 76.5+ KB


In [16]:
df.isna().sum()

subreddit     0
selftext     83
title         0
name          0
is_sucide     0
dtype: int64

## Cleaning Text

In [17]:
df['selftext'].fillna('no text',inplace=True)

In [19]:
wordnet=WordNetLemmatizer()

In [20]:
def clean_text(a):
    x=re.sub('[^a-zA-Z]',' ',a)
    x=x.lower()
    text=nltk.word_tokenize(x)
    text=[wordnet.lemmatize(word) for word in text if word not in stopwords.words('english')]
    text=' '.join(text)
    return text

In [21]:
df['selftext']=df['selftext'].apply(clean_text)

In [22]:
df.head()

Unnamed: 0,subreddit,selftext,title,name,is_sucide
0,SuicideWatch,seeing worrying increase pro suicide content s...,New wiki on how to avoid accidentally encourag...,t3_cz6nfd,1
1,SuicideWatch,want recognise occasion please offering extra ...,Reminder: Absolutely no activism of any kind i...,t3_d2370x,1
2,SuicideWatch,text,"sleep just isn't sleep anymore, it's an escape.",t3_gosr7c,1
3,SuicideWatch,want go sleep never wake,Fuck me,t3_gomrql,1
4,SuicideWatch,seriously going existential stuff live society...,What is the point of living in this corrupt ce...,t3_gow0e9,1


In [23]:
df['selftext'][25]

'sit rain wet cold demon head fight gold watch tiny wet droplet fall wind guide shove rain fall pitter patter hit ground splatter oh wish could matter little anyone care let set free people tell worth thing keeping gravity tie earth evil thought constantly churning people tell need go school keep learning focus feel hopeless always distracted others reacted whirling hurricane inside brain present everlasting pain breath breath thinking death one day might commit one day might quit everyday hide behind mask hoping someone might ask hey okay oh mark leg arm well self harm society driven u finally see made walk open pleading silence everyone seems ignore thinking others give guidance sit rain wet cold demon head fight gold'

## Embedding With Bag Of Words and TFIDF 

In [32]:
vect1=CountVectorizer(ngram_range=(1,3))

In [33]:
vect2=TfidfVectorizer(ngram_range=(1,3))

In [34]:
mat1=vect1.fit_transform(df['selftext'])

In [35]:
mat2=vect2.fit_transform(df['selftext'])

## Checking Accuracy with Bag of Words

In [38]:
x_train,x_test,y_train,y_test=train_test_split(mat1,df['is_sucide'],random_state=42,test_size=0.2)

In [40]:
cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=42)

In [42]:
cross_val_score(RandomForestClassifier(),x_train,y_train,cv=cv,scoring='accuracy')

array([0.62619808, 0.6485623 , 0.62619808, 0.63578275, 0.5942492 ])

In [43]:
cross_val_score(MultinomialNB(),x_train,y_train,cv=cv,scoring='accuracy')

array([0.68370607, 0.67731629, 0.63897764, 0.63897764, 0.59744409])

In [44]:
cross_val_score(XGBClassifier(),x_train,y_train,cv=cv,scoring='accuracy')

array([0.63258786, 0.7028754 , 0.65814696, 0.66453674, 0.62300319])

## Checking Accuracy With IFIDF

In [45]:
x_train1,x_test1,y_train1,y_test1=train_test_split(mat2,df['is_sucide'],random_state=42,test_size=0.2)

In [46]:
cross_val_score(RandomForestClassifier(),x_train1,y_train1,cv=cv,scoring='accuracy')

array([0.6485623 , 0.63897764, 0.66773163, 0.61341853, 0.58785942])

In [47]:
cross_val_score(MultinomialNB(),x_train1,y_train1,cv=cv,scoring='accuracy')

array([0.61661342, 0.62300319, 0.63897764, 0.65495208, 0.62619808])

In [48]:
cross_val_score(XGBClassifier(),x_train1,y_train1,cv=cv,scoring='accuracy')

array([0.66453674, 0.68051118, 0.67092652, 0.64536741, 0.63258786])

### XGBOOST with TFIDF Vectorizer Gives a good Accuracy 

In [58]:
model=XGBClassifier()

In [59]:
model.fit(x_train1,y_train1)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [60]:
prediction=model.predict(x_test)

In [61]:
confusion_matrix(y_test,prediction)

array([[128,  60],
       [ 90, 113]])

In [62]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.59      0.68      0.63       188
           1       0.65      0.56      0.60       203

    accuracy                           0.62       391
   macro avg       0.62      0.62      0.62       391
weighted avg       0.62      0.62      0.62       391



### Predicting With New Values

In [57]:
#Depression Thought
text1='''I don't know, it's like this unexplainable feeling. 
Like this physical feeling in my heart that pulsates throughout my whole being. 
Whenever I hear happy music or watch a cheesy movie where everybody's happy I just can't help but feel sad. 
Same thing if I see something from when I was a kid or something similar. 
I'm thinking the root of it may be I've felt like I've missed out on life or didn't make the most of it? 
I don't know. I'm just hoping it's not just me.'''

In [64]:
x=vect2.transform([text1])

In [69]:
pr=model.predict(x)
if pr[0]==0:
    print("Depression")
else:
    print("Sucide")

Depression


In [70]:
#Sucide Thought
text2=''' It's been a year. An entire fucking year that I gave myself another chance. 
Or more like gave him another chance. Last year in October, I came close to killing myself. 
I had thought that nothing would ever get better and that there was absolutely no point living in a life that was FOR SURE never going to get better.
Yet, I gave God another chance. That morning, a pack of coyotes howled during the sunset as the last song in my playlist of songs I want played in my funeral had ended.
After hearing the coyotes, I thought it was a sign. A sign that God would make it better. 
'''

In [71]:
x=vect2.transform([text2])

In [72]:
pr=model.predict(x)
if pr[0]==0:
    print("Depression")
else:
    print("Sucide")

Sucide


## Saving The Model

In [73]:
import pickle

In [74]:
with open('tfidf_vectorizer.pkl','wb') as f:
    pickle.dump(vect2,f)

In [75]:
with open('xgb.pkl','wb') as f:
    pickle.dump(model,f)

### Conclusion:
* Model achived an highest accuracy of 62% which is less than accuracy achived by LSTM i.e (66.64%)
* It Predicted the output correctly, but model accuracy needs to be improved
* Accuracy can be improved by adding more text and cleaning the csv file properly