In [1]:
import numpy as np
import pandas as pd

In [3]:
# Location of the tweet CSV on GitHub; reading it requires network access.
DATASET_URL = "https://raw.githubusercontent.com/laxmimerit/twitter-suicidal-intention-dataset/master/twitter-suicidal_data.csv"

df = pd.read_csv(DATASET_URL)
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


In [4]:
# Class balance: number of rows for each value of the `intention` target column.
intention_counts = df['intention'].value_counts()
intention_counts

0    5121
1    3998
Name: intention, dtype: int64

<h3>Preprocessing</h3>

In [6]:
import re
def clean_tweets(text):
    """Strip Twitter-specific noise from a single tweet.

    The steps run in this order:
      1. URLs (any run starting with ``http``) and the whitespace after them
         are replaced by a single space.
      2. ``@mentions`` are removed (letters, digits and underscores).
      3. The ``#`` symbol is removed; the hashtag word itself is kept.
      4. The retweet marker ``RT`` followed by whitespace is removed, but only
         when ``RT`` starts a token.
      5. Any character repeated three or more times in a row is collapsed to a
         single occurrence (``"sooooo"`` -> ``"so"``).

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Non-string input (for example ``None`` or the float
        NaN that pandas uses for an empty cell) yields ``''`` instead of
        raising ``TypeError`` inside ``re.sub``.
    """
    if not isinstance(text, str):
        return ''

    # Raw strings throughout: '\S' and '\s' in a normal string literal are
    # invalid escape sequences and trigger a warning on recent Python versions.
    text = re.sub(r'http\S+\s*', ' ', text)  # Remove URLs
    # Include '_' so a handle like @john_doe does not leave "_doe" behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove @ mentions
    text = re.sub(r'#', '', text)  # Remove the hashtag symbol
    # \b keeps words that merely end in "RT" (e.g. "START now") intact.
    text = re.sub(r'\bRT\s+', '', text)  # Remove the retweet marker
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # Collapse 3+ repeated characters

    return text

In [7]:
# Run the cleaner over every tweet; the cleaned text overwrites the raw column.
df['tweet'] = df['tweet'].apply(clean_tweets)

In [8]:
# Spot-check the first five rows now that the tweet column has been cleaned.
df.head(n=5)

Unnamed: 0,tweet,intention
0,my life is meaningless i just want to end my l...,1
1,muttering i wanna die to myself daily for a fe...,1
2,work slave i really feel like my only purpose ...,1
3,i did something on the 2 of october i overdose...,1
4,i feel like no one cares i just want to die ma...,1


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn import metrics
# TF-IDF over *character* n-grams: analyzer='char' tokenizes character by
# character, so ngram_range=(1, 3) produces runs of 1, 2 and 3 characters
# (not single words / word pairs / word triples).
# max_features=20000 caps the vocabulary at 20000 n-grams.
char_tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3), max_features=20000)
linear_svm = LinearSVC()

# Step names are part of the pipeline's interface (named_steps, parameter
# names), so they are kept exactly as they were.
text_clf = Pipeline(steps=[
    ('word_vectorizer', char_tfidf),
    ('clf', linear_svm),
])

print("Feature completed .....")

Feature completed .....


In [30]:
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['intention'],
    test_size=0.2,
    random_state=0,
)
# Report the size of the training split, then the test split.
for split in (X_train, X_test):
    print(split.shape)

(7295,)
(1824,)


In [31]:
# Pipeline.fit returns the fitted pipeline itself, so training and predicting on
# the held-out tweets can be chained.
prediction = text_clf.fit(X_train, y_train).predict(X_test)

In [32]:
# Fetch the classifier from the pipeline by its step name. The bare name `clf`
# is not defined anywhere in this notebook, so using it raises NameError on a
# fresh kernel (Restart & Run All).
classifier = text_clf.named_steps['clf']
# Per-class precision / recall / F1 on the held-out split.
report = metrics.classification_report(y_test, prediction)
print("\n Classification report for classifier %s:\n%s\n" % (classifier, report))


 Classification report for classifier LinearSVC():
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      1060
           1       0.91      0.92      0.91       764

    accuracy                           0.93      1824
   macro avg       0.92      0.93      0.93      1824
weighted avg       0.93      0.93      0.93      1824




In [38]:
# Manual sanity check: run one hand-written text through the fitted pipeline
# and show the predicted `intention` label.
test_ans = '''I don't feel like I can stand anymore stress in my life. 
                I want to end it for once and all.'''
# predict() returns one label per input; unpack the single element.
(predicted_label,) = text_clf.predict([test_ans])
predicted_label

1

In [48]:
# Manual sanity check with a cheerful sentence; shows the predicted `intention`
# label for it.
test_ans_pos = '''I wish tremendous joy and good health to you and your family.'''
# predict() returns one label per input; unpack the single element.
(predicted_label_pos,) = text_clf.predict([test_ans_pos])
predicted_label_pos

0

In [35]:
import pickle

In [42]:
# Persist the whole fitted pipeline (vectorizer + classifier) to a file in the
# current working directory.
# NOTE: only unpickle this file from a trusted source; loading a pickle can
# execute arbitrary code.
with open('suicide_tendency_model', mode='wb') as model_file:
    pickle.dump(text_clf, model_file)