In [1]:
import warnings
warnings.filterwarnings('ignore')

#libries for importing and performing operations on data
import numpy as np
import pandas as pd

# Machine Learning Libraries
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,AdaBoostClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.metrics import classification_report , confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

import pickle
import string

#libraries for Text Processing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import PorterStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data = pd.read_csv('/content/Suicide_Detection.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,2,Ex Wife Threatening SuicideRecently I left my ...,suicide
1,3,Am I weird I don't get affected by compliments...,non-suicide
2,4,Finally 2020 is almost over... So I can never ...,non-suicide
3,8,i need helpjust help me im crying so hard,suicide
4,9,"I’m so lostHello, my name is Adam (16) and I’v...",suicide


# Data Preprocessing

In [4]:
data.shape

(232074, 3)

In [5]:
df = data.sample(n=10000, random_state=42)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 74414 to 224640
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   text        10000 non-null  object
 2   class       10000 non-null  object
dtypes: int64(1), object(2)
memory usage: 312.5+ KB


In [7]:
df.drop(columns = 'Unnamed: 0',inplace=True)

In [8]:
df.head()

Unnamed: 0,text,class
74414,I Don't know?7? Months self harm free and the ...,suicide
149516,I HAVE TO START BECOMING RICH I HAVE TO START ...,non-suicide
12484,"A poem (haiku) for u/Me-Game-Dev hi, hello hel...",non-suicide
14043,I've honestly got no idea what to do anymore.I...,suicide
30673,Do you ever just cry? Like you just think abou...,non-suicide


# Text Preprocessing

In [9]:
df['text']= df['text'].str.lower()

In [10]:
df['text'] = df['text'].str.replace(r'[^\w\s]+', '',regex = True)

In [11]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [12]:
df['text'] = df['text'].apply(lambda x:nltk.word_tokenize(x))

In [13]:
ps = PorterStemmer()
df['text'] = df['text'].apply(lambda x : [ps.stem(i) for i in x])

In [14]:
df['text']=df['text'].apply(lambda x : ' '.join(x))

In [15]:
df.head()

Unnamed: 0,text,class
74414,dont know7 month self harm free urg get strong...,suicide
149516,start becom rich start compani becom 16 afford...,non-suicide
12484,poem haiku umegamedev hi hello hello stop fuck...,non-suicide
14043,ive honestli got idea anymoreit feel everyon f...,suicide
30673,ever cri like think unfair life cri cant cri e...,non-suicide


In [16]:
# Savecleaned dataset.
df.to_csv('cleaned_data.csv')

In [17]:
dfnew = pd.read_csv('cleaned_data.csv')
dfnew.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,74414,dont know7 month self harm free urg get strong...,suicide
1,149516,start becom rich start compani becom 16 afford...,non-suicide
2,12484,poem haiku umegamedev hi hello hello stop fuck...,non-suicide
3,14043,ive honestli got idea anymoreit feel everyon f...,suicide
4,30673,ever cri like think unfair life cri cant cri e...,non-suicide


In [18]:
ind = dfnew[dfnew['text'].isnull()].index

In [19]:
df.iloc[ind]

Unnamed: 0,text,class
102482,,suicide


In [20]:
dfnew.dropna(inplace=True)

# Machine Learning - Model Selection

In [21]:
x,y = dfnew['text'],dfnew['class']

## TF-IDF Vectorizer

In [22]:
vectorizer = TfidfVectorizer(min_df=50,max_features=5000)
x =  vectorizer.fit_transform(x).toarray()

In [23]:
# Save the model
with open('tfidf.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [24]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.30,random_state=5)

In [25]:
X_train.shape,X_test.shape

((6999, 1320), (3000, 1320))

## Naive Bayes (Voting Classifier)

In [26]:
nb = GaussianNB()
nb2 = BernoulliNB()
nb3 = MultinomialNB()
VotingClassifiers = VotingClassifier(estimators=[('GaussianNB', nb),('BernoulliNB',nb2), ('MultinomialNB', nb3)], voting = 'soft')
VotingClassifiers.fit(X_train, y_train)
print('Training score:',VotingClassifiers.score(X_train, y_train))
print('Testing score:',VotingClassifiers.score(X_test,y_test))

Training score: 0.899271324474925
Testing score: 0.8753333333333333


In [27]:
y_act=y_test
y_pred=VotingClassifiers.predict(X_test)
print(classification_report(y_act,y_pred))

              precision    recall  f1-score   support

 non-suicide       0.88      0.88      0.88      1542
     suicide       0.87      0.87      0.87      1458

    accuracy                           0.88      3000
   macro avg       0.88      0.88      0.88      3000
weighted avg       0.88      0.88      0.88      3000



## Gradient Boosting

In [28]:
model3 = RandomizedSearchCV(GradientBoostingClassifier(),{"learning_rate": range(3,5),
                "max_depth":[200],"max_features":range(6,10,2),
                 "n_estimators":[10]},random_state=8,n_jobs=-1)
model3.fit(X_train,y_train)
print('Training score:',model3.score(X_train,y_train))
print('Testing score:',model3.score(X_test,y_test))
model3.best_params_

Training score: 0.7601085869409916
Testing score: 0.738


{'n_estimators': 10, 'max_features': 8, 'max_depth': 200, 'learning_rate': 4}

In [29]:
#confusion matrix and classification report
y_act=y_test
y_pred=model3.predict(X_test)
print(classification_report(y_act,y_pred))

              precision    recall  f1-score   support

 non-suicide       0.79      0.67      0.73      1542
     suicide       0.70      0.81      0.75      1458

    accuracy                           0.74      3000
   macro avg       0.74      0.74      0.74      3000
weighted avg       0.74      0.74      0.74      3000



## XG Boost

In [30]:
model = XGBClassifier( eval_metric='map',max_depth=200,n_estimators=70,learning_rate=1.99)
model.fit(X_train,y_train.replace({"non-suicide":0,'suicide':1}))
print('Training score:',model.score(X_train,y_train.replace({"non-suicide":0,'suicide':1})))
print('Testing score:',model.score(X_test,y_test.replace({"non-suicide":0,'suicide':1})))

Training score: 0.8555507929704244
Testing score: 0.7883333333333333


In [31]:
#matrix
y_act = y_test.replace({"non-suicide":0,'suicide':1})
y_pred = model.predict(X_test)
print(classification_report(y_act,y_pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80      1542
           1       0.81      0.74      0.77      1458

    accuracy                           0.79      3000
   macro avg       0.79      0.79      0.79      3000
weighted avg       0.79      0.79      0.79      3000



# Logistic Regression

In [32]:
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

y_pred = logistic_regression.predict(X_test)


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

 non-suicide       0.89      0.93      0.91      1542
     suicide       0.92      0.88      0.90      1458

    accuracy                           0.91      3000
   macro avg       0.91      0.90      0.91      3000
weighted avg       0.91      0.91      0.91      3000



# Label Encoding

In [46]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)



In [47]:
print(set(y_train_encoded))


{0, 1}


#CNN

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



tokenizer = Tokenizer()
tokenizer.fit_on_texts(dfnew['text'])
X_sequence = tokenizer.texts_to_sequences(dfnew['text'])
X_padded = pad_sequences(X_sequence)             # Padding sequences to make of same length
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Creating Model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=X_padded.shape[1]))
model.add(Conv1D(64, 3, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

# Compiling model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fitting of the model
model.fit(X_train, y_train_encoded, epochs=5, batch_size=64, validation_split=0.1)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7b12e75f8970>

In [49]:
# save the Model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(logistic_regression, f)

In [50]:
def preprocess(inp):
    inp = inp.lower() #convert to lower case
    inp = inp.replace(r'[^\w\s]+', '') #remove punctuations
    inp = [word for word in inp.split() if word not in (stop_words)] #tokenize the sentence
    inp = ' '.join([ps.stem(i) for i in inp]) #stremming
    inputToModel = vectorizer.transform([inp]).toarray() #transform to vector form
    return inputToModel

In [51]:
def app(input_text):
    # Define the input text box
    print('Input : ',input_text) #take input from user
    processed_array = preprocess(input_text) #preprocess the text
    predict = VotingClassifiers.predict(processed_array) #Model prediction
    print('Output : ', predict[0])

In [52]:
app('I am fetched up with my life i do not want to live anymore')

Input :  I am fetched up with my life i do not want to live anymore
Output :  suicide


In [53]:
app('poem haiku umegamedev hi hello hello stop fuck.')

Input :  poem haiku umegamedev hi hello hello stop fuck.
Output :  non-suicide


In [54]:

import sklearn
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


stop_words = stopwords.words('english')
# better file handling needed
with open('tfidf.pkl', 'rb') as f:
    tfidf = pickle.load(f)

def preprocess(inp):
    inp = inp.lower()
    inp = inp.replace(r'[^\w\s]+', '')
    inp = [word for word in inp.split() if word not in (stop_words)]

    ps = PorterStemmer()
    inp = ' '.join([ps.stem(i) for i in inp])
    inputToModel = tfidf.transform([inp]).toarray()
    return inputToModel

In [55]:
%%writefile app.py
import streamlit as st
import sklearn
import pickle
import sklearn
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

stop_words = stopwords.words('english')
# better file handling needed
with open('tfidf.pkl', 'rb') as f:
    tfidf = pickle.load(f)

def preprocess(inp):
    inp = inp.lower()
    inp = inp.replace(r'[^\w\s]+', '')
    inp = [word for word in inp.split() if word not in (stop_words)]

    ps = PorterStemmer()
    inp = ' '.join([ps.stem(i) for i in inp])
    inputToModel = tfidf.transform([inp]).toarray()
    return inputToModel

# Load the pre-trained model
with open('best_model.pkl', 'rb') as f:
    model = pickle.load(f)

print('Done loading')
def detection(input_text):
    processed_array = preprocess(input_text)
    prediction = model.predict(processed_array)

    if prediction[0] == 'suicide':
        st.write("The text contain references to self harm...\n")
        st.write("As your well wisher I recommend you to talk to a therapist or call Helpline 988.")


    elif prediction[0] == 'non-suicide':
        st.write(" It seems like your statement does not indicate self-harm.")

    else:
        st.write(" I couldn't make a clear prediction. Please provide more information.")


# Set the app title and heading
st.set_page_config(page_title='Suicide-Detection App', layout='wide')
st.title('Suicide Ideation detection')

# Define the input text box
input_text = st.text_input(' Hi, How can I help you?')

# Check if the user has entered a statement
if input_text:
    st.write(f"User - \"{input_text}\"")
    detection(input_text)

# Define the predict button
if st.button('predict'):
    st.write("Thank you for using me.")




Overwriting app.py


In [None]:
!streamlit run app.py & npx localtunnel --port 8501

[?25l[..................] / rollbackFailedOptional: verb npm-session 3eeb1d3d0f350f2[0m[K[..................] / rollbackFailedOptional: verb npm-session 3eeb1d3d0f350f2[0m[K[..................] / rollbackFailedOptional: verb npm-session 3eeb1d3d0f350f2[0m[K[..................] / rollbackFailedOptional: verb npm-session 3eeb1d3d0f350f2[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.196.26.85:8501[0m
[0m
[K[?25hnpx: installed 22 in 3.072s
your url is: https://red-eyes-design.loca.lt
