Importing Libraries

In [None]:
!pip install neattext
!pip install Keras-Preprocessing
import pickle
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as plx
import seaborn as sns
from tqdm import tqdm
from wordcloud import WordCloud
import neattext.functions as nfx

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import tensorflow as tf
import keras
from keras.layers import Embedding,Dense,LSTM,Bidirectional,GlobalMaxPooling1D,Input,Dropout
from keras.callbacks import EarlyStopping,ReduceLROnPlateau
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

warnings.filterwarnings('ignore')

In [None]:
!pip install Keras-Preprocessing

Importing Dataset

In [None]:
# Load the dataset from a CSV file in the working directory and preview its first rows.
data=pd.read_csv('Suicide_Mentalhealth_Detection.csv')
data.head()

In [None]:
# Number of rows per label in the 'class' column.
data['class'].value_counts()

In [None]:
# Distinct 'class' labels as an array (the index of value_counts()).
data['class'].value_counts().index.values

Splitting Dataset Into Train And Test

In [None]:
# Hold out 20% of the rows as the test split; random_state pins the shuffle so the split is repeatable.
train_data,test_data=train_test_split(data,test_size=0.2,random_state=10)

In [None]:
# Distinct 'class' labels present in the training split.
train_data['class'].value_counts().index.values

In [None]:
# Bar chart of how many training rows fall in each class.
# The counts are computed once, and the bar colours/legend entries are taken from the
# actual class labels so they can never disagree with the order of the bars.
class_counts=train_data['class'].value_counts()
plx.bar(x=class_counts.index.values,y=class_counts.values,
        color=class_counts.index.values)

Data Cleaning

In [None]:
def clean_text(text):
    """Normalise an iterable of strings and measure each result.

    Every item is lower-cased, stripped of special characters and then of
    stopwords (both via ``neattext.functions``). Progress is shown with tqdm.

    Args:
        text: iterable of strings to clean.

    Returns:
        A ``(cleaned_text, text_length)`` tuple of two parallel lists: the
        cleaned strings, and the number of whitespace-separated words left in
        each cleaned string.
    """
    cleaned_text=[]
    text_length=[]
    for raw in tqdm(text):
        without_specials=nfx.remove_special_characters(raw.lower())
        normalised=nfx.remove_stopwords(without_specials)
        cleaned_text.append(normalised)
        # Word count is taken after cleaning, not from the raw text.
        text_length.append(len(normalised.split()))
    return cleaned_text,text_length

In [None]:
# Display the training split.
train_data

In [None]:
# Display the 'text' column of the training split (this column is what gets passed to clean_text).
train_data.text

In [None]:
# Clean both splits with the same function; each call also returns the per-text word counts.
cleaned_train_text,train_text_length=clean_text(train_data.text)
cleaned_test_text,test_text_length=clean_text(test_data.text)

In [None]:
# Build the vocabulary from the cleaned training text only; the test text is not used for fitting.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(cleaned_train_text)

In [None]:
# Table of the tokenizer's word counts (one row per word), sorted with the highest count first.
word_freq=pd.DataFrame(tokenizer.word_counts.items(),columns=['word','count']).sort_values(by='count',ascending=False)

Preprocessing

In [None]:
# Convert the cleaned training text to integer sequences with the fitted tokenizer,
# then bring every sequence to length 50 (this length must match the model's Input(shape=(50,))).
train_text_seq=tokenizer.texts_to_sequences(cleaned_train_text)
train_text_pad=pad_sequences(train_text_seq,maxlen=50)


# Same two steps for the test text, reusing the tokenizer fitted on the training text.
test_text_seq=tokenizer.texts_to_sequences(cleaned_test_text)
test_text_pad=pad_sequences(test_text_seq,maxlen=50)

In [None]:
# Encode the class labels as integers: the mapping is learned from the training labels
# and the same mapping is then applied to the test labels.
lbl_target=LabelEncoder()
train_output=lbl_target.fit_transform(train_data['class'])
test_output=lbl_target.transform(test_data['class'])

In [None]:
# Display the encoded training labels.
train_output

Embeddings

In [None]:

# Load the pre-built GloVe embedding lookup from a pickle file on the mounted Drive path.
# NOTE(review): the absolute path is tied to one Colab Drive layout, and pickle.load can
# execute arbitrary code — only load this file if it comes from a trusted source.
# Presumably a dict-like mapping word -> 300-d vector (it is queried with .get(word)) — confirm.
with open('/content/drive/MyDrive/Colab Notebooks/glove.840B.300d.pkl', 'rb') as fp:
    glove_embedding = pickle.load(fp)

In [None]:
# Number of distinct words the tokenizer learned.
v=len(tokenizer.word_index)

# One 300-d row per tokenizer index, plus one extra row (v+1 rows in total).
embedding_matrix=np.zeros((v+1,300), dtype=float)
for word,idx in tokenizer.word_index.items():
    embedding_vector=glove_embedding.get(word)
    # Words missing from the GloVe lookup keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[idx]=embedding_vector

In [None]:
# Both callbacks use their default monitored quantity (validation loss).
# Stop after 5 epochs without improvement and roll the model back to the weights of the
# best epoch; without restore_best_weights the model would keep the weights of the last,
# worse epoch it trained on before stopping.
early_stop=EarlyStopping(patience=5,restore_best_weights=True)
# Lower the learning rate after 3 epochs without improvement.
reducelr=ReduceLROnPlateau(patience=3)

Model Training And Testing

In [None]:
# Sequence classifier over 50-token inputs that ends in a single sigmoid unit.
model=Sequential()
model.add(Input(shape=(50,)))
# Embedding rows are initialised from embedding_matrix and frozen (trainable=False).
model.add(Embedding(v+1,300,weights=[embedding_matrix],trainable=False))
# return_sequences=True keeps the LSTM output at every timestep so the pooling layer can take the max over time.
model.add(LSTM(20,return_sequences=True))
model.add(GlobalMaxPooling1D())
model.add(Dense(256,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
# NOTE(review): momentum=0.09 is far below the commonly used 0.9 — confirm this is not a typo.
model.compile(optimizer=keras.optimizers.SGD(0.1,momentum=0.09),loss='binary_crossentropy',metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Train for up to 20 epochs; early stopping and LR reduction are driven by validation loss.
# Validation uses 10% of the TRAINING data (validation_split) instead of the test split:
# letting the test split steer early stopping / LR scheduling would leak it into model
# selection and make the test metrics reported afterwards optimistic.
r=model.fit(train_text_pad,train_output,validation_split=0.1,
            epochs=20,batch_size=256,callbacks=[early_stop,reducelr])

In [None]:
# Display the encoded test labels.
test_output

In [None]:
# Raw model outputs for the padded test inputs (sigmoid activations from the final layer).
y_pred = model.predict(test_text_pad)

In [None]:
# Turn the model outputs into hard predictions using a 0.5 cut-off.
y_pred = (y_pred > 0.5)
# Multiplying the boolean array by 1 converts it to integers (0/1).
y_pred = 1*y_pred

y_pred

In [None]:
# Flatten the 2x2 confusion matrix into its four cells, in scikit-learn's binary order (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(test_output, y_pred).ravel()
# Share of actual negatives (encoded label 0) that were predicted as positive.
false_positive_rate = fp / (fp + tn)

print("False Positive Rate:", false_positive_rate)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose thresholded prediction equals the encoded label.
accuracy = accuracy_score(test_output, y_pred)

print(accuracy)

In [None]:
# Number of predictions produced for the test split.
len(y_pred)


In [None]:
# Number of encoded test labels, for comparison with the number of predictions.
len(test_output)

In [None]:
# Save the trained model to 'model.h5' in the working directory.
model.save('model.h5')

In [None]:
# Keras' Tokenizer has no .save() method (the original call raised AttributeError).
# Serialise its configuration and vocabulary to a JSON file instead; it can be restored
# later with keras.preprocessing.text.tokenizer_from_json.
with open('tokenizer.json','w',encoding='utf-8') as f:
    f.write(tokenizer.to_json())

In [None]:
# Also write the model object to 'model.pkl' with pickle.
# NOTE(review): whether a Keras model pickles cleanly depends on the Keras/TensorFlow
# version; model.save() is the documented way to persist a model — confirm this copy is needed.
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Write the fitted tokenizer to 'token.pkl' with pickle.
with open('token.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# A single example sentence (wrapped in a list, as clean_text iterates over its input).
text = ["i am feeling lonely"]

In [None]:
# Apply the same cleaning, tokenization and padding (maxlen=50) that were used for the training data.
cleaned_text,text_length=clean_text(text)
text_seq=tokenizer.texts_to_sequences(cleaned_text)
text_pad=pad_sequences(text_seq,maxlen=50)

In [None]:
# Raw model output for the sample sentence.
model.predict(text_pad)

In [None]:
# Translate the model output for the sample back into the original class name.
# The output is thresholded at 0.5 (the same cut-off used for the test predictions) and
# decoded with inverse_transform. Calling fit_transform here would instead RE-FIT the
# encoder on the predicted probabilities, discarding the learned class mapping.
sample_pred=(model.predict(text_pad) > 0.5).astype(int).ravel()
lbl_target.inverse_transform(sample_pred)

In [None]:
# Sequential models have no .accuracy() method (the original call raised AttributeError).
# evaluate() returns the loss followed by the compiled metrics, here [loss, accuracy].
test_loss,test_accuracy=model.evaluate(test_text_pad,test_output)
test_accuracy