# Import required packages

In [124]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Convolution1D
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import seaborn as sns
# Word2vec
import gensim

# Utility
import ssl
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools



In [125]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/santhosh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [126]:
# DATASET
DATASET_COLUMNS = ["message", "label"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [127]:
df = pd.read_csv('d1.csv')
print(df.iloc[:10])

   Unnamed: 0                                            message  label
0           0  The lack of this understanding is a small but ...      1
1           1  i just told my parents about my depression and...      1
2           2  depression is something i don't speak about ev...      1
3           3  Made myself a tortilla filled with pb&j. My de...      1
4           4  @WorldofOutlaws I am gonna need depression med...      1
5           5  my anxiety and my depression fighting over who...      1
6           6  wow she's suddenly cured my depression and gav...      1
7           7  I am officially done with @kanyewest. him, the...      1
8           8  Me: what's wrong?My girl: *looks up at me with...      1
9           9  @AusBorderForce @PeterDutton_MP @shanebazzi Ag...      1


**Preprocessing**

In [128]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [129]:
def preprocess(text, stem=True):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)


In [130]:
print(preprocess("buses", stem = False))

buses


In [131]:
print(df.head())
df.message = df.message.apply(lambda x: preprocess(x))

print("after preprocessing")
print(df.head())


   Unnamed: 0                                            message  label
0           0  The lack of this understanding is a small but ...      1
1           1  i just told my parents about my depression and...      1
2           2  depression is something i don't speak about ev...      1
3           3  Made myself a tortilla filled with pb&j. My de...      1
4           4  @WorldofOutlaws I am gonna need depression med...      1
after preprocessing
   Unnamed: 0                                            message  label
0           0  lack understand small signific part caus anxie...      1
1           1  told parent depress hard get gen x peopl under...      1
2           2  depress someth speak even go also doubl edg sw...      1
3           3  made tortilla fill pb j depress cure olivia 1 ...      1
4           4  gonna need depress med soon rainout spin equil...      1


In [132]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

**TF-IDF**

In [133]:
vectorizer = TfidfVectorizer(use_idf=True)

In [134]:
texts = df.message.to_numpy()
labels = df.label.to_numpy()

In [135]:

# Split the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)

In [136]:
print(x_train.shape, y_train.shape)
print(type(x_train))

(7339,) (7339,)
<class 'numpy.ndarray'>


In [137]:
vectorizer.fit(x_train)

In [138]:
X_train_tfidf = vectorizer.transform(x_train).toarray()
X_test_tfidf = vectorizer.transform(x_test).toarray()

In [139]:
X_train_tfidf.shape

(7339, 10058)

In [140]:
constant_filter = VarianceThreshold(threshold = 0.001)
constant_filter.fit(X_train_tfidf)

x_train_filter = constant_filter.transform(X_train_tfidf)
x_test_filter = constant_filter.transform(X_test_tfidf)


y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

embedding_weights = np.array([vectorizer.idf_])
embedding_weights_filter = constant_filter.transform(embedding_weights)

new_embedding_matrix = np.zeros((2, embedding_weights_filter.shape[1]))
new_embedding_matrix[:-1,:] = embedding_weights_filter
embedding_weights_filter = new_embedding_matrix

max_len = x_train_filter.shape[1]

In [152]:
print(X_train_tfidf.shape, X_test_tfidf.shape)
print(x_train_filter.shape, x_test_filter.shape)
print(y_train.shape, y_test.shape)

print(embedding_weights.shape, embedding_weights_filter.shape)



(7339, 10058) (816, 10058)
(7339, 132) (816, 132)
(8155, 1) (816, 1)
(1, 10058) (2, 132)


In [158]:

embedding_layer = Embedding(input_dim=2, output_dim=max_len, input_length=max_len, weights=[embedding_weights_filter], trainable=False)

In [155]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.message)

vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)
x_train = pad_sequences(tokenizer.texts_to_sequences(df.message), maxlen=max_len)
x_test = pad_sequences(tokenizer.texts_to_sequences(df.message), maxlen=max_len)


encoder = LabelEncoder()
encoder.fit(df.label.tolist())

y_train = encoder.transform(df.label.tolist())

y_train = y_train.reshape(-1,1)

print(x_train.shape, y_train.shape)

Total words 10408
(8155, 132) (8155, 1)


In [160]:
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer="adam",
              metrics=['accuracy'])

In [157]:
model.summary()

Model: "sequential_11"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 132, 132)          264       
                                                                 
 lstm_6 (LSTM)               (None, 128)               133632    
                                                                 
 dense_26 (Dense)            (None, 1)                 129       
                                                                 
Total params: 134,025
Trainable params: 133,761
Non-trainable params: 264
_________________________________________________________________


(8155, 132)

In [161]:
history = model.fit(x_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/8


InvalidArgumentError: Graph execution error:

Detected at node 'sequential_12/embedding_8/embedding_lookup' defined at (most recent call last):
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 595, in run_forever
      self._run_once()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/base_events.py", line 1881, in _run_once
      handle._run()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2961, in run_cell
      result = self._run_cell(
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3016, in _run_cell
      result = runner(coro)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3221, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3400, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/var/folders/mk/pyhcyg_97b58xlkt1n3p6xw00000gn/T/ipykernel_33587/403336601.py", line 1, in <module>
      history = model.fit(x_train, y_train,
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
      y_pred = self(x, training=True)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/training.py", line 561, in __call__
      return super().__call__(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/sequential.py", line 413, in call
      return super().call(inputs, training=training, mask=mask)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/functional.py", line 511, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/functional.py", line 668, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/engine/base_layer.py", line 1132, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/keras/layers/core/embedding.py", line 208, in call
      out = tf.nn.embedding_lookup(self.embeddings, inputs)
Node: 'sequential_12/embedding_8/embedding_lookup'
indices[704,126] = 3284 is not in [0, 2)
	 [[{{node sequential_12/embedding_8/embedding_lookup}}]] [Op:__inference_train_function_48196]

In [119]:
loss, accuracy = model.evaluate(x_test_filter, y_test, batch_size=1)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.67059326171875
Test Accuracy: 0.6088154315948486


In [120]:
labels_pred = model.predict(x_test_filter)
labels_pred = np.round(labels_pred.flatten())
print(classification_report(y_test, labels_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       142
           1       0.61      1.00      0.76       221

    accuracy                           0.61       363
   macro avg       0.30      0.50      0.38       363
weighted avg       0.37      0.61      0.46       363



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
print(len(y_train[y_train == 0]))
print(len(y_train[y_train == 1]))

1172
2092


In [122]:
def predict(text, constant_filter = None):
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            tokens.append(stemmer.stem(token))
    text = " ".join(tokens)
    text = np.array([text])
    text = vectorizer.transform(text)
    text_filter = constant_filter.transform(text)
    return model.predict(text_filter.toarray())

In [123]:
predict("depressive life is so hateful, i wanna suicide", constant_filter)



array([[0.6336455]], dtype=float32)