# Import required packages

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM, Convolution1D
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from matplotlib import pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import seaborn as sns
# Word2vec
import gensim

# Utility
import ssl
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools



In [3]:
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
    
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/santhosh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# DATASET
DATASET_COLUMNS = ["message", "label"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [10]:
df = pd.read_csv('d1.csv')
print(df.iloc[:10])

   Unnamed: 0                                            message  label
0           0  The lack of this understanding is a small but ...      1
1           1  i just told my parents about my depression and...      1
2           2  depression is something i don't speak about ev...      1
3           3  Made myself a tortilla filled with pb&j. My de...      1
4           4  @WorldofOutlaws I am gonna need depression med...      1
5           5  my anxiety and my depression fighting over who...      1
6           6  wow she's suddenly cured my depression and gav...      1
7           7  I am officially done with @kanyewest. him, the...      1
8           8  Me: what's wrong?My girl: *looks up at me with...      1
9           9  @AusBorderForce @PeterDutton_MP @shanebazzi Ag...      1


**Preprocessing**

In [11]:
stop_words = stopwords.words("english")
stemmer = SnowballStemmer("english")

In [12]:
def preprocess(text, stem=True):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)


In [13]:
print(preprocess("buses", stem = False))

buses


In [14]:
print(df.head())
df.message = df.message.apply(lambda x: preprocess(x))

print("after preprocessing")
print(df.head())


   Unnamed: 0                                            message  label
0           0  The lack of this understanding is a small but ...      1
1           1  i just told my parents about my depression and...      1
2           2  depression is something i don't speak about ev...      1
3           3  Made myself a tortilla filled with pb&j. My de...      1
4           4  @WorldofOutlaws I am gonna need depression med...      1
after preprocessing
   Unnamed: 0                                            message  label
0           0  lack understand small signific part caus anxie...      1
1           1  told parent depress hard get gen x peopl under...      1
2           2  depress someth speak even go also doubl edg sw...      1
3           3  made tortilla fill pb j depress cure olivia 1 ...      1
4           4  gonna need depress med soon rainout spin equil...      1


In [16]:
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_accuracy', min_delta=1e-4, patience=5)]

In [None]:
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'b', label='Training acc')
plt.plot(epochs, val_acc, 'r', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'b', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

In [None]:
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
print(score)
print("LSTM\nACCURACY:",score[1])
print("LOSS:",score[0])

In [None]:
labels_pred = model.predict(x_test)
labels_pred = np.round(labels_pred.flatten())
accuracy = accuracy_score(y_test, labels_pred)
print("Accuracy: %.2f%%" % (accuracy*100))

In [None]:
report = classification_report(y_test, labels_pred, output_dict=True)
print(report)

In [None]:
data = [['precision', report['1']['precision']], ['recall', report['1']['recall']], ['accuracy', report['accuracy']], ['f1-score', report['1']['f1-score']]]
df_lstm = pd.DataFrame(data, columns=['cat', 'val'])
df_lstm['source'] = 'lstm'
df_lstm.head()

**TF-IDF**

In [17]:
vectorizer = TfidfVectorizer(use_idf=True)

In [18]:
texts = df.message.to_numpy()
labels = df.label.to_numpy()

In [19]:

# Split the dataset into train and test sets
x_train, x_test, y_train, y_test = train_test_split(texts, labels, test_size=0.1, random_state=42)

In [20]:
print(x_train.shape, y_train.shape)
print(type(x_train))

(7339,) (7339,)
<class 'numpy.ndarray'>


In [21]:
vectorizer.fit(x_train)

In [22]:
X_train_tfidf = vectorizer.transform(x_train).toarray()
X_test_tfidf = vectorizer.transform(x_test).toarray()

In [23]:
X_train_tfidf.shape

(7339, 10058)

In [29]:
constant_filter = VarianceThreshold(threshold = 0.0002)
constant_filter.fit(X_train_tfidf)

x_train_filter = constant_filter.transform(X_train_tfidf)
x_test_filter = constant_filter.transform(X_test_tfidf)


y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

embedding_weights = np.array([vectorizer.idf_])
embedding_weights_filter = constant_filter.transform(embedding_weights)

new_embedding_matrix = np.zeros((2, embedding_weights_filter.shape[1]))
new_embedding_matrix[:-1,:] = embedding_weights_filter
embedding_weights_filter = new_embedding_matrix

max_len = x_train_filter.shape[1]

In [30]:
print(X_train_tfidf.shape, X_test_tfidf.shape)
print(x_train_filter.shape, x_test_filter.shape)
print(y_train.shape, y_test.shape)

print(embedding_weights.shape, embedding_weights_filter.shape)




(7339, 10058) (816, 10058)
(7339, 1040) (816, 1040)
(7339, 1) (816, 1)
(1, 10058) (2, 1040)


In [31]:

embedding_layer = Embedding(input_dim=2, output_dim=max_len, input_length=max_len, weights=[embedding_weights_filter], trainable=False)

In [32]:
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [33]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1040, 1040)        2080      
                                                                 
 lstm_1 (LSTM)               (None, 128)               598528    
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 600,737
Trainable params: 598,657
Non-trainable params: 2,080
_________________________________________________________________


In [34]:
history = model.fit(x_train_filter, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=[x_test_filter, y_test],
                    verbose=1,
                    callbacks=callbacks)

Epoch 1/8


2023-04-21 23:04:05.939758: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8


In [155]:
loss, accuracy = model.evaluate(x_test_filter, y_test, batch_size=1)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.6758608222007751
Test Accuracy: 0.6020846366882324


In [35]:
labels_pred = model.predict(x_test_filter)
labels_pred = np.round(labels_pred.flatten())
print(classification_report(y_test, labels_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       323
           1       0.60      1.00      0.75       493

    accuracy                           0.60       816
   macro avg       0.30      0.50      0.38       816
weighted avg       0.37      0.60      0.46       816



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [161]:
len(y_train[y_train == 0])
len(y_train[y_train == 1])

3676