<a href="https://colab.research.google.com/github/Samar-Agarwal/Detecting-Depression-through-Tweets/blob/main/npl_wids_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical
import re

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data =pd.read_csv("drive/MyDrive/WIDS_NLP_Project/dataset2.csv", on_bad_lines = 'skip')
# Keeping only the neccessary columns
data = data[['Sentiment', 'SentimentText']]

In [5]:
data['SentimentText'] = data['SentimentText'].apply(lambda x: x.lower()) # lowering all alphabets
data['SentimentText'] = data['SentimentText'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))  #using re module for stooping

print(data[ data['Sentiment'] == 1].size)
print(data[ data['Sentiment'] == 0].size)

1580354
1576870


In [6]:
max_fatures = 20000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['SentimentText'].values)
X = tokenizer.texts_to_sequences(data['SentimentText'].values)
X = pad_sequences(X)

In [7]:
import gensim

In [9]:
documents = [_text.split() for _text in data.SentimentText]

In [10]:
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10
w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, 
                                            window=W2V_WINDOW, 
                                            min_count=W2V_MIN_COUNT, 
                                            workers=12)


w2v_model.build_vocab(documents)

In [11]:
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 45138


In [12]:
import tensorflow as tf
tf.test.gpu_device_name()

'/device:GPU:0'

In [13]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 12283898799975946715
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14415560704
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 8730480443657955857
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

In [14]:
%%time
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

CPU times: user 36min 35s, sys: 11.1 s, total: 36min 46s
Wall time: 19min 58s


(488066199, 663205824)

In [15]:
tokenizer = Tokenizer() 
tokenizer.fit_on_texts(data['SentimentText'].values) 
X = tokenizer.texts_to_sequences(data['SentimentText'].values) 
X = pad_sequences(X)

In [16]:
vocab_size = len(tokenizer.word_index)+1
print('Vocab Size is ',vocab_size)

Vocab Size is  820831


In [18]:

embedding_matrix = np.zeros(( vocab_size, W2V_SIZE))
for word , i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
print(embedding_matrix.shape)

(820831, 300)


In [19]:
from keras.models import Sequential
from keras.layers import SpatialDropout1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

lstm_out = 53
model = Sequential()
model.add(Embedding(vocab_size, W2V_SIZE, weights = [ embedding_matrix], input_length = X.shape[1], trainable = False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(2*lstm_out, dropout=0.2, return_sequences=True))
model.add(LSTM(lstm_out, dropout=0.2))
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='val_acc', min_delta=1e-4, patience=5)]
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 41, 300)           246249300 
                                                                 
 spatial_dropout1d (SpatialD  (None, 41, 300)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 41, 106)           172568    
                                                                 
 lstm_1 (LSTM)               (None, 53)                33920     
                                                                 
 dense (Dense)               (None, 2)                 108       
                                                                 
Total params: 246,455,896
Trainable params: 206,596
Non-trainable params: 246,249,300
____________________________________

In [21]:
from sklearn.model_selection import train_test_split
Y = pd.get_dummies(data['Sentiment']).values
X_new, X_del, Y_new, Y_del = train_test_split(X,Y, test_size = 0.3, random_state = 42)
X_train,X_test,Y_train, Y_test =train_test_split(X_new,Y_new, test_size = 0.3, random_state =14 )
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(773519, 41) (773519, 2)
(331509, 41) (331509, 2)


In [22]:
batch_size = 256
epochs = 2
model.fit(X_train, Y_train, batch_size = batch_size, epochs = epochs, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb77a65cb20>

In [23]:
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.38
acc: 0.83


In [24]:
model2 = Sequential((Dense(2,activation='sigmoid')))
model2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
# print(model2.summary())

In [25]:
batch_size = 512
epochs = 7
model2.fit(X_train, Y_train, batch_size = batch_size, epochs = epochs, verbose=1)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7fb86b76e4f0>

In [26]:
score,acc = model2.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 18.61
acc: 0.52


In [72]:
class LogReg:
    """
    Class to represent a logistic regression model.
    """

    def __init__(self, l_rate, epochs, n_features):
        """
        Create a new model with certain parameters.

        :param l_rate: Initial learning rate for model.
        :param epoch: Number of epochs to train for.
        :param n_features: Number of features.
        """
        self.l_rate = l_rate
        self.epochs = epochs
        self.coef = [0.0] * n_features
        self.bias = 0.0

    def sigmoid(self, score, threshold=20.0):
        """
        Prevent overflow of exp by capping activation at 20.

        :param score: A real valued number to convert into a number between 0 and 1
        """
        # print(score)
        # print(abs(score))
        # print(threshold)
        # print(abs(score)>threshold)
        if (abs(score) > threshold):
            score = threshold * sign(score)
        activation = exp(score)
        return activation / (1.0 + activation)

    def predict(self, features):
        """
        Given an example's features and the coefficients, predicts the class.

        :param features: List of real valued features for a single training example.

        :return: Returns the predicted class (either 0 or 1).
        """
        # print(features)
        # print(self.coef)
        # print(self.bias)
        value = sum([features[i]*self.coef[i] for i in range(len(features))]) + self.bias
        #print(value)
        return self.sigmoid(value)

    def sg_update(self, features, label):
        """
        Computes the update to the weights based on a predicted example.

        :param features: Features to train on.
        :param label: Corresponding label for features.
        """
        yhat = self.predict(features)
        e = label - yhat
        self.bias = self.bias + self.l_rate * e * yhat * (1-yhat)
        for i in range(len(features)):
          # if (i==0) : print(e,label,yhat)
          self.coef[i] = (self.coef[i]) + (self.l_rate * e * yhat * (1-yhat) * features[i])
          # if (i==0) : print(self.coef[0])
        return

    def train(self, X, y):
        """
        Computes logistic regression coefficients using stochastic gradient descent.

        :param X: Features to train on.
        :param y: Corresponding label for each set of features.

        :return: Returns a list of model weight coefficients where coef[0] is the bias.
        """
        for epoch in range(self.epochs):
            print(epoch)
            for features, label in zip(X, y):
                self.sg_update(features, label)
        return self.bias, self.coef

In [38]:
def get_accuracy(y_bar, y_pred):
    """
    Computes what percent of the total testing data the model classified correctly.

    :param y_bar: List of ground truth classes for each example.
    :param y_pred: List of model predicted class for each example.

    :return: Returns a real number between 0 and 1 for the model accuracy.
    """
    correct = 0
    for i in range(len(y_bar)):
        if y_bar[i] == y_pred[i]:
            correct += 1
    accuracy = (correct / len(y_bar)) * 100.0
    return accuracy

In [69]:
features1 = X_train[0]
print(sum([features1[i] for i in range(len(features1))]))
values = []
for yu in Y_train :
  val =2*yu[0] + yu[1]
  if val not in values:
    values.append(val)
print(values)

66854
[1, 2]


In [73]:
# Logistic Model
from math import exp
from numpy import sign
LEARNING_RATE = 0.1
EPOCHS= 10
logreg = LogReg(LEARNING_RATE, EPOCHS, len(X_train[0]))
bias_logreg, weights_logreg = logreg.train(X_train, Y_train[:,0])
y_logistic = [round(logreg.predict(example)) for example in Y_test]

# Compare accuracies
accuracy_logistic = get_accuracy(y_logistic, Y_test[:,0])
print('Logistic Regression Accuracy: {:0.3f}'.format(accuracy_logistic))

0
1
2
3
4
5
6
7
8
9
Logistic Regression Accuracy: 50.044


In [77]:
import pickle
with open('drive/MyDrive/lstm_model', 'wb') as picklefile:
    pickle.dump(model,picklefile)



In [75]:
import pickle
with open('linear_model', 'wb') as picklefile:
    pickle.dump(model2,picklefile)

In [76]:
print(bias_logreg,weights_logreg)

-0.024538496135856923 [0.0, 0.0, -5.522860497728673e-17, -4.507503836992401e-15, -1.1564020211398037e-14, -3.9438746814280475e-13, 2.2077016292431663e-05, 0.00019435812850934314, 0.0048836384107786065, 0.012021993157683432, 0.03245119341177947, 0.1005207748376716, 0.20376911579444526, 0.44439708454755306, 0.7726877952584621, 1.2211557660842074, 390.16433560895405, -7430.4351085862345, 18.40399834222648, 2.1153038502163395, 1.8718687046016873, -0.5330140973605441, 3.0835451788008785, 34.82985676329183, -0.8317367157702044, -8330.414863373106, -1.9525043643131057, -489.40509383174884, -22.548532228592016, 27.17408721116987, 4.462190492484859, -273.1285287492319, -152.40843124726487, -98.69090426873194, -379.53653401765683, -2167.9197595616633, -37.30633060181296, -38.99135168414841, -33.26552020102096, -39.36143439545421, 62.06236692692242]


In [None]:
# -0.024538496135856923 [0.0, 0.0, -5.522860497728673e-17, -4.507503836992401e-15, -1.1564020211398037e-14, -3.9438746814280475e-13, 2.2077016292431663e-05, 0.00019435812850934314, 0.0048836384107786065, 0.012021993157683432, 0.03245119341177947, 0.1005207748376716, 0.20376911579444526, 0.44439708454755306, 0.7726877952584621, 1.2211557660842074, 390.16433560895405, -7430.4351085862345, 18.40399834222648, 2.1153038502163395, 1.8718687046016873, -0.5330140973605441, 3.0835451788008785, 34.82985676329183, -0.8317367157702044, -8330.414863373106, -1.9525043643131057, -489.40509383174884, -22.548532228592016, 27.17408721116987, 4.462190492484859, -273.1285287492319, -152.40843124726487, -98.69090426873194, -379.53653401765683, -2167.9197595616633, -37.30633060181296, -38.99135168414841, -33.26552020102096, -39.36143439545421, 62.06236692692242]