In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
# Read the `datd_train`, `datd_test` and `datd_rand` CSV files into DataFrames
# (in that order) and bind them to `train`, `test` and `rand`.
train, test, rand = map(
    pd.read_csv,
    (
        '/kaggle/input/depression-and-anxiety-in-twitter-id/datd_train.csv',
        '/kaggle/input/depression-and-anxiety-in-twitter-id/datd_test.csv',
        '/kaggle/input/depression-and-anxiety-in-twitter-id/datd_rand.csv',
    ),
)

# Text preprocessing

In [None]:
import re

# Compiled once at definition time rather than on every call.
#
# The symbol ranges are listed individually on purpose: a single span running
# from U+24C2 up to U+1F251 would also match every CJK ideograph, kana, Hangul
# syllable and fullwidth form, silently deleting ordinary (non-emoji) text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumeric & ideographic supplements
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2"             # circled M
    "\U000025A0-\U000025FF"  # Geometric Shapes
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002900-\U0000297F"  # Supplemental Arrows-B
    "\U00002B00-\U00002BFF"  # Miscellaneous Symbols and Arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, part alternation mark, circled ideographs
    "\U0000FE00-\U0000FE0F"  # variation selectors (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def deEmojify(text):
    """Return `text` with emoji and pictographic symbols removed.

    Every run of characters matching `_EMOJI_PATTERN` is replaced by the empty
    string. Letters of any script (Latin, CJK, Hangul, ...) are left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string without the matched emoji/symbol characters.
    """
    return _EMOJI_PATTERN.sub('', text)

def process(text):
    """Clean a single tweet before tokenization.

    Steps, in order: strip `https://t.co/...` links, turn newlines into
    spaces, drop ASCII digits, then remove emoji via `deEmojify`.

    Args:
        text: The raw tweet text (a `str`).

    Returns:
        The cleaned text.
    """
    # remove links (raw string so `\S` is a regex class, and the dot in
    # "t.co" is escaped so it only matches a literal ".")
    text = re.sub(r"https://t\.co/\S*", "", text)

    # replace newlines with a space; deleting them outright would glue the
    # last word of one line to the first word of the next
    text = re.sub(r"\n", " ", text)

    # remove digits
    text = re.sub(r"[0-9]", "", text)

    # remove emojis
    text = deEmojify(text)

    return text

In [None]:
# Clean every text in the train and test frames; echo the corpus size and the
# second cleaned sample of each so the preprocessing can be eyeballed.
train_corpus = [process(raw_text) for raw_text in train['text']]
print(len(train_corpus), train_corpus[1], sep='\n')

test_corpus = [process(raw_text) for raw_text in test['text']]
print(len(test_corpus), test_corpus[1], sep='\n')

# Features are the cleaned texts; targets are the `label` column as plain lists.
X_train, y_train = train_corpus, train['label'].tolist()
X_test, y_test = test_corpus, test['label'].tolist()
X_rand, y_rand = [process(raw_text) for raw_text in rand['text']], rand['label'].tolist()

# Quick look: length and content of the first training text, then the number
# of training texts and training labels.
print(len(X_train[0]), X_train[0], len(X_train), len(y_train), sep='\n')

In [None]:
# Upper bound on the vocabulary learned by the text vectorizer.
VOCAB_SIZE = 4000

# Prefer the stable `tf.keras.layers.TextVectorization`; fall back to the
# older `layers.experimental.preprocessing` location only when the stable
# name is not available, so the cell runs on both old and new TensorFlow.
_TextVectorization = getattr(tf.keras.layers, 'TextVectorization', None)
if _TextVectorization is None:
    _TextVectorization = tf.keras.layers.experimental.preprocessing.TextVectorization

# Learn the vocabulary from the cleaned training texts only.
encoder = _TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(X_train)

In [None]:
# Text classifier: raw string -> token ids (encoder) -> 100-d embeddings ->
# bidirectional LSTM -> dense ReLU layer -> single sigmoid output.
model_bilstm = tf.keras.Sequential([
    encoder,
    # mask_zero=True is passed so that index 0 is treated as a mask value by the layers that follow.
    Embedding(input_dim=len(encoder.get_vocabulary()), output_dim=100, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Besides accuracy, track recall computed at a 0.9 decision threshold.
model_bilstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy', tf.keras.metrics.Recall(name='recall', thresholds=0.9)],
)

# Train and validate on the *preprocessed* texts (X_train / X_test): the
# encoder vocabulary was adapted on them and predictions are later made on
# preprocessed text, so fitting on the raw `text` column would make training
# and inference inputs disagree.
history = model_bilstm.fit(
    np.asarray(X_train, dtype=object),
    np.asarray(y_train),
    validation_data=(np.asarray(X_test, dtype=object), np.asarray(y_test)),
    epochs=7,
    batch_size=128,
)

In [None]:
def plot_graphs(history, string):
    """Plot one metric and its validation counterpart against the epoch index.

    Args:
        history: Object whose `.history` mapping holds per-epoch value lists
            under the keys `string` and `'val_' + string`.
        string: Name of the metric to plot (also used as the y-axis label).
    """
    val_key = 'val_' + string
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

def evaluate(y_predict, y_actual, threshold):
    """Compute binary-classification metrics at a given decision threshold.

    A prediction counts as positive when its score is strictly greater than
    `threshold`; a label counts as positive when it is truthy.

    Args:
        y_predict: Sequence of predicted scores. Each item may be a plain
            number or a single-element array.
        y_actual: Sequence of ground-truth labels, indexable by position and
            at least as long as `y_predict`.
        threshold: Score above which a prediction is considered positive.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`. A metric whose denominator
        is zero (e.g. precision when nothing is predicted positive, or any
        metric on empty input) is reported as 0.0 instead of raising
        `ZeroDivisionError`.
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 so threshold sweeps never crash.
        return numerator / denominator if denominator else 0.0

    tp = tn = fn = fp = 0
    for k, v in enumerate(y_predict):
        predict = bool(v > threshold)
        actual = bool(y_actual[k])

        if actual and predict:
            tp += 1
        elif actual:
            fn += 1
        elif predict:
            fp += 1
        else:
            tn += 1

    accuracy = _ratio(tp + tn, len(y_predict))
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * tp, 2 * tp + fp + fn)
    return accuracy, precision, recall, f1

def plot_metrics(pred, actual):
    """Sweep decision thresholds, plot the metrics, and report the best F1.

    For 100 thresholds from 0.5 to 0.995 (step 0.005), calls `evaluate` and
    plots accuracy, precision, recall and F1 against the threshold. Then
    prints the metrics at the first threshold that achieves the maximum F1.

    Args:
        pred: Predicted scores, passed through to `evaluate`.
        actual: Ground-truth labels, passed through to `evaluate`.
    """
    # linspace yields an exact, fixed-length grid; arange with a float step
    # can gain or lose an element through rounding.
    X = np.linspace(0.5, 0.995, 100)

    # One (accuracy, precision, recall, f1) tuple per threshold, then
    # transposed into one list per metric.
    scores = [evaluate(pred, actual, x) for x in X]
    A, P, R, F = (list(column) for column in zip(*scores))

    for curve in (A, P, R, F):
        plt.plot(X, curve)
    plt.xlabel("threshold")
    plt.ylabel("score")
    plt.legend(['accuracy', 'precision', 'recall', 'F1'])
    plt.show()

    # list.index returns the first (lowest) threshold that reaches the max F1.
    best = F.index(max(F))
    print("Accuracy: {:.3g}\nPrecision: {:.3g}\nRecall: {:.3g}\nF1 score: {:.3g}\nThreshold: {:.5g}".format(A[best], P[best], R[best], F[best], X[best]))

In [None]:
# Draw the training-vs-validation curve for each tracked quantity, one figure each.
for metric_name in ('accuracy', 'recall', 'loss'):
    plot_graphs(history, metric_name)

In [None]:
# Score the cleaned `rand` texts with the trained model, then sweep decision
# thresholds against their labels.
rand_scores = model_bilstm.predict(X_rand)
plot_metrics(rand_scores, y_rand)