# Predicting Depression on other datasets

### Imports

In [1]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
from nltk.corpus import stopwords
from nltk import PorterStemmer
import warnings
warnings.filterwarnings("ignore")
import ftfy
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import collections

# Fetch the NLTK resources used by the cleaning step further down:
# the stop-word corpus read via stopwords.words('english'), and the 'punkt'
# tokenizer data (presumably what nltk.word_tokenize relies on - confirm for
# the installed NLTK version).
nltk.download('stopwords')
nltk.download('punkt')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samarth\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load Trained model

In [2]:
# Restore the previously saved Keras model from disk (the path is relative to
# the notebook's working directory) and print its layer summary as a quick
# check that the expected architecture was loaded.
model = load_model('Models/depression_keywords_model.h5')
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 140, 300)          6000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 140, 32)           28832     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 70, 32)            0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 70, 32)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               399600    
_______________________________________

### Constants

In [3]:
# Reproducibility
# Seeds NumPy's global random generator only.
np.random.seed(1234)

# NOTE(review): of the constants below, only MAX_SEQUENCE_LENGTH is referenced
# by the visible cells (in preprocess_tweets); the others look carried over
# from a training notebook - confirm before relying on or removing them.
DEPRES_NROWS = 100000  # number of rows to read from DEPRESSIVE_TWEETS_CSV
RANDOM_NROWS = 100000  # number of rows to read from RANDOM_TWEETS_CSV
MAX_SEQUENCE_LENGTH = 140 # Max tweet size; passed as maxlen when padding sequences
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
TRAIN_SPLIT = 0.6
TEST_SPLIT = 0.2
LEARNING_RATE = 0.1
EPOCHS= 10

## Data Cleaning and Pre-Processing

In [4]:
# Expand Contraction
# Lookup table mapping an English contraction to its expanded form.
# Matching is case-sensitive, so only the exact spellings listed here are expanded.
cList = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you will",
  "you'll've": "you will have",
  "you're": "you are",
  "you've": "you have"
}

# Regex alternation picks the first alternative that matches at a position, so
# the longer contractions must come first; otherwise "can't" would win over
# "can't've" and leave a dangling "'ve". Keys are escaped so they are always
# matched literally.
c_re = re.compile('(%s)' % '|'.join(
    re.escape(key) for key in sorted(cList, key=len, reverse=True)))

def expandContractions(text, c_re=c_re):
    """Replace every contraction found in ``text`` with its expansion.

    Args:
        text: String to process.
        c_re: Compiled pattern whose matches are looked up in ``cList``;
            defaults to the pattern built from the ``cList`` keys.

    Returns:
        ``text`` with each match of ``c_re`` replaced by ``cList[match]``.

    Raises:
        KeyError: If ``c_re`` matches text that is not a key of ``cList``.
    """
    def replace(match):
        return cList[match.group(0)]
    return c_re.sub(replace, text)

In [5]:
def clean_tweets(tweets):
    """Filter and normalise raw tweets into cleaned strings.

    Every item is converted with ``str``. Tweets that begin with a URL, or
    that are 10 characters or shorter, are dropped, so the result may be
    shorter than the input. The remaining tweets have mentions, hashtags,
    emoji markers and pic.twitter.com links removed, text encoding repaired,
    contractions expanded, punctuation stripped, English stop words removed,
    and are finally passed through the Porter stemmer.

    Args:
        tweets: Iterable of tweets (each item is passed through ``str``).

    Returns:
        list of str: The cleaned tweets that passed the filter, in input order.
    """
    # Loop-invariant resources: load the stop-word set and build the stemmer
    # once instead of once per tweet.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    cleaned_tweets = []
    for tweet in tweets:
        tweet = str(tweet)
        # if the tweet starts with a url link then dont append, to avoid news articles
        # (re.match only tests the beginning of the string)
        # also check tweet length, save those > 10 (length of word "depression")
        if re.match(r"(\w+:\/\/\S+)", tweet) is None and len(tweet) > 10:
            #remove hashtag, @mention, emoji and image URLs
            tweet = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|(\#[A-Za-z0-9]+)|(<Emoji:.*>)|(pic\.twitter\.com\/.*)", " ", tweet).split())

            #fix weirdly encoded texts
            tweet = ftfy.fix_text(tweet)

            #expand contraction
            tweet = expandContractions(tweet)

            #remove punctuation
            tweet = ' '.join(re.sub("([^0-9A-Za-z \t])", " ", tweet).split())

            #stop words
            word_tokens = nltk.word_tokenize(tweet)
            filtered_sentence = [w for w in word_tokens if w not in stop_words]
            tweet = ' '.join(filtered_sentence)

            #stemming words
            # NOTE(review): the stemmer is given the whole tweet as a single
            # string rather than one token at a time. Left unchanged because
            # the saved tokenizer/model were presumably fitted on text cleaned
            # this way - confirm before switching to per-token stemming.
            tweet = stemmer.stem(tweet)

            cleaned_tweets.append(tweet)

    return cleaned_tweets

In [6]:
# loading tokenizer
# Restores the pickled tokenizer object that preprocess_tweets uses to turn
# cleaned text into integer sequences.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# this file if it comes from a trusted source.
with open('Tokenizers/tokenizer_keywords.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [7]:
def preprocess_tweets(tweet_list):
    """Turn raw tweets into the padded integer matrix fed to the model.

    The tweets are cleaned with ``clean_tweets``, converted to integer
    sequences by the loaded ``tokenizer``, and padded/truncated to
    ``MAX_SEQUENCE_LENGTH`` columns.

    Args:
        tweet_list: Iterable of raw tweets.

    Returns:
        The array produced by ``pad_sequences`` (one row per tweet kept by
        ``clean_tweets``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(clean_tweets(tweet_list)),
        maxlen=MAX_SEQUENCE_LENGTH,
    )

## COVID-19 Tweets

### Loading Dataset

In [9]:
# Path (relative to the notebook's working directory) of the COVID-19 tweet dump.
COVID_TWEETS_CSV = "Datasets/covid_keywords.csv"
# Only the "tweet" column is read; any other columns in the CSV are ignored.
covid_tweets_df = pd.read_csv(COVID_TWEETS_CSV, usecols = ["tweet"])

In [10]:
# Preview the first rows to confirm the tweet column loaded as expected.
covid_tweets_df.head()

Unnamed: 0,tweet
0,@MrNickKnowles @MrGerryCampbell @policecommand...
1,Health workers at the Uitenhage Provincial hos...
2,Covid-19: Congress releases documentary on Rah...
3,How to Keep Sales Going During COVID-19 Pandem...
4,"Being a ""True Supporter of the MAGA agenda"" is..."



### Cleaning and Pre-processing the Tweets

In [11]:
# Materialise the tweet column as a plain list, then clean, tokenise and pad it.
covid_tweets_arr = list(covid_tweets_df["tweet"])
data = preprocess_tweets(covid_tweets_arr)
# Rows = tweets that survived cleaning, columns = padded sequence length.
print(data.shape)

(244198, 140)


### Predicting Depression in Tweets

In [12]:
# make a prediction
prediction = model.predict_classes(data)

occurrences_depression = np.count_nonzero(prediction == 1)
occurrences_no_dep = np.count_nonzero(prediction == 0)
precentage_of_dep = occurrences_depression/len(prediction) * 100

### Results

In [13]:
# Emit the four summary lines in a single print call, one per line.
print(
    "Total number of Tweets: {}".format(len(prediction)),
    "Number of Depressive Tweets: {}".format(occurrences_depression),
    "Number of Non-Depressive Tweets: {}".format(occurrences_no_dep),
    "Percentage of Depressive Tweets: {}".format(precentage_of_dep),
    sep="\n",
)

Total number of Tweets: 244198
Number of Depressive Tweets: 133109
Number of Non-Depressive Tweets: 111089
Percentage of Depressive Tweets: 54.50863643436883


## Random Tweets

### Loading Dataset

In [14]:
# Path (relative to the notebook's working directory) of the general-purpose tweet dataset.
RANDOM_TWEETS_CSV = 'Datasets/Sentiment Analysis Dataset 2.csv'
# Reads the first four columns, skipping file lines 1..149999 (line 0, the
# header, is kept) and loading at most 400000 rows after that.
# NOTE(review): the preview below shows the first column named "ï»¿ItemID",
# which suggests a UTF-8 byte-order mark decoded as ISO-8859-1 - confirm
# before relying on that column name.
random_tweets_df = pd.read_csv(RANDOM_TWEETS_CSV, encoding = "ISO-8859-1", usecols = range(0,4), skiprows = range(1, 150000) ,nrows = 400000)

In [15]:
# Preview the first rows to confirm the expected columns were loaded.
random_tweets_df.head()

Unnamed: 0,ï»¿ItemID,Sentiment,SentimentSource,SentimentText
0,150011,0,Sentiment140,@ESoPINK Oh you have swine flu! You poor baby!...
1,150012,1,Sentiment140,@esoteric_vae *nods* No favourites
2,150013,0,Sentiment140,@esoteric_vae I can beat you on that front.. w...
3,150014,0,Sentiment140,"@esoteric_vae Poor you Am thinking of you, al..."
4,150015,1,Sentiment140,"@esotericchords iyaaa, i haven't heard from yo..."


### Cleaning and Pre-processing the Tweets

In [16]:
# Materialise the tweet text column as a plain list, then clean, tokenise and pad it.
random_tweets_arr = list(random_tweets_df['SentimentText'])
data = preprocess_tweets(random_tweets_arr)
# Rows = tweets that survived cleaning, columns = padded sequence length.
print(data.shape)

(399701, 140)


### Predicting Depression in Tweets

In [17]:
# make a prediction
prediction = model.predict_classes(data)

occurrences_depression = np.count_nonzero(prediction == 1)
occurrences_no_dep = np.count_nonzero(prediction == 0)
precentage_of_dep = occurrences_depression/len(prediction) * 100

### Results

In [18]:
# Emit the four summary lines in a single print call, one per line.
print(
    "Total number of Tweets: {}".format(len(prediction)),
    "Number of Depressive Tweets: {}".format(occurrences_depression),
    "Number of Non-Depressive Tweets: {}".format(occurrences_no_dep),
    "Percentage of Depressive Tweets: {}".format(precentage_of_dep),
    sep="\n",
)

Total number of Tweets: 399701
Number of Depressive Tweets: 5376
Number of Non-Depressive Tweets: 394325
Percentage of Depressive Tweets: 1.345005391530169
