<a href="https://colab.research.google.com/github/robitussin/mental-illness/blob/main/Multiclass_textclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l09c05_nlp_tweaking_the_model.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l09c05_nlp_tweaking_the_model.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

# Import libraries

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

from sklearn.model_selection import train_test_split

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import os
import re
import string
import random
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Download dataset

Option 1: Download dataset from github repository

In [2]:
url = "https://github.com/robitussin/mental-illness/blob/4719afa5e9c628e8acd9f4f4d424de74a0481bdb/trainingset.csv?raw=true"

# Load training dataset
train_df = pd.read_csv(url)

In [3]:
url = "https://github.com/robitussin/mental-illness/blob/4719afa5e9c628e8acd9f4f4d424de74a0481bdb/validationset.csv?raw=true"

#Load validation dataset
val_df = pd.read_csv(url)

Option 2:
Get dataset directly from google drive

In [4]:
#from google.colab import drive
#drive.mount('/content/drive')

In [5]:
#train_df = pd.read_csv("/content/drive/My Drive/datasets/deeplearning/trainingset.csv")
#print(train_df.info())

In [6]:
#val_df = pd.read_csv("/content/drive/My Drive/datasets/deeplearning/validationset.csv")
#print(val_df.info())

# Explore Data

Merge train and test dataset

In [7]:
data = pd.concat([train_df,val_df])

Check the summary information of the dataset

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15215 entries, 0 to 1487
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          15215 non-null  object
 1   title       15215 non-null  object
 2   post        15215 non-null  object
 3   class_name  15215 non-null  object
 4   class_id    15215 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 713.2+ KB


Check the different types of labels in the dataset

In [9]:
data.class_name.value_counts()

adhd          2713
depression    2698
anxiety       2670
bipolar       2655
ptsd          2249
none          2230
Name: class_name, dtype: int64

Shuffle the data set

In [10]:
data = data.sample(frac=1)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15215 entries, 7692 to 6818
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          15215 non-null  object
 1   title       15215 non-null  object
 2   post        15215 non-null  object
 3   class_name  15215 non-null  object
 4   class_id    15215 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 713.2+ KB


Check the first 5 rows of the data set

In [12]:
data.head()

Unnamed: 0,ID,title,post,class_name,class_id
7692,bc914729-3fe2-4b12-8959-52f6878dd448,"when i found ""driven by distraction"" in my pub...",i'm glad i'm not alone in this. you all make t...,adhd,0
1127,e2cac13e-dd9e-45bd-8d3d-bb8b1fba1797,everyone. just a reminder.,hide your posts so when you comment on somethi...,ptsd,4
12113,59e1d412-36cb-47b8-ba45-80ae84f1daa9,sometimes i wish i could get killed so people ...,sorry if this is the wrong place. i don't thin...,depression,3
10681,6973c417-a134-402f-bcf0-adf2ca8bbac5,active shooter,i don’t know exactly what to say. i responded ...,ptsd,4
8486,08bbe971-b3d0-4702-899a-5fbac7e2d296,up from a nightmare at 6 am *domestic violence...,sorry no one will probably read this but thank...,ptsd,4


Check all classes

In [13]:
data['class_name']

7692           adhd
1127           ptsd
12113    depression
10681          ptsd
8486           ptsd
            ...    
7386           adhd
3682        bipolar
3790     depression
4830           none
6818           none
Name: class_name, Length: 15215, dtype: object

# Prepare Data

Remove unnecessary columns in the data set

In [14]:
data = data.drop(columns=['ID', 'title'])

Remove rows with NULL values

In [15]:
data = data.dropna().reset_index(drop=True)

Build a dictionary for id to text label/category

In [16]:
id_to_category = pd.Series(data.class_name.values,index=data.class_id).to_dict()
id_to_category

{0: 'adhd', 1: 'anxiety', 2: 'bipolar', 3: 'depression', 4: 'ptsd', 5: 'none'}

Set limit for testing

In [17]:
#limit the number of samples to be used in testing the pipeline
#data_size= 12
#data = data[:data_size]
#data.info()

Split the Raw Dataset into Train and Test Datasets

In [18]:
features, targets = data['post'], data['class_id']

train_features, test_features, train_targets, test_targets = train_test_split(
        features, targets,
        train_size=0.8,
        test_size=0.2,
        random_state=42,
        shuffle = True,
        stratify=targets
    )

Display first 5 elements of posts

In [19]:
train_features.values[:5]

array(['i was wondering where i could find an hourly dataset of a climate station of a specific county. the ideal dataset will be **hourly**, and will have information like wind speed, wind direction, atmospheric pressure, humidity, temperature, cloud type, and rainfall. i know that there are climate stations that collect this data but i can only find monthly or daily datasets.',
       'so i had a normal childhood up until the point that my parents got divorced. this was in february of \'97 - i was only six years old during that event (born in the latter half of \'90). after the divorce is when things kind of went haywire. my mother and father were both alcoholics - they got divorced, yet stayed together (kind of weird? said it was to help me..okay..?) up until 2006 - when i was 15. having said that; here goes. during my childhood between 6 and 15 i experienced a lot of negative things. my father would get black-out drunk - when he did, i would get terrified (this was daily) because o

Display first 5 elements of labels

In [20]:
train_targets.values[:5]

array([5, 4, 1, 3, 0])

Convert the data stored in Pandas Data Frame into a data stored in TensorFlow Data Set

In [21]:
# train X & y
train_post_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(train_features.values, tf.string)
) 
train_cat_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(train_targets.values, tf.int64),

) 
# test X & y
test_post_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(test_features.values, tf.string)
) 
test_cat_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(test_targets.values, tf.int64),

) 

Set the dictionary size and the sequence length

In [22]:
vocab_size = 500  # Only consider the top 20K words
max_len = 250  # Maximum review (text) size in words
embedding_dim = 16

## Preprocess text

Create a function that will preprocess the text. It will perform the following:

* Convert all characters to lowercase

* Remove special symbols, extra spaces, html tags, digits, and punctuations

* Remove stop words

* Replace the special Turkish letters with the corresponding English letters.

In [23]:
stop_words = set(stopwords.words('english'))
#@tf.keras.utils.register_keras_serializable()
def custom_standardization(input_string):
    """ Remove html line-break tags and handle punctuation """
    no_uppercased = tf.strings.lower(input_string, encoding='utf-8')
    no_stars = tf.strings.regex_replace(no_uppercased, "\*", " ")
    no_repeats = tf.strings.regex_replace(no_stars, "devamını oku", "")    
    no_html = tf.strings.regex_replace(no_repeats, "<br />", "")
    no_digits = tf.strings.regex_replace(no_html, "\w*\d\w*","")
    no_punctuations = tf.strings.regex_replace(no_digits, f"([{string.punctuation}])", r" ")
    #remove stop words
    no_stop_words = ' '+no_punctuations+ ' '
    for each in stop_words:
      no_stop_words = tf.strings.regex_replace(no_stop_words, ' '+each[0]+' ' , r" ")
    no_extra_space = tf.strings.regex_replace(no_stop_words, " +"," ")

    return no_extra_space

Verify if the `custom_standardization` function is working 

In [24]:
input_string = "Bu Issız Öğlenleyin de;  şunu ***1 Pijamalı Hasta***, ve  Ancak İşte Yağız Şoföre Çabucak Güvendi...Devamını oku"
print("input:  ", input_string)
output_string= custom_standardization(input_string)
print("output: ", output_string.numpy().decode("utf-8"))

input:   Bu Issız Öğlenleyin de;  şunu ***1 Pijamalı Hasta***, ve  Ancak İşte Yağız Şoföre Çabucak Güvendi...Devamını oku
output:   bu issız öğlenleyin de şunu pijamalı hasta ve ancak i̇şte yağız şoföre çabucak güvendi 


## Tokenize/Vectorize words
Tokenize words using the Keras `textVectorization()` function

In [25]:
vectorize_layer = TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=max_len
)

* Tokenize/Adapt text vectorization layer with the training data set
* Create a vocabulary of words

In [26]:
vectorize_layer.adapt(train_features)
vocab = vectorize_layer.get_vocabulary()  # To get words back from token indices

Display sample tokenization/vectorization

In [27]:
print("vocab has the ", len(vocab)," entries")
print("vocab has the following first 10 entries")
for word in range(10):
  print(word, " represents the word: ", vocab[word])

for X in train_features[:1]:
  print(" Given raw data: " )
  print(X)
  tokenized = vectorize_layer(tf.expand_dims(X, -1))
  print(" Tokenized and Transformed to a vector of integers: " )
  print (tokenized)
  print(" Text after Tokenized and Transformed: ")
  transformed = ""
  for each in tf.squeeze(tokenized):
    transformed= transformed+ " "+ vocab[each]
  print(transformed)

vocab has the  500  entries
vocab has the following first 10 entries
0  represents the word:  
1  represents the word:  [UNK]
2  represents the word:  to
3  represents the word:  and
4  represents the word:  the
5  represents the word:  my
6  represents the word:  of
7  represents the word:  it
8  represents the word:  that
9  represents the word:  in
 Given raw data: 
i was wondering where i could find an hourly dataset of a climate station of a specific county. the ideal dataset will be **hourly**, and will have information like wind speed, wind direction, atmospheric pressure, humidity, temperature, cloud type, and rainfall. i know that there are climate stations that collect this data but i can only find monthly or daily datasets.
 Tokenized and Transformed to a vector of integers: 
tf.Tensor(
[[ 16   1 118  96 170  54   1   1   6   1   1   6   1   1   4   1   1  68
   23   1   3  68  17   1  22   1   1   1   1   1   1   1   1   1   1   3
    1  43   8  60  34   1   1   8   1  14 4

In [28]:
vocab[:5]

['', '[UNK]', 'to', 'and', 'the']

## Apply Keras Text Vectorization to the training and test data sets

Define a function `convert_text_input()` that apply text vectorization/tokenization to all posts

In [29]:
def convert_text_input(sample):
    text = sample
    text = tf.expand_dims(text, -1)  
    return tf.squeeze(vectorize_layer(text))

The tensor flow `map()` function was used to apply the `convert_text_input()` function on every `posts` of the training data set

Encode/tokenize raw text posts `train_post_ds_raw` and `test_post_ds`

In [30]:
# Train X
train_post_ds = train_post_ds_raw.map(convert_text_input, 
                                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
# Test X
test_post_ds = test_post_ds_raw.map(convert_text_input, 
                                  num_parallel_calls=tf.data.experimental.AUTOTUNE)

Check three tokenized/vectorized sample posts

In [31]:
for each in train_post_ds.take(3):
  print(each)

tf.Tensor(
[ 16   1 118  96 170  54   1   1   6   1   1   6   1   1   4   1   1  68
  23   1   3  68  17   1  22   1   1   1   1   1   1   1   1   1   1   3
   1  43   8  60  34   1   1   8   1  14 453  15  26  92 170   1  25   1
   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0

## Finalize the training data set 


Merge the preprocessed and encoded posts `train_post_ds` with the encoded categories `train_cat_ds_raw`

In [32]:
train_ds = tf.data.Dataset.zip(
    (
            train_post_ds,
            train_cat_ds_raw
     )
) 

Merge the preprocessed and encoded posts `test_post_ds` with the encoded categories `test_cat_ds_raw`

In [33]:
test_ds = tf.data.Dataset.zip(
    (
            test_post_ds,
            test_cat_ds_raw
     )
) 

Display one post from `train_ds`

In [34]:
for X,y in train_ds.take(1):
  print("input (review) X.shape: ", X.shape)
  print("output (category) y.shape: ", y.shape)
  print("input (review) X: ", X)
  print("output (category) y: ",y)
  input = " ".join([vocab[_] for _ in np.squeeze(X)])
  output = id_to_category[y.numpy()]
  print("X: input (review) in text: " , input)
  print("y: output (category) in text: " , output)

input (review) X.shape:  (250,)
output (category) y.shape:  ()
input (review) X:  tf.Tensor(
[ 16   1 118  96 170  54   1   1   6   1   1   6   1   1   4   1   1  68
  23   1   3  68  17   1  22   1   1   1   1   1   1   1   1   1   1   3
   1  43   8  60  34   1   1   8   1  14 453  15  26  92 170   1  25   1
   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   

## Finalize tensorflow data pipeline

* Set batch size
* Shuffle data set
* Optimize

In [35]:
batch_size = 64
AUTOTUNE = tf.data.experimental.AUTOTUNE
buffer_size= train_ds.cardinality().numpy()

train_ds = train_ds.shuffle(buffer_size=buffer_size)\
                   .batch(batch_size=batch_size,drop_remainder=True)\
                   .cache()\
                   .prefetch(AUTOTUNE)

test_ds = test_ds.shuffle(buffer_size=buffer_size)\
                   .batch(batch_size=batch_size,drop_remainder=True)\
                   .cache()\
                   .prefetch(AUTOTUNE)

In [36]:
train_ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.int64, name=None),
 TensorSpec(shape=(64,), dtype=tf.int64, name=None))

# Create classification model

## Word Embedding only

In [51]:
model = tf.keras.Sequential([
    layers.Embedding(len(vocab), 64, input_length=max_len),
    tf.keras.layers.GlobalAveragePooling1D(),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dense(6)
])

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['SparseCategoricalAccuracy'])

history = model.fit(train_ds, epochs=30, validation_data=test_ds)

loss, accuracy = model.evaluate(test_ds)
print("Train accuracy1: ", accuracy)
model.summary()


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Train accuracy1:  0.7114361524581909
Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 250, 64)           32000     
                                                                 
 global_average_pooling1d_4   (None, 64)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                             

In [38]:
"""def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()
  
plot_graphs(history, "sparse_categorical_accuracy")
plot_graphs(history, "loss")"""

'def plot_graphs(history, string):\n  plt.plot(history.history[string])\n  plt.plot(history.history[\'val_\'+string])\n  plt.xlabel("Epochs")\n  plt.ylabel(string)\n  plt.legend([string, \'val_\'+string])\n  plt.show()\n  \nplot_graphs(history, "sparse_categorical_accuracy")\nplot_graphs(history, "loss")'

In [39]:
raw_data=['i hate myself',
          'i do not like myself']
predictions=end_to_end_model.predict(raw_data)
print(id_to_category[np.argmax(predictions[0])])
print(id_to_category[np.argmax(predictions[1])])

NameError: ignored

## Bidirectional LSTMs

In [44]:
epochs = 10
embedding_dim = 64
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab), embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6)
])

learning_rate = 1e-4

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(learning_rate),
              metrics=['SparseCategoricalAccuracy'])

history = model.fit(train_ds, epochs=epochs, validation_data=test_ds)

loss, accuracy = model.evaluate(test_ds)
print("Train accuracy: ", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train accuracy:  0.39993351697921753


In [45]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(len(vocab), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(6)
])


learning_rate = 1e-4

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(learning_rate),
              metrics=['SparseCategoricalAccuracy'])

history = model.fit(train_ds, epochs=epochs, validation_data=test_ds)

loss, accuracy = model.evaluate(test_ds)
print("Train accuracy: ", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train accuracy:  0.49368351697921753


## CNN

In [None]:
epochs = 30
embedding_dim = 16

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Conv1D(16, 5, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(6, activation='sigmoid')
])

learning_rate = 0.0001

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(learning_rate),
              metrics=['SparseCategoricalAccuracy'])

history = model.fit(train_ds, verbose=1, epochs=epochs)

loss, accuracy = model.evaluate(test_ds)
print("Train accuracy: ", accuracy)

model = tf.keras.Sequential([
  keras.Input(shape=(1,), dtype="string"),
  vectorize_layer,
  model,
])

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

loss, accuracy = model.evaluate(test_features, test_targets)
print("Train accuracy: ", accuracy)

## GRU

In [None]:
num_epochs = 30

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len),
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32)),
    tf.keras.layers.Dense(6, activation='sigmoid')
])

learning_rate = 0.00003

model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(learning_rate),
              metrics=['SparseCategoricalAccuracy'])

history = model.fit(train_ds, verbose=1, epochs=epochs)

loss, accuracy = model.evaluate(test_ds)
print("Train accuracy: ", accuracy)

model = tf.keras.Sequential([
  keras.Input(shape=(1,), dtype="string"),
  vectorize_layer,
  model,
])

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), optimizer="adam", metrics=['accuracy']
)

loss, accuracy = model.evaluate(test_features, test_targets)
print("Train accuracy: ", accuracy)