In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'depression-anxiety-stress-scales-responses:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F691971%2F1211992%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240611%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240611T070759Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3e22fa89661efb90268c5c5ebc870bff8682e5b46804e2fa0b4872e1b7a46f396732f46172032f4b4d6ef87d9fc4b39317d2f0fc8ff15e67067d8395d4e3d87cc3b60a156b4c70b7703d50611f03b691811cbd1b7132f659519ba808a5a8bb3c48fd29aa2857e5d387d5076edd956d02b9fed648988c6cf68486141bcb75a4bc8efcda1a52bf688852d3e1528c453bfab38053a7b6e4474c72b835d0271d951e98e5fe328db120663aecd8d2a9486378bf55a663db424cf0d599ac17de14822e6d11c044b1ec99c3a64e61a463eafa1534f9f1ea9614d30f60631ff85a5623e90e5376c3201b61342086c78c99054ff7768446ec9251f348caa091f4cb2a391d,mental-health-corpus:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2782228%2F4805127%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240611%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240611T070800Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2b9fafd06be629b7d59d1ad6dce32e2a72f57bc8646aaadfc48f6bc0e705769013b03fb7417e04c39686b22176f4cdab084c5011d4298b0ee1baa00ac603a9c4bf6f419a02990d454a667732c21846ada7a46485b74848360ecde149340f5a661bef47f0580c78a4d5e04039f60565a8f2bec0847f49eacea25a6947f40b138cb8786599603066969992e1197ab06e89eed3861578b7e06efa042d074afe2dec63b91814bded520b14f9955b0b47e4fc0c0c27ea8b17c90881e965d91ac98541d74214c6772c026e8f09ee37781432989f588239631ffb575dc5b9e6b6bfa49e50efe9ca1559f9687ddc8069b62bd9d71b67433a2d32374a6babb59f3cc5e55d'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>. A failed download is
# reported and skipped so the remaining sources are still fetched.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only: the directory name comes first, everything
    # after it is the (percent-encoded) URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing; in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a name other than `tarfile` so the module is not rebound
                # to a closed archive object for the next loop iteration.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must come before OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Stress Prediction**

In [None]:
!wget https://raw.githubusercontent.com/mrdbourke/tensorflow-deep-learning/main/extras/helper_functions.py
from helper_functions import unzip_data, create_tensorboard_callback, plot_loss_curves, compare_historys, walk_through_dir

In [None]:
import pandas as pd

# Load CSV file into a DataFrame

df = pd.read_csv('/kaggle/input/mental-health-corpus/mental_health.csv')

# Display the DataFrame
df.head(20)

In [None]:
df.info()

In [None]:
import matplotlib.pyplot as plt

# Calculate the value counts of the 'label' column
category_counts = df['label'].value_counts()

# Bar chart
plt.figure(figsize=(6, 4))
category_counts.plot(kind='bar')
plt.xlabel('Label')
plt.ylabel('Counts')
plt.title('Bar Chart of Counts')
plt.show()
print()

# Pie chart
plt.figure(figsize=(6, 4))
category_counts.plot(kind='pie', autopct='%1.1f%%')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.title('Pie Chart of Distribution')

# Add legend
plt.legend()

plt.show()

In [None]:
print('There are', df.shape[0], 'data in this dataset')

# Do we have duplicates?
print('Number of Duplicates:', len(df[df.duplicated()]))

# Do we have missing values?
missing_values = df.isnull().sum()
print('Number of Missing Values by column:\n',missing_values)

print('Number of Missing Values:', df.isnull().sum().sum())

In [None]:
df.replace("", np.nan, inplace=True)
missing_values = df.isnull().sum()
print('Number of Missing Values and Empty Spaces by column:\n',missing_values)

In [None]:
duplicate_rows = df[df.duplicated(keep=False)]

# Then sort the dataframe on all columns to ensure duplicates are adjacent
sorted_duplicates = duplicate_rows.sort_values(by=list(duplicate_rows.columns))

# Preview the first 20 duplicated rows (10 pairs if every duplicate occurs exactly twice).
# NOTE(review): the variable name still says "top_5" although 20 rows are kept.
top_5_duplicate_pairs = sorted_duplicates.head(20)

top_5_duplicate_pairs

In [None]:
df = df.drop_duplicates()
print('Number of Duplicates:', len(df[df.duplicated()]))

In [None]:
df = df.dropna()
print('Number of Missing Values:', df.isnull().sum().sum())

In [None]:
df.info()

In [None]:
def random_sample_reviews(df, num_samples):
    """Draw up to ``num_samples`` random texts for every label.

    Args:
        df: DataFrame with a 'label' column and a 'text' column.
        num_samples: number of texts to draw per label. A label with fewer
            rows than this contributes all of its rows instead of raising.

    Returns:
        DataFrame with the columns 'label' and 'text', ordered by label.
    """
    # Sample inside each label group, never asking for more rows than exist.
    samples = df.groupby('label')['text'].apply(
        lambda texts: texts.sample(min(num_samples, len(texts))))

    # The result is indexed by (label, original row index). Drop the inner
    # level by position so this works whatever the original index is called,
    # then turn the label level back into a column.
    samples_df = samples.reset_index(level=-1, drop=True).reset_index()

    return samples_df
pd.set_option('display.max_colwidth', 200) # Show up to 200 characters per column before truncating
samples = random_sample_reviews(df, num_samples=3)
samples.head(20)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import SimpleRNN, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Character class matched by strip_emoji, assembled piece by piece so each range
# can carry its own comment. Several ranges overlap; the union is what matters.
_EMOJI_CHARACTER_CLASS = "".join((
    "[",
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
    "\U00002500-\U00002BEF",  # box drawing, shapes, misc symbols, dingbats, arrows
    "\U00002702-\U000027B0",  # dingbats
    "\U00002702-\U000027B0",  # (repeated range, kept as in the original pattern)
    "\U000024C2-\U0001F251",  # very wide range: everything from U+24C2 up to U+1F251
    "\U0001f926-\U0001f937",  # gesture / person emoji
    "\U00010000-\U0010ffff",  # every code point outside the Basic Multilingual Plane
    "\u2640-\u2642",          # gender signs
    "\u2600-\u2B55",          # misc symbols through U+2B55
    "\u200d",                 # zero-width joiner
    "\u23cf",                 # eject symbol
    "\u23e9",                 # fast-forward symbol
    "\u231a",                 # watch
    "\ufe0f",                 # variation selector-16
    "\u3030",                 # wavy dash
    "]+",
))
_EMOJI_PATTERN = re.compile(_EMOJI_CHARACTER_CLASS, flags=re.UNICODE)


def strip_emoji(text):
    """Return ``text`` with every run of characters in the emoji class removed.

    The class is deliberately broad (see ``_EMOJI_CHARACTER_CLASS``): besides
    emoji it also deletes any other character that falls inside the listed
    code-point ranges.
    """
    return _EMOJI_PATTERN.sub('', text)

#Remove punctuations, links, mentions and \r\n new line characters
def strip_all_entities(text):
    """Lower-case ``text`` and strip line breaks, links, mentions, non-ASCII and punctuation.

    Steps, in order:
      1. delete carriage returns, turn newlines into spaces, lower-case;
      2. delete every token starting with ``@`` or ``http(s)://``;
      3. delete every non-ASCII character;
      4. delete all ``string.punctuation`` characters (plus a few stray
         mis-encoded characters kept in the banned list).
    """
    flattened = text.replace('\r', '').replace('\n', ' ').lower()
    without_links = re.sub(r"(?:\@|https?\://)\S+", "", flattened)
    ascii_only = re.sub(r'[^\x00-\x7f]', r'', without_links)
    banned_list = string.punctuation + 'Ã±ã¼â»§'
    return ascii_only.translate(str.maketrans('', '', banned_list))

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(text):
    """Drop trailing hashtags and un-tag hashtags that appear mid-sentence.

    Args:
        text: the sentence to clean.

    Returns:
        The text with (1) every hashtag in the run of hashtags that ends the
        sentence removed entirely, except the literal word ``#hashtag``, and
        (2) the remaining ``#`` and ``_`` characters replaced by a single
        space, with the pieces stripped and re-joined.
    """
    # Raw string on purpose: in a normal string literal "\b" is a backspace
    # character, which silently disabled the "(?!hashtag\b)" word-boundary
    # guard; "\w" and "\s" are invalid escapes there too.
    trailing_hashtags = r'#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)'
    new_text = " ".join(word.strip() for word in re.split(trailing_hashtags, text)) #remove last hashtags
    # Remove the hashtag symbol (and underscores) from words in the middle of the sentence.
    new_text2 = " ".join(word.strip() for word in re.split(r'#|_', new_text))
    return new_text2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    """Blank out every space-separated word of ``a`` that contains '$' or '&'.

    The word is replaced by an empty string rather than removed, so the
    separating spaces survive and the number of space-separated fields is
    unchanged.
    """
    return ' '.join(
        '' if ('$' in token or '&' in token) else token
        for token in a.split(' ')
    )

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character on its own (including a lone tab or
    newline) is left untouched.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    return re.sub(r"\s\s+", " ", text)


In [None]:
# Build the cleaned 'text1' column by passing every raw text through the
# cleaning helpers, in the order listed.
# NOTE(review): strip_all_entities already deletes every punctuation character
# (including '#', '_', '$' and '&'), so clean_hashtags and filter_chars appear
# to have nothing left to act on at this point — confirm whether they were
# meant to run before strip_all_entities.
df['text1'] = (df['text']
                     .apply(strip_emoji)
                     .apply(strip_all_entities)
                     .apply(clean_hashtags)
                     .apply(filter_chars)
                     .apply(remove_mult_spaces))

In [None]:
df.head()

In [None]:
df_comparison = pd.DataFrame()

# Original text and its length
df_comparison['pre-clean text'] = df['text']
df_comparison['pre-clean len'] = df['text'].apply(lambda x: len(str(x).split()))

# Cleaned text and its length
df_comparison['post-clean text'] = df['text1']
df_comparison['post-clean len'] = df['text1'].apply(lambda x: len(str(x).split()))

df_comparison.head(20)

In [None]:
# Stop words removed by remove_stopwords. Built once as a frozenset so that the
# per-word membership test is O(1) instead of a scan of a list rebuilt per call.
_STOPWORDS = frozenset([
    "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at",
    "be", "because", "been", "before", "being", "below", "between", "both", "but", "by",
    "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further",
    "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself",
    "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is",
    "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once",
    "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same",
    "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the",
    "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd",
    "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very",
    "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where",
    "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would",
    "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
    """
    Removes a list of stopwords

    Args:
        sentence (string): sentence to remove the stopwords from

    Returns:
        sentence (string): lowercase sentence without the stopwords; the
        remaining words are re-joined with single spaces
    """
    # Lower-case first so the comparison with the (lower-case) stop words is
    # case-insensitive; split() also discards repeated whitespace.
    words = sentence.lower().split()
    return " ".join(w for w in words if w not in _STOPWORDS)

In [None]:
df['text2'] = (df['text1'].apply(remove_stopwords))

In [None]:
df_comp = pd.DataFrame()

# Original text and its length
df_comp['pre-clean text'] = df['text1']
df_comp['pre-clean len'] = df['text1'].apply(lambda x: len(str(x).split()))

# Cleaned text and its length
df_comp['post-clean text'] = df['text2']
df_comp['post-clean len'] = df['text2'].apply(lambda x: len(str(x).split()))

df_comp.head(20)

In [None]:
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

import nltk
nltk.download('wordnet')
nltk.download('punkt')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` with NLTK and return its lemmatized tokens joined by spaces.

    Uses the module-level ``lemmatizer`` with its default settings.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))

In [None]:
df['text3'] = df['text2'].apply(lemmatize_text)

In [None]:
df_lemma = pd.DataFrame()

# Original text and its length
df_lemma['pre-clean text'] = df['text2']
df_lemma['pre-clean len'] = df['text2'].apply(lambda x: len(str(x).split()))

# Cleaned text and its length
df_lemma['post-clean text'] = df['text3']
df_lemma['post-clean len'] = df['text3'].apply(lambda x: len(str(x).split()))

df_lemma.head(20)

In [None]:
df['text_length'] = df['text3'].apply(lambda x: len(str(x).split()))

In [None]:
# Number of words in every cleaned text.
text_lengths = [len(text.split()) for text in df["text3"]]

# 95th percentile of the word counts: 95% of the texts are at most this long.
# (This is a percentile, not a quartile; the printed message now says so.)
quartile_95 = np.percentile(text_lengths, 95)

print(f"95th Percentile of Text Lengths: {quartile_95}")

In [None]:
# Histogram of the number of words per text over the whole cleaned dataset.
plt.figure(figsize=(10, 6))
plt.hist(text_lengths, bins=20, edgecolor='black')
plt.xlabel('Words per Text')  # text_lengths holds word counts, not word lengths
plt.ylabel('Frequency')
plt.title('Distribution of Text Lengths')

# Adding a vertical line for the 95th percentile
quartile_95 = np.percentile(text_lengths, 95)
plt.axvline(x=quartile_95, color='red', linestyle='--', label='95th Percentile')
plt.legend()

plt.grid(True)
plt.show()

In [None]:
df.text_length.describe()

In [None]:
import seaborn as sns

plt.figure(figsize=(7,5))
ax = sns.countplot(x='text_length', data=df[df['text_length']<10], palette='mako')
plt.title('Training text with less than 10 words')
plt.yticks([])
ax.bar_label(ax.containers[0])
plt.ylabel('Count')
plt.xlabel('')
plt.show()

In [None]:
data_head=df[df['text_length']<2]
data_head.head(30)

In [None]:
len(df)

In [None]:
# Keep only the texts that still contain at least 3 words after cleaning.
# NOTE(review): the preview cell above lists texts with fewer than 2 words,
# while this cut-off also drops 2-word texts — confirm the intended threshold.
df = df[df['text_length'] >= 3]

In [None]:
len(df)

In [None]:
df = df.drop(['text', 'text1', 'text2'], axis=1)

In [None]:
df = df.sample(frac=1, random_state=42) # shuffle with random_state=42 for reproducibility
df.head(30)

In [None]:
df.label.value_counts()

In [None]:
data = {
    'Label': ["Non-mental-health", "Mental-health"],
    'Label Encoded': [0,1]
}

# Create DataFrame
dr = pd.DataFrame(data)

# Print DataFrame
dr

In [None]:
class_names=dr.Label.to_list()
class_names

In [None]:
X = df['text3'].to_numpy()
y = df['label'].to_numpy()

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.10, stratify=y, random_state=42)

In [None]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

In [None]:
len(X_train), len(X_valid), len(y_train), len(y_valid)

In [None]:
X_train

In [None]:
y_train, y_valid

In [None]:
round(sum([len(i.split()) for i in X_train])/len(X_train))

In [None]:
# Number of words in every training text.
text_lengths = [len(text.split()) for text in X_train]

# Find the 95th percentile
percentile_95 = np.percentile(text_lengths, 95)

print(f"95th Percentile of Text Lengths: {percentile_95}")

In [None]:
# Histogram of the number of words per text in the training split.
plt.figure(figsize=(10, 6))
plt.hist(text_lengths, bins=20, edgecolor='black')
plt.xlabel('Words per Text')  # text_lengths holds word counts, not word lengths
plt.ylabel('Frequency')
plt.title('Distribution of text Lengths')

# Adding a vertical line for the 95th percentile
quartile_95 = np.percentile(text_lengths, 95)
plt.axvline(x=quartile_95, color='red', linestyle='--', label='95th Percentile')
plt.legend()

plt.grid(True)
plt.show()

In [None]:
max_text_length = max(text_lengths)
print(f"Maximum Text Length: {max_text_length}")

In [None]:
import tensorflow as tf
# TextVectorization is exported from tensorflow.keras.layers; the former
# `layers.experimental.preprocessing` location is deprecated.
from tensorflow.keras.layers import TextVectorization

# Setup text vectorization with custom variables
max_vocab_length = None # max number of words to have in our vocabulary (None = no cap)
max_length = int(percentile_95) # max length our sequences will be (e.g. how many words from a text does our model see?)

# Maps raw strings to integer token ids, padded/truncated to max_length tokens.
text_vectorizer = TextVectorization(max_tokens=max_vocab_length,
                                    output_mode="int",
                                    output_sequence_length=max_length)

In [None]:
text_vectorizer.adapt(X_train)

In [None]:
words_in_vocab = text_vectorizer.get_vocabulary()
top_5_words = words_in_vocab[:5] # most common tokens (notice the [UNK] token for "unknown" words)
bottom_5_words = words_in_vocab[-5:] # least common tokens
print(f"Number of words in vocab: {len(words_in_vocab)}")
print(f"Top 5 most common words: {top_5_words}")
print(f"Bottom 5 least common words: {bottom_5_words}")

In [None]:
max_vocab_length=len(words_in_vocab)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Create tokenization and modelling pipeline
model_0 = Pipeline([
                    ("tfidf", TfidfVectorizer()), # convert words to numbers using tfidf
                    ("clf", MultinomialNB()) # model the text
])

# Now fit the model
model_0.fit(X_train, y_train)

In [None]:
baseline_score = model_0.score(X_valid, y_valid)
print(f"Our baseline model achieves an accuracy of: {baseline_score*100:.2f}%")

In [None]:
baseline_preds = model_0.predict(X_valid)
baseline_preds[:20]

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """
  Summarises classification quality as accuracy, precision, recall and f1.

  Args:
  -----
  y_true = true labels in the form of a 1D array
  y_pred = predicted labels in the form of a 1D array

  Returns a dictionary with the keys "accuracy", "precision", "recall" and
  "f1". Note the different scales: accuracy is multiplied by 100, while the
  other three are returned exactly as scikit-learn computes them with the
  "weighted" average.
  """
  precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")
  return {"accuracy": accuracy_score(y_true, y_pred) * 100,
          "precision": precision,
          "recall": recall,
          "f1": f1}

In [None]:
baseline_results = calculate_results(y_true=y_valid,
                                     y_pred=baseline_preds)
baseline_results

In [None]:
def compare_baseline_to_new_results(baseline_results, new_model_results):
  """Print, for each baseline metric, the baseline value, the new value and their difference.

  Iterates over the keys of ``baseline_results``; every key must also be
  present in ``new_model_results``. All numbers are shown with two decimals.
  """
  for metric in baseline_results:
    old_value = baseline_results[metric]
    new_value = new_model_results[metric]
    print(f"Baseline {metric}: {old_value:.2f}, New {metric}: {new_value:.2f}, Difference: {new_value-old_value:.2f}")

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

def create_checkpoint_callback(checkpoint_path):
    """
    Build a ModelCheckpoint callback that keeps only the best epoch.

    The callback monitors 'val_accuracy' (higher is better) and writes to
    ``checkpoint_path`` only when that metric improves. ``save_weights_only``
    is not set, so Keras' default applies — presumably a full-model save,
    which is what the later ``load_model(checkpoint_path)`` calls rely on.

    Parameters:
    checkpoint_path (str): The filepath the checkpoint is written to.

    Returns:
    ModelCheckpoint callback
    """
    return ModelCheckpoint(filepath=checkpoint_path,
                           monitor='val_accuracy',
                           mode='max',
                           save_best_only=True,
                           verbose=1)

In [None]:
import tensorflow_hub as hub
from tensorflow.keras import layers

In [None]:
tf.random.set_seed(42)

embedding = layers.Embedding(input_dim=max_vocab_length, # set input shape
                             output_dim=300, # set size of embedding vector
                             embeddings_initializer="uniform", # default, intialize randomly
                             input_length=max_length, # how long is each input
                             name="embedding_1")

embedding

In [None]:
from tensorflow.keras import layers

inputs = layers.Input(shape=(1,), dtype="string") # inputs are 1-dimensional strings
x = text_vectorizer(inputs) # turn the input text into numbers
x = embedding(x) # create an embedding of the numerized numbers
x = layers.GlobalAveragePooling1D()(x) # lower the dimensionality of the embedding (try running the model without this layer and see what happens)
outputs = layers.Dense(1, activation="sigmoid")(x) # create the output layer, want binary outputs so use sigmoid activation
model_dense = tf.keras.Model(inputs, outputs, name="model_dense") # construct the model

In [None]:
model_dense.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
model_dense.summary()

In [None]:
checkpoint_path = "best_model_dense"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
model_dense_history = model_dense.fit(X_train, # input sentences can be a list of strings due to text preprocessing layer built-in model
                              y_train,
                              epochs=10,
                              validation_data=(X_valid, y_valid),
                              callbacks=[cc])

In [None]:
def plot_graphs(history, string):
  """Plot a training metric and its validation counterpart against the epoch index.

  Args:
    history: object whose ``history`` attribute maps metric names to
      per-epoch value lists (as returned by ``model.fit``).
    string: metric name, e.g. 'accuracy' or 'loss'; the validation curve is
      read from the key 'val_' + string.
  """
  validation_key = 'val_' + string
  for curve in (string, validation_key):
    plt.plot(history.history[curve])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, validation_key])
  plt.show()

# Plot the accuracy and loss history
plot_graphs(model_dense_history, 'accuracy')
plot_graphs(model_dense_history, 'loss')

In [None]:
from tensorflow.keras.models import load_model

# Load the entire model
model_dense = load_model(checkpoint_path)

In [None]:
model_dense.evaluate(X_valid, y_valid)

In [None]:
model_dense_pred_probs = model_dense.predict(X_valid)
model_dense_pred_probs[:10]

In [None]:
model_dense_preds = tf.squeeze(tf.round(model_dense_pred_probs))
model_dense_preds[:10]

In [None]:
model_dense_results = calculate_results(y_true=y_valid,
                                    y_pred=model_dense_preds)
model_dense_results

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_dense.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
import itertools
from sklearn.metrics import confusion_matrix

# Our function needs a different name to sklearn's plot_confusion_matrix
def make_confusion_matrix(y_true, y_pred, classes=None, figsize=(10, 10), text_size=15):
  """Draws a labelled confusion matrix comparing predictions and ground truth labels.

  If classes is passed, confusion matrix will be labelled, if not, integer class values
  will be used.

  Args:
    y_true: Array of truth labels (must be same shape as y_pred).
    y_pred: Array of predicted labels (must be same shape as y_true).
    classes: Sequence of class labels (list or array, e.g. string form). If `None`
      or empty, integer labels are used.
    figsize: Size of output figure (default=(10, 10)).
    text_size: Size of output figure text (default=15).

  Returns:
    None. The matrix is drawn on a new matplotlib figure; every cell shows the
    raw count and the percentage of its true-label row.

  Example usage:
    make_confusion_matrix(y_true=test_labels, # ground truth test labels
                          y_pred=y_preds, # predicted labels
                          classes=class_names, # array of class label names
                          figsize=(15, 15),
                          text_size=10)
  """
  # Create the confusion matrix
  cm = confusion_matrix(y_true, y_pred)
  # Normalize each row by its total. A class that is predicted but never
  # occurs in y_true has a row total of 0; leave that row at 0 instead of
  # dividing by zero and printing "nan%".
  row_totals = cm.sum(axis=1)[:, np.newaxis]
  cm_norm = np.divide(cm.astype("float"), row_totals,
                      out=np.zeros(cm.shape, dtype="float"),
                      where=row_totals != 0)
  n_classes = cm.shape[0] # find the number of classes we're dealing with

  # Plot the figure and make it pretty
  fig, ax = plt.subplots(figsize=figsize)
  cax = ax.matshow(cm, cmap=plt.cm.Blues) # cell colour encodes the raw count (darker == larger)
  fig.colorbar(cax)

  # Are there a list of classes?
  # `if classes:` would raise for a NumPy array (ambiguous truth value), so
  # test for None / emptiness explicitly.
  if classes is not None and len(classes) > 0:
    labels = classes
  else:
    labels = np.arange(n_classes)

  # Label the axes
  ax.set(title="Confusion Matrix",
         xlabel="Predicted label",
         ylabel="True label",
         xticks=np.arange(n_classes), # create enough axis slots for each class
         yticks=np.arange(n_classes),
         xticklabels=labels, # axes will labeled with class names (if they exist) or ints
         yticklabels=labels)

  # Make x-axis labels appear on bottom
  ax.xaxis.set_label_position("bottom")
  ax.xaxis.tick_bottom()

  # Set the threshold for different colors
  threshold = (cm.max() + cm.min()) / 2.

  # Plot the text on each cell (on this figure's axes, not whichever axes
  # happen to be current); white text on dark cells, black on light ones.
  for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    ax.text(j, i, f"{cm[i, j]} ({cm_norm[i, j]*100:.1f}%)",
            horizontalalignment="center",
            color="white" if cm[i, j] > threshold else "black",
            size=text_size)

In [None]:
class_names

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
!pip install colorama
from colorama import Fore, Style
import numpy as np

def random_predictions(model, X_valid, y_valid, num_samples=5, class_names=None):
    """Print true vs. predicted labels for a random selection of validation samples.

    Args:
        model: object with a ``predict`` method returning one row of
            probabilities per input sample.
        X_valid: NumPy array of inputs (indexed with an integer array).
        y_valid: NumPy array of labels aligned with ``X_valid``.
        num_samples: how many samples to show; capped at ``len(X_valid)``.
        class_names: optional sequence mapping a label index to a display
            name; when omitted the raw label values are printed.

    Returns:
        None. Each sample is printed in green when the prediction matches the
        true label and in red otherwise.
    """
    # Check if it's binary or multi-class classification
    # NOTE(review): one-hot encoded multi-class labels also contain exactly two
    # distinct values (0 and 1) and would take the binary branch — confirm the
    # label encoding before using this with multi-class data.
    is_binary_classification = len(np.unique(y_valid)) == 2

    # Sampling without replacement cannot return more items than exist.
    num_samples = min(num_samples, len(X_valid))

    # Getting indices of the random samples
    random_indices = np.random.choice(np.arange(len(X_valid)), size=num_samples, replace=False)

    # Selecting the random samples
    random_X_samples = X_valid[random_indices]
    random_y_samples = y_valid[random_indices]

    # Making predictions on the random samples
    y_pred_probs = model.predict(random_X_samples)

    if is_binary_classification:
        # reshape(-1) instead of squeeze: squeeze turns a single prediction
        # into a 0-d array, which cannot be indexed in the loop below.
        y_pred = np.round(y_pred_probs).astype(int).reshape(-1)
    else:
        y_pred = np.argmax(y_pred_probs, axis=1)

    # Print the actual and predicted labels
    for i in range(num_samples):
        text = random_X_samples[i]
        true_label = random_y_samples[i] if is_binary_classification else np.argmax(random_y_samples[i])
        predicted_label = y_pred[i]

        # If class names are provided, use them for printing
        if class_names is not None:
            true_label_name = class_names[true_label]
            predicted_label_name = class_names[predicted_label]
        else:
            true_label_name = true_label
            predicted_label_name = predicted_label

        # Determine the color of the text (green for correct, red for incorrect)
        text_color = Fore.GREEN if true_label == predicted_label else Fore.RED

        print(f"\nSample {i + 1}:")
        print(f"Text: {text}")
        print(text_color + f"True: {true_label_name} \n Predicted: {predicted_label_name}" + Style.RESET_ALL)

In [None]:
random_predictions(model_dense,
                   X_valid,
                   y_valid,
                   num_samples=10,
                   class_names=class_names)

In [None]:
tf.random.set_seed(42)
from tensorflow.keras import layers
model_1LSTM_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=300,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_2")


# Create LSTM model
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_1LSTM_embedding(x)
print(x.shape)
x = layers.LSTM(64, return_sequences=True)(x) # return vector for each word in the text (you can stack RNN cells as long as return_sequences=True)
x = layers.LSTM(64)(x) # return vector for whole sequence
print(x.shape)
x = layers.Dense(64, activation="relu")(x) # optional dense layer on top of output of LSTM cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_1LSTM = tf.keras.Model(inputs, outputs, name="model_1LSTM")


In [None]:
model_1LSTM.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
model_1LSTM.summary()

In [None]:
# Dedicated checkpoint location for the stacked (unidirectional) LSTM model.
# It previously shared "best_model_Bi-LSTM" with the bidirectional model
# trained further down, whose checkpoints then overwrote this model's.
checkpoint_path = "best_model_LSTM"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
model_1LSTM_history = model_1LSTM.fit(X_train, y_train,
                              epochs=10,
                              validation_data=(X_valid, y_valid),
                              callbacks=[cc])

In [None]:
# plot_graphs is already defined once, in the cell that plots the dense
# model's training curves; the identical re-definition that stood here was
# redundant and has been removed.

# Plot the accuracy and loss history
plot_graphs(model_1LSTM_history, 'accuracy')
plot_graphs(model_1LSTM_history, 'loss')

In [None]:
model_1LSTM = load_model(checkpoint_path)

In [None]:
model_1LSTM.evaluate(X_valid, y_valid)

In [None]:
model_1LSTM_pred_probs = model_1LSTM.predict(X_valid)
model_1LSTM_pred_probs.shape, model_1LSTM_pred_probs[:10]

In [None]:
model_1LSTM_preds = tf.squeeze(tf.round(model_1LSTM_pred_probs))
model_1LSTM_preds[:10]

In [None]:
model_1LSTM_results = calculate_results(y_true=y_valid,
                                    y_pred=model_1LSTM_preds)
model_1LSTM_results

In [None]:
compare_baseline_to_new_results(baseline_results, model_1LSTM_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_1LSTM.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_1LSTM,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
from tensorflow.keras import layers

# Parameters
embedding_dim = 128  # size of each learned token vector (used by the Embedding layer below)

# Fix TensorFlow's global seed before any layer in this cell is created
tf.random.set_seed(42)

# Input layer: one raw string per example
inputs = layers.Input(shape=(1,), dtype="string")

# Turn the input text into numbers
x = text_vectorizer(inputs)

# Create an embedding of the numerized numbers
x = layers.Embedding(input_dim=max_vocab_length,
                     output_dim=embedding_dim,  # was hard-coded to 128, leaving embedding_dim unused
                     embeddings_initializer="uniform",
                     input_length=max_length,
                     name="embedding_2")(x)

# Bidirectional LSTM; return_sequences=True so the next recurrent layer receives the full sequence
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
# Another LSTM Layer (returns only its final output)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Dense layer
x = layers.Dense(512, activation='relu')(x)
# Output layer: single sigmoid unit for the binary label
outputs = layers.Dense(1, activation='sigmoid')(x)
# Create the model
model_lstm = tf.keras.Model(inputs, outputs)

In [None]:
model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Print the model summary
model_lstm.summary()

In [None]:
checkpoint_path = "best_model_Bi-LSTM"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
NUM_EPOCHS = 10

# Train the model
history_lstm = model_lstm.fit(X_train, y_train, epochs=NUM_EPOCHS, validation_data=(X_valid, y_valid),callbacks=[cc])

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss history
plot_graphs(history_lstm, 'accuracy')
plot_graphs(history_lstm, 'loss')

In [None]:
model_lstm = load_model(checkpoint_path)

In [None]:
model_lstm.evaluate(X_valid, y_valid)

In [None]:
model_lstm_pred_probs = model_lstm.predict(X_valid)
model_lstm_pred_probs[:10]

In [None]:
model_lstm_preds = tf.squeeze(tf.round(model_lstm_pred_probs))
model_lstm_preds[:10]

In [None]:
model_lstm_results = calculate_results(y_valid, model_lstm_preds)
model_lstm_results

In [None]:
# Compare the Bidirectional LSTM (model_lstm) with the baseline; the original passed the
# single-LSTM results here, so this section repeated the previous model's comparison.
compare_baseline_to_new_results(baseline_results, model_lstm_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_lstm.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_lstm,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
# Fix TensorFlow's global seed before any layer in this cell is created
tf.random.set_seed(42)

from tensorflow.keras import layers
# Embedding layer for the GRU model: maps the output of text_vectorizer to 128-dimensional vectors
model_GRU_embedding = layers.Embedding(input_dim=max_vocab_length,
                                     output_dim=128,
                                     embeddings_initializer="uniform",
                                     input_length=max_length,
                                     name="embedding_GRU")

# Build an RNN using the GRU cell: string input -> vectorizer -> embedding -> two stacked GRUs -> dense -> sigmoid
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_GRU_embedding(x)
x = layers.GRU(64, return_sequences=True)(x)  # return the full sequence so the second GRU receives every timestep
x = layers.GRU(64)(x)
x = layers.Dense(64, activation="relu")(x) # optional dense layer after GRU cell
outputs = layers.Dense(1, activation="sigmoid")(x)
model_GRU = tf.keras.Model(inputs, outputs, name="model_GRU")

In [None]:
model_GRU.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(),
                metrics=["accuracy"])

In [None]:
model_GRU.summary()

In [None]:
checkpoint_path = "best_model_GRU"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
model_GRU_history = model_GRU.fit(X_train, y_train,
                              epochs=10,
                              validation_data=(X_valid, y_valid),
                              callbacks=[cc])

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss history
plot_graphs(model_GRU_history, 'accuracy')
plot_graphs(model_GRU_history, 'loss')

In [None]:
model_GRU = load_model(checkpoint_path)

In [None]:
model_GRU.evaluate(X_valid, y_valid)

In [None]:
model_GRU_pred_probs = model_GRU.predict(X_valid)
model_GRU_pred_probs.shape, model_GRU_pred_probs[:10]

In [None]:
model_GRU_preds = tf.squeeze(tf.round(model_GRU_pred_probs))
model_GRU_preds[:10]

In [None]:
model_GRU_results = calculate_results(y_true=y_valid,
                                    y_pred=model_GRU_preds)
model_GRU_results

In [None]:
compare_baseline_to_new_results(baseline_results, model_GRU_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_GRU.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_GRU,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
# Fix TensorFlow's global seed before any layer in this cell is created
tf.random.set_seed(42)

from tensorflow.keras import layers
# NOTE(review): this rebinds model_GRU_embedding (and reuses the layer name "embedding_GRU")
# from the unidirectional GRU cell; a new, separate Embedding layer is created here.
model_GRU_embedding = layers.Embedding(input_dim=max_vocab_length,
                                       output_dim=128,
                                       embeddings_initializer="uniform",
                                       input_length=max_length,
                                       name="embedding_GRU")

# Build a bidirectional RNN using the GRU cell
inputs = layers.Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)
x = model_GRU_embedding(x)
# First recurrent layer returns the full sequence so the second one receives every timestep
x = layers.Bidirectional(layers.GRU(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.GRU(64))(x)
x = layers.Dense(64, activation="relu")(x)
# Single sigmoid unit for the binary label
outputs = layers.Dense(1, activation="sigmoid")(x)
model_bi_GRU = tf.keras.Model(inputs, outputs, name="model_bi_GRU")

In [None]:
model_bi_GRU.compile(loss="binary_crossentropy",
                     optimizer=tf.keras.optimizers.Adam(),
                     metrics=["accuracy"])

In [None]:
model_bi_GRU.summary()

In [None]:
checkpoint_path = "best_model_bi_GRU"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
model_bi_GRU_history = model_bi_GRU.fit(X_train, y_train,
                                        epochs=10,
                                        validation_data=(X_valid, y_valid),
                                        callbacks=[cc])

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss history of the bidirectional GRU trained above
# (the original plotted model_GRU_history, i.e. the previous model's curves, again)
plot_graphs(model_bi_GRU_history, 'accuracy')
plot_graphs(model_bi_GRU_history, 'loss')

In [None]:
model_bi_GRU = load_model(checkpoint_path)

In [None]:
model_bi_GRU.evaluate(X_valid, y_valid)

In [None]:
model_bi_GRU_pred_probs = model_bi_GRU.predict(X_valid)
model_bi_GRU_pred_probs.shape, model_bi_GRU_pred_probs[:10]

In [None]:
model_bi_GRU_preds = tf.squeeze(tf.round(model_bi_GRU_pred_probs))
model_bi_GRU_preds[:10]

In [None]:
model_bi_GRU_results = calculate_results(y_true=y_valid,
                                    y_pred=model_bi_GRU_preds)
model_bi_GRU_results

In [None]:
compare_baseline_to_new_results(baseline_results, model_bi_GRU_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_bi_GRU.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_bi_GRU,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
from tensorflow.keras import layers

# Parameters
embedding_dim = 128  # output_dim of the Embedding layer below
filters = 64         # number of Conv1D filters
kernel_size = 5      # Conv1D window length passed as kernel_size

# Fix TensorFlow's global seed before any layer in this cell is created
tf.random.set_seed(42)

# Input layer
inputs = layers.Input(shape=(1,), dtype="string")
# Turn the input text into numbers
x = text_vectorizer(inputs)
# Create an embedding of the numerized numbers
# NOTE(review): the layer name "embedding_2" is also used by the Bidirectional LSTM model's embedding
x = layers.Embedding(input_dim=max_vocab_length,
                     output_dim=embedding_dim,
                     embeddings_initializer="uniform",
                     input_length=max_length,
                     name="embedding_2")(x)
# Conv1D layer
x = layers.Conv1D(filters, kernel_size, activation='relu')(x)
# GlobalMaxPooling1D layer
x = layers.GlobalMaxPooling1D()(x)
# Dense layer
x = layers.Dense(512, activation='relu')(x)
# Output layer: single sigmoid unit for the binary label
outputs = layers.Dense(1, activation='sigmoid')(x)
# Create the model
model_conv = tf.keras.Model(inputs, outputs)

In [None]:
model_conv.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model_conv.summary()

In [None]:
checkpoint_path = "best_model_conv"

cc = create_checkpoint_callback(checkpoint_path)

In [None]:
NUM_EPOCHS = 10

# Train the model
history_conv1d = model_conv.fit(X_train, y_train, epochs=NUM_EPOCHS, validation_data=(X_valid, y_valid),callbacks=[cc])

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss history
plot_graphs(history_conv1d, 'accuracy')
plot_graphs(history_conv1d, 'loss')

In [None]:
model_conv = load_model(checkpoint_path)

In [None]:
model_conv.evaluate(X_valid, y_valid)

In [None]:
model_conv_pred_probs = model_conv.predict(X_valid)
model_conv_pred_probs[:10]

In [None]:
model_conv_preds = tf.squeeze(tf.round(model_conv_pred_probs))
model_conv_preds[:10]

In [None]:
model_conv_results = calculate_results(y_valid, model_conv_preds)
model_conv_results

In [None]:
compare_baseline_to_new_results(baseline_results, model_conv_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_conv.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_conv,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
sentence_encoder_layer = hub.KerasLayer("https://tfhub.dev/google/universal-sentence-encoder/4",
                                        input_shape=[], # shape of inputs coming to our model
                                        dtype=tf.string, # data type of inputs coming to the USE layer
                                        trainable=False, # keep the pretrained weights (we'll create a feature extractor)
                                        name="USE")

In [None]:
tf.random.set_seed(42)

# Assemble the Sequential model one layer at a time:
# sentence encoder (turns sentences into embeddings) -> hidden dense layer -> sigmoid output
model_USE = tf.keras.Sequential(name="model_USE")
model_USE.add(sentence_encoder_layer)
model_USE.add(layers.Dense(512, activation="relu"))
model_USE.add(layers.Dense(1, activation="sigmoid"))

# Binary classification set-up: cross-entropy loss, Adam optimizer, accuracy metric
model_USE.compile(
    loss="binary_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"],
)

model_USE.summary()

In [None]:
checkpoint_path = "best_model_USE"

# Create a ModelCheckpoint callback that writes to checkpoint_path only when the validation
# accuracy improves (save_best_only=True, monitor='val_accuracy', mode='max').
# save_weights_only is not passed here, and the path is later handed to load_model(),
# so this is expected to store the whole model rather than just its weights.
cc = ModelCheckpoint(filepath=checkpoint_path,
                                      monitor='val_accuracy',
                                      mode='max',
                                      save_best_only=True,
                                      verbose=1)

In [None]:
model_USE_history = model_USE.fit(X_train,
                              y_train,
                              epochs=10,
                              validation_data=(X_valid, y_valid),
                              callbacks=[cc])

In [None]:
model_USE = load_model(checkpoint_path)

In [None]:
model_USE.evaluate(X_valid, y_valid)

In [None]:
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.show()

# Plot the accuracy and loss history
plot_graphs(model_USE_history, 'accuracy')
plot_graphs(model_USE_history, 'loss')

In [None]:
model_USE_pred_probs = model_USE.predict(X_valid)
model_USE_pred_probs[:10]

In [None]:
model_USE_preds = tf.squeeze(tf.round(model_USE_pred_probs))
model_USE_preds[:10]

In [None]:
model_USE_results = calculate_results(y_valid, model_USE_preds)
model_USE_results

In [None]:
compare_baseline_to_new_results(baseline_results, model_USE_results)

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_USE.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_true,
                 y_pred=y_preds)

In [None]:
make_confusion_matrix(y_true=y_true,
                      y_pred=y_preds,
                      classes=class_names,
                      figsize=(15, 15),
                      text_size=10)

In [None]:
random_predictions(model_USE,
                   X_valid,
                   y_valid,
                   num_samples=20,
                   class_names=class_names)

In [None]:
# One metrics dict per model, keyed by the display name used in the comparison plots
results_by_model = {
    "baseline": baseline_results,
    "Simple Dense": model_dense_results,
    "LSTM": model_1LSTM_results,
    "Bidirectional LSTM": model_lstm_results,
    "GRU": model_GRU_results,
    "Bidirectional GRU": model_bi_GRU_results,
    "Conv1D": model_conv_results,
    "USE": model_USE_results,
}

# Rows = models, columns = metrics
all_model_results = pd.DataFrame(results_by_model).T
# Divide accuracy by 100 so it is on the same scale as the other metric columns
all_model_results["accuracy"] = all_model_results["accuracy"]/100
all_model_results

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string, model_name=None):
  """Plot a metric and the loss of one training run side by side.

  This definition replaces the two-argument `plot_graphs` used earlier in the
  notebook. `model_name` is optional so that those earlier calls
  (`plot_graphs(history, 'accuracy')`) still work if their cells are re-run
  after this one.

  Args:
    history: object exposing a `.history` dict (as returned by `model.fit`).
    string: metric key for the left panel, e.g. 'accuracy'; 'val_' + string is
      drawn with it.
    model_name: optional label prefixed to both panel titles.
  """
  # Only prefix the titles when a model name was supplied
  title_prefix = f"{model_name} - " if model_name else ""

  plt.figure(figsize=(12, 4))

  # Left panel: requested metric vs. its validation counterpart
  plt.subplot(1, 2, 1)
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, 'val_'+string])
  plt.title(f"{title_prefix}{string} vs. Validation {string}")

  # Right panel: training loss vs. validation loss
  plt.subplot(1, 2, 2)
  plt.plot(history.history['loss'])
  plt.plot(history.history['val_loss'])
  plt.xlabel("Epochs")
  plt.ylabel('Loss')
  plt.legend(['loss', 'val_loss'])
  plt.title(f"{title_prefix}Loss vs. Validation Loss")

  plt.tight_layout()
  plt.show()

# Call the function for each model's history
plot_graphs(model_dense_history, 'accuracy', 'Dense Network')
plot_graphs(model_1LSTM_history, 'accuracy', 'LSTM')
plot_graphs(history_lstm, 'accuracy', 'Bidirectional LSTM')
plot_graphs(model_GRU_history, 'accuracy', 'GRU')
plot_graphs(model_bi_GRU_history, 'accuracy', 'Bidirectional GRU')
plot_graphs(history_conv1d, 'accuracy', 'Conv1D')
plot_graphs(model_USE_history, 'accuracy', 'USE')

In [None]:
all_model_results_sorted = all_model_results.sort_values("f1", ascending=False)
all_model_results_sorted

In [None]:
all_model_results.plot(kind="bar", figsize=(10, 7)).legend(bbox_to_anchor=(1.0, 1.0));

In [None]:
all_model_results.sort_values("f1", ascending=False)["f1"].plot(kind="bar", figsize=(10, 7));

In [None]:
y_true = y_valid.tolist()  # Convert labels to a list
preds = model_GRU.predict(X_valid)
y_probs = preds.squeeze().tolist()  # Store the prediction probabilities as a list
y_preds = tf.round(y_probs).numpy().tolist()

In [None]:
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score

report = classification_report(y_true, y_preds)
print(report)

In [None]:
textx = "pretty much decided kill repent thing done way place selfpity feel remorseful "

In [None]:
textx2 = "i love my birthday"

In [None]:
textx3 = "Hmm not really sure about this one,i'm so depressed"

In [None]:
def predict_on_sentence(model, sentence, category_reverse_mapping):
    """
    Run `model` on a single sentence and print the result.

    Args:
        model: object with a `predict` method that accepts a list of sentences.
        sentence: the text to classify.
        category_reverse_mapping: indexable that maps the rounded prediction
            (an integer) to a label name.

    Prints the predicted label, the raw prediction probability and the
    sentence itself. Nothing is returned.
    """
    probabilities = model.predict([sentence])

    # Round the first (and only) prediction to an integer class index
    rounded = np.round(probabilities).astype(int)
    class_index = rounded[0][0]

    class_name = category_reverse_mapping[class_index]
    probability = probabilities[0][0]

    print(f"Prediction: {class_name}")
    print(f"Prediction probability: {probability}")
    print(f"Text:\n{sentence}")

In [None]:
predict_on_sentence(model=model_GRU, # use the GRU model
                    sentence=textx,
                    category_reverse_mapping=class_names)

In [None]:
predict_on_sentence(model=model_GRU, # use the GRU model
                    sentence=textx2,
                    category_reverse_mapping=class_names)

In [None]:
predict_on_sentence(model=model_GRU, # use the GRU model
                    sentence=textx3,
                    category_reverse_mapping=class_names)

In [None]:
# Validation-set frame pairing each text with its true label, the model's predicted label and
# the model's predicted probability (y_preds / y_probs come from the prediction cell above).
# y_valid is treated as scalar 0/1 labels, matching how it is passed to the binary metrics
# elsewhere in this notebook. The original used np.argmax on each scalar label, which is always
# 0, so every row was given class_names[0] as its target.
val_df = pd.DataFrame({
    "text": X_valid,
    "target": [class_names[int(label)] for label in y_valid],
    "target_label": [int(label) for label in y_valid],
    "pred": [class_names[int(pred)] for pred in y_preds],
    "pred_label": [int(pred) for pred in y_preds],
    # raw probabilities, not the rounded predictions, so sorting by this column is meaningful
    "pred_prob": y_probs
})

val_df.head()

In [None]:
most_wrong = val_df[val_df["target"] != val_df["pred"]].sort_values("pred_prob", ascending=False)
most_wrong[:10]

In [None]:
for row in most_wrong[:10].itertuples(): # loop through the top 10 rows (change the index to view different rows)
  _, text, target, target_label, pred, pred_label, prob = row
  print(f"Target: {target}, Pred: {pred}, Prob: {prob}")
  print(f"Text:\n{text}\n")
  print("----\n")

In [None]:
for row in most_wrong[-10:].itertuples():
  _, text, target, target_label, pred, pred_label, prob = row
  print(f"Target: {target}, Pred: {pred}, Prob: {prob}")
  print(f"Text:\n{text}\n")
  print("----\n")

In [None]:
import time
def pred_timer(model, samples):
  """
  Times how long a model takes to make predictions on samples.

  Args:
  ----
  model = a trained model exposing a `predict` method
  samples = a sized collection of samples to predict on

  Returns:
  ----
  total_time = total elapsed time for model to make predictions on samples
  time_per_pred = time in seconds per single sample
  """
  start_time = time.perf_counter() # get start time
  model.predict(samples) # make predictions
  end_time = time.perf_counter() # get finish time
  total_time = end_time-start_time # calculate how long predictions took to make
  # Divide by the number of samples actually timed (the original divided by the
  # global X_valid, which is only correct when samples happens to be X_valid)
  time_per_pred = total_time/len(samples)
  return total_time, time_per_pred

In [None]:
model_total_pred_time, model_time_per_pred = pred_timer(model_GRU, X_valid)
model_total_pred_time, model_time_per_pred

In [None]:
baseline_total_pred_time, baseline_time_per_pred = pred_timer(model_0, X_valid)
baseline_total_pred_time, baseline_time_per_pred

In [None]:
plt.figure(figsize=(10, 7))
plt.scatter(baseline_time_per_pred, baseline_results["f1"], label="baseline")
plt.scatter(model_time_per_pred, model_GRU_results["f1"], label="model GRU")
plt.legend()
plt.title("F1-score versus time per prediction")
plt.xlabel("Time per prediction")
plt.ylabel("F1-Score");

In [None]:
# Probability of the positive class from the baseline model. Column 1 of predict_proba is
# taken to be class 1 (assumes the baseline's classes are ordered [0, 1] -- TODO confirm).
# The original used np.max over the class axis, which is the confidence of whichever class
# won and therefore cannot be averaged with the sigmoid outputs of the neural models.
baseline_pred_probs = model_0.predict_proba(X_valid)[:, 1]
combined_pred_probs = baseline_pred_probs + tf.squeeze(model_dense_pred_probs, axis=1) + tf.squeeze(model_lstm_pred_probs)
combined_preds = tf.round(combined_pred_probs/3) # average and round the prediction probabilities to get prediction classes
combined_preds[:20]

In [None]:
combined_pred_probs = tf.squeeze(model_GRU_pred_probs, axis=1) + tf.squeeze(model_USE_pred_probs, axis=1) + tf.squeeze(model_lstm_pred_probs)
combined_preds = tf.round(combined_pred_probs/3) # average and round the prediction probabilities to get prediction classes
combined_preds[:20]

In [None]:
ensemble_results = calculate_results(y_valid, combined_preds)
ensemble_results

In [None]:
all_model_results.loc["ensemble_results"] = ensemble_results

In [None]:
# Bring the ensemble's accuracy onto the same scale as the other rows (whose accuracy was
# already divided by 100). A single .loc[row, column] assignment is used because the chained
# form .loc[row][column] = ... may write to a temporary copy and leave the frame unchanged.
# NOTE: running this cell more than once divides the value again.
all_model_results.loc["ensemble_results", "accuracy"] = all_model_results.loc["ensemble_results", "accuracy"]/100
all_model_results

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Assuming you have a DataFrame named 'all_model_results' with accuracy, precision, recall, and F1 values

# Define custom colors for each model
model_colors = {
    "baseline": '#1f77b4',
    "Simple Dense": '#ff7f0e',
    "LSTM": '#2ca02c',
    "Bidirectional LSTM": '#d62728',
    "GRU": '#9467bd',
    "Bidirectional GRU": '#8c564b',
    "Conv1D": '#e377c2',
    "USE": '#7f7f7f',
    "ensemble_results": '#17becf'
}

# Create a horizontal bar plot for each metric
metrics = ['accuracy', 'precision', 'recall', 'f1']

# Bar positions and colours depend only on the rows of the results table,
# so they are computed once and reused for every metric figure
index = range(len(all_model_results))
bar_positions = list(index)
bar_colors = [model_colors[model] for model in all_model_results.index]

for metric in metrics:
    fig, ax = plt.subplots(figsize=(10, 6))  # one figure per metric
    metric_values = all_model_results[metric]

    # Horizontal bars, one per model
    ax.barh(bar_positions, metric_values, color=bar_colors)

    # Write each value just past the end of its bar
    for row, value in enumerate(metric_values):
        ax.text(value + 0.01, row, f'{value:.2f}', ha='center', va='center', fontsize=12, fontweight='bold', color='black')

    # Model names as y tick labels
    ax.set_yticks(index)
    ax.set_yticklabels(all_model_results.index, fontsize=12, fontweight='bold')

    ax.set_title(f'{metric.capitalize()} Scores For Various Models', fontsize=16, fontweight='bold')
    ax.set_xlabel('Percentage', fontsize=14, fontweight='bold')
    ax.set_ylabel('Models', fontsize=14, fontweight='bold')
    ax.grid(axis='x', linestyle='--', alpha=0.6)
    ax.invert_yaxis()  # first model at the top
    fig.tight_layout()

    # Display this metric's figure
    plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming you have a DataFrame named 'all_model_results' with accuracy, precision, recall, and F1 values

# Grouped horizontal bar chart: for every model, one bar per evaluation metric
fig, ax = plt.subplots(figsize=(10, 8))
bar_width = 0.2
index = range(len(all_model_results))
plotted_metrics = ['accuracy', 'precision', 'recall', 'f1']

# Draw one series of bars per metric, each shifted down by a multiple of the bar width
for i, metric in enumerate(plotted_metrics):
    offset = i * bar_width
    ax.barh([pos + offset for pos in index], all_model_results[metric], bar_width, label=metric.capitalize())

# Write each value just past the end of its bar
for i, metric in enumerate(plotted_metrics):
    offset = i * bar_width
    for j, value in enumerate(all_model_results[metric]):
        ax.text(value + 0.01, j + offset, f'{value:.2f}', ha='center', va='center')

# Centre the model-name tick on each group of four bars
ax.set_yticks([pos + 1.5 * bar_width for pos in index])
ax.set_yticklabels(all_model_results.index)

ax.set_title('Evaluation Scores For Various Models')
ax.set_xlabel('Percentage')
ax.set_ylabel('Models')
ax.legend(loc='upper left')
ax.grid(axis='x', linestyle='--', alpha=0.6)
ax.invert_yaxis()  # first model at the top
fig.tight_layout()
plt.show()


# **Depression and Anxiety Prediction**

In [None]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score, recall_score, f1_score,roc_auc_score,classification_report,mean_squared_error

In [None]:
data=pd.read_csv('/kaggle/input/depression-anxiety-stress-scales-responses/data.csv',delimiter='\t')
data.head()

In [None]:
# Work on a copy so the raw `data` frame stays untouched
data_1=data.copy()
# Missing `major` values become the literal label 'No Degree'
data_1['major']=data_1['major'].replace(np.nan,'No Degree')
# Among the first 126 columns, collect the names containing 'E' and the names containing 'I'
# (presumably the per-question elapsed-time and position columns -- TODO confirm against the codebook).
# NOTE(review): `time` rebinds the name of the `time` module imported earlier in this notebook,
# so re-running pred_timer (which calls time.perf_counter) after this cell would fail.
time = [i for i in data_1.iloc[:,0:126] if  'E' in i]
position = [i for i in data_1.iloc[:,0:126] if  'I' in i]
data_1=data_1.drop(position,axis=1)
data_1=data_1.drop(time,axis=1)
# Drop the columns currently at positions 43-46 (position-based, so it depends on the drops above)
data_1=data_1.drop(data_1.iloc[:,43:47],axis=1)
data_2=data_1.copy()
# Drop the columns currently at positions 53-68
data_2=data_2.drop(data_2.iloc[:,53:69],axis=1)
# Replace every 0 anywhere in the frame with 3
# NOTE(review): this applies to all columns, not only the survey items -- confirm this is intended
data_2=data_2.replace(to_replace=0,value=3)
# Give the ten TIPI personality items descriptive names
data_2=data_2.rename(columns={'TIPI1':'Extraverted-enthusiastic','TIPI2':'Critical-quarrelsome',
                            'TIPI3':'Dependable-self_disciplined','TIPI4':'Anxious-easily upset',
                            'TIPI5':'Open to new experiences-complex','TIPI6':'Reserved-quiet',
                            'TIPI7':'Sympathetic-warm','TIPI8':'Disorganized-careless','TIPI9':'Calm-emotionally_stable',
                            'TIPI10':'Conventional-uncreative'})
print('Shape',data_2.shape)
print('Attributes',data_2.columns)

In [None]:
data_2=data_2.replace([np.inf, -np.inf], np.nan)
data_2=data_2.dropna()

In [None]:
def condition(x):
    """Map an age to the age-group label used in the `Age_Groups` column.

    Bands (upper bounds inclusive): <=10 'Under 10', <=16 'Primary Children',
    <=21 'Secondary Children', <=35 'Adults', <=48 'Elder Adults',
    above 48 'Older People'. Returns None when x compares false against every
    bound (e.g. NaN).

    The original returned ' Primary Children' with a leading space, which
    `change_var` never matched, so that whole age band was encoded as missing
    and later dropped. Cascading upper bounds also close the gaps the original
    left for non-integer ages (e.g. 16.5).
    """
    if x<=10:
        return 'Under 10'
    if x<=16:
        return 'Primary Children'
    if x<=21:
        return 'Secondary Children'
    if x<=35:
        return 'Adults'
    if x<=48:
        return 'Elder Adults'
    if x>48:
        return 'Older People'
    return None

data_2['Age_Groups']=data_2['age'].apply(condition)
data_2.head()

In [None]:
# Everything from column position 42 onward is kept aside and merged back onto each subscale later
new_data=data_2.iloc[:,42:]
# Answer columns are named Q<number>A. A raw string is used so that \d reaches the regex
# engine as written instead of being parsed as an (invalid) string escape sequence.
data_3=data_2.filter(regex=r'Q\d{1,2}A')
data_3.head()

In [None]:
def sub(data_3):
    """Return a new frame in which every value of `data_3` is reduced by 1."""
    shifted = data_3.subtract(other=1, axis="columns")
    return shifted
data_3=sub(data_3)
# Question numbers that make up each DASS subscale
DASS_keys = {
    'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
}

# Turn each question number into its answer-column name, e.g. 3 -> 'Q3A'
Dep = ['Q'+str(number)+'A' for number in DASS_keys["Depression"]]
Stress = ['Q'+str(number)+'A' for number in DASS_keys["Stress"]]
Anx = ['Q'+str(number)+'A' for number in DASS_keys["Anxiety"]]

# One frame per subscale containing only that subscale's answer columns
depression = data_3.filter(items=Dep)
stress = data_3.filter(items=Stress)
anxiety = data_3.filter(items=Anx)

In [None]:
def scores(source):
    """Return a copy of `source` with a 'Total_Count' column holding each row's sum.

    Unlike the original, the input frame is not modified, and an existing
    'Total_Count' column is excluded from the sum, so calling this again on an
    already-scored frame (e.g. when the cell is re-run) does not double the total.

    Args:
        source: DataFrame whose columns are the items to add up.

    Returns:
        A new DataFrame with the same columns plus 'Total_Count'.
    """
    item_cols = [name for name in source.columns if name != 'Total_Count']
    return source.assign(Total_Count=source[item_cols].sum(axis=1))
depression=scores(depression)
stress=scores(stress)
anxiety=scores(anxiety)

In [None]:
Depression=pd.merge(depression,new_data,how='left',left_index=True,right_index=True)
Depression.head()

In [None]:
Stress=pd.merge(stress,new_data,how='inner',left_index=True,right_index=True)
Stress.head()

In [None]:
Anxiety=pd.merge(anxiety,new_data,how='inner',left_index=True,right_index=True)
Anxiety.head()

In [None]:
def condition(x):
    """Classify a depression total score into a severity label.

    Bands: <=9 'Normal', 10-13 'Mild', 14-20 'Moderate', 21-27 'Severe',
    >=28 'Extremely Severe'. The original tested `x>28`, so a score of exactly
    28 matched no band, came back as None and was later dropped as missing.
    """
    if x<=9:
        return 'Normal'
    if  10<=x<=13:
        return 'Mild'
    if 14<=x<=20:
        return 'Moderate'
    if 21<=x<=27:
        return 'Severe'
    if x>=28:
        return 'Extremely Severe'

Depression['Condition']=Depression['Total_Count'].apply(condition)
Depression.head()

In [None]:
def condition(x):
    """Classify a stress total score into a severity label.

    Bands: <=14 'Normal', 15-18 'Mild', 19-25 'Moderate', 26-33 'Severe',
    >=34 'Extremely Severe'. Values matching no band yield None.
    """
    if x<=14:
        return 'Normal'
    # Closed intervals (both ends inclusive) for the middle bands
    middle_bands = ((15, 18, 'Mild'), (19, 25, 'Moderate'), (26, 33, 'Severe'))
    for low, high, label in middle_bands:
        if low<=x<=high:
            return label
    return 'Extremely Severe' if x>=34 else None

Stress['Condition']=Stress['Total_Count'].apply(condition)

Stress.head()

In [None]:
def condition(x):
    """Classify an anxiety total score into a severity label.

    Bands: <=7 'Normal', 8-9 'Mild', 10-14 'Moderate', 15-19 'Severe',
    >19 'Extremely Severe'. Values matching no band yield None.
    """
    if x<=7:
        return 'Normal'
    # Closed intervals (both ends inclusive) for the middle bands
    middle_bands = ((8, 9, 'Mild'), (10, 14, 'Moderate'), (15, 19, 'Severe'))
    for low, high, label in middle_bands:
        if low<=x<=high:
            return label
    return 'Extremely Severe' if x>19 else None

Anxiety['Condition']=Anxiety['Total_Count'].apply(condition)
Anxiety.head()

In [None]:
def change_var(x):
    """Encode an age-group label as an integer code.

    'Primary Children' -> 0, 'Secondary Children' -> 1, 'Adults' -> 2,
    'Elder Adults' -> 3, 'Older People' -> 4. Any other value gives None.
    """
    codes = {
        'Primary Children': 0,
        'Secondary Children': 1,
        'Adults': 2,
        'Elder Adults': 3,
        'Older People': 4,
    }
    return codes.get(x)



# Replace the age-group labels with their integer codes in all three frames.
# NOTE(review): running this cell a second time turns every code into None,
# because change_var only recognises the original string labels.
for frame in (Depression, Stress, Anxiety):
    frame['Age_Groups'] = frame['Age_Groups'].map(change_var)

In [None]:
# Discard every row that has a missing value in any column.
Depression = Depression.dropna(axis=0, how='any')
Stress = Stress.dropna(axis=0, how='any')
Anxiety = Anxiety.dropna(axis=0, how='any')

In [None]:
# Remove the total score and the raw country/age columns from all three frames.
Depression = Depression.drop(['Total_Count', 'country', 'age'], axis=1)
Stress = Stress.drop(['Total_Count', 'country', 'age'], axis=1)
Anxiety = Anxiety.drop(['Total_Count', 'country', 'age'], axis=1)

In [None]:
# Depression features: the 14 depression answer columns followed by the
# remaining respondent columns, in this fixed order.
item_cols = ['Q3A', 'Q5A', 'Q10A', 'Q13A', 'Q16A', 'Q17A', 'Q21A', 'Q24A', 'Q26A',
             'Q31A', 'Q34A', 'Q37A', 'Q38A', 'Q42A']
extra_cols = ['Extraverted-enthusiastic', 'Critical-quarrelsome',
              'Dependable-self_disciplined', 'Anxious-easily upset',
              'Open to new experiences-complex', 'Reserved-quiet', 'Sympathetic-warm',
              'Disorganized-careless', 'Calm-emotionally_stable',
              'Conventional-uncreative', 'education', 'urban', 'gender', 'engnat',
              'screensize', 'uniquenetworklocation', 'hand', 'religion',
              'orientation', 'race', 'voted', 'married', 'familysize',
              'Age_Groups']
X = Depression.loc[:, item_cols + extra_cols]
# The target is kept as a one-column DataFrame.
y = Depression.loc[:, ['Condition']]

# 90/10 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0)

# Learn the min/max range on the training rows only, then apply it to both parts.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Report the feature and target shapes of each part of the split.
for label, features, target in (('Training Set:', X_train, y_train),
                                ('Test Set:', X_test, y_test)):
    print(label, features.shape, target.shape)

In [None]:
# Distribution of the scaled training features.
sns.displot(data=X_train_scaled)

In [None]:
# Distribution of the scaled test features.
sns.displot(data=X_test_scaled)

In [None]:
# Random forest on the depression split: fit on the scaled training rows and
# evaluate on the held-out test rows.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by recent scikit-learn releases.
RanFor=RandomForestClassifier(n_estimators=190,min_samples_split=3,min_samples_leaf=1,max_depth=160,max_features='sqrt').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
RanFor_pred=RanFor.predict(X_test_scaled)
Acc_ran=round(accuracy_score(y_test,RanFor_pred),3)
f1_ran=round(f1_score(y_test,RanFor_pred,average='weighted'),3)
recall_ran=round(recall_score(y_test,RanFor_pred,average='weighted'),3)
precision_ran=round(precision_score(y_test,RanFor_pred,average='weighted'),3)
print('Accuracy:',Acc_ran)
print('F1_Score:',f1_ran)
print('Recall_Score:',recall_ran)
print('Precision_Score:',precision_ran)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',round(np.mean(cross_val_score(RanFor, X_train_scaled, y_train, cv = 6)),3))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=RanFor_pred)
print(classification)
# Confusion matrix built from this model's own test predictions.
confusion = confusion_matrix(y_test, RanFor_pred)


fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Decision tree on the depression split: fit on the scaled training rows and
# evaluate on the held-out test rows.
DT=DecisionTreeClassifier(criterion='entropy',splitter='best',min_samples_split=9,min_samples_leaf=2,max_depth=100).fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
DT_pred=DT.predict(X_test_scaled)
Acc_dt=round(accuracy_score(y_test,DT_pred),3)
f1_dt=round(f1_score(y_test,DT_pred,average='weighted'),3)
recall_dt=round(recall_score(y_test,DT_pred,average='weighted'),3)
precision_dt=round(precision_score(y_test,DT_pred,average='weighted'),3)
print('Accuracy:',Acc_dt)
print('F1_Score:',f1_dt)
print('Recall_Score:',recall_dt)
print('Precision_Score:',precision_dt)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',round(np.mean(cross_val_score(DT, X_train_scaled, y_train, cv = 6)),3))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=DT_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, DT_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Gaussian naive Bayes on the depression split: fit on the scaled training
# rows and evaluate on the held-out test rows.
gb=GaussianNB().fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
gb_pred=gb.predict(X_test_scaled)
Acc_gb=round(accuracy_score(y_test,gb_pred),3)
f1_gb=round(f1_score(y_test,gb_pred,average='weighted'),3)
recall_gb=round(recall_score(y_test,gb_pred,average='weighted'),3)
precision_gb=round(precision_score(y_test,gb_pred,average='weighted'),3)
print('Accuracy:',Acc_gb)
print('F1_Score:',f1_gb)
print('Recall_Score:',recall_gb)
print('Precision_Score:',precision_gb)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',round(np.mean(cross_val_score(gb, X_train_scaled, y_train, cv = 6)),3))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=gb_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, gb_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# k-nearest neighbours (k=15) on the depression split: fit on the scaled
# training rows and evaluate on the held-out test rows.
knn=KNeighborsClassifier(n_neighbors=15).fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
knn_pred=knn.predict(X_test_scaled)
Acc_knn=round(accuracy_score(y_test,knn_pred),3)
f1_knn=round(f1_score(y_test,knn_pred,average='weighted'),3)
recall_knn=round(recall_score(y_test,knn_pred,average='weighted'),3)
precision_knn=round(precision_score(y_test,knn_pred,average='weighted'),3)
print('Accuracy:',Acc_knn)
print('F1_Score:',f1_knn)
print('Recall_Score:',recall_knn)
print('Precision_Score:',precision_knn)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',round(np.mean(cross_val_score(knn, X_train_scaled, y_train, cv = 6)),3))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=knn_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, knn_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
from sklearn.svm import SVC
# RBF-kernel SVM on the depression split: fit on the scaled training rows and
# evaluate on the held-out test rows.
svm=SVC(C=100,gamma=0.1,kernel='rbf').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
svm_pred=svm.predict(X_test_scaled)
Acc_svm=round(accuracy_score(y_test,svm_pred),3)
f1_svm=round(f1_score(y_test,svm_pred,average='weighted'),3)
recall_svm=round(recall_score(y_test,svm_pred,average='weighted'),3)
precision_svm=round(precision_score(y_test,svm_pred,average='weighted'),3)
print('Accuracy:',Acc_svm)
print('F1_Score:',f1_svm)
print('Recall_Score:',recall_svm)
print('Precision_Score:',precision_svm)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',round(np.mean(cross_val_score(svm, X_train_scaled, y_train, cv = 6)),3))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=svm_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, svm_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Summary table for the depression models: one row per model, every metric
# converted from a fraction to a percentage.
Result = {
    'Model': ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM'],
    'Accuracy(%)': [score*100 for score in (Acc_ran, Acc_dt, Acc_gb, Acc_knn, Acc_svm)],
    'F1_Score(%)': [score*100 for score in (f1_ran, f1_dt, f1_gb, f1_knn, f1_svm)],
    'Precision(%)': [score*100 for score in (precision_ran, precision_dt, precision_gb, precision_knn, precision_svm)],
    'Recall(%)': [score*100 for score in (recall_ran, recall_dt, recall_gb, recall_knn, recall_svm)],
}
Result_Depression = pd.DataFrame(data=Result)

In [None]:
# Stress features: the 14 stress answer columns followed by the remaining
# respondent columns, in this fixed order.
item_cols = ['Q1A', 'Q6A', 'Q8A', 'Q11A', 'Q12A', 'Q14A', 'Q18A', 'Q22A', 'Q27A',
             'Q29A', 'Q32A', 'Q33A', 'Q35A', 'Q39A']
extra_cols = ['Extraverted-enthusiastic', 'Critical-quarrelsome',
              'Dependable-self_disciplined', 'Anxious-easily upset',
              'Open to new experiences-complex', 'Reserved-quiet', 'Sympathetic-warm',
              'Disorganized-careless', 'Calm-emotionally_stable',
              'Conventional-uncreative', 'education', 'urban', 'gender', 'engnat',
              'screensize', 'uniquenetworklocation', 'hand', 'religion',
              'orientation', 'race', 'voted', 'married', 'familysize',
              'Age_Groups']
X = Stress.loc[:, item_cols + extra_cols]
# The target is kept as a one-column DataFrame.
y = Stress.loc[:, ['Condition']]

# 90/10 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0)

# Learn the min/max range on the training rows only, then apply it to both parts.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest (default settings) on the stress split: fit on the scaled
# training rows and evaluate on the held-out test rows.
ran=RandomForestClassifier().fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
ran_pred=ran.predict(X_test_scaled)
acc_ran=round(accuracy_score(y_test,ran_pred),3)
F1_ran=round(f1_score(y_test,ran_pred,average='weighted'),3)
prec_ran=round(precision_score(y_test,ran_pred,average='weighted'),3)
rec_ran=round(recall_score(y_test,ran_pred,average='weighted'),3)
print('Accuracy:',acc_ran)
print('F1_Score:',F1_ran)
print('Recall_Score:',rec_ran)
print('Precision_Score:',prec_ran)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(ran, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=ran_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, ran_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Decision tree on the stress split: fit on the scaled training rows and
# evaluate on the held-out test rows.
dt=DecisionTreeClassifier(criterion= 'entropy',max_depth= 80, min_samples_leaf= 9, min_samples_split=9).fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
dt_pred=dt.predict(X_test_scaled)
acc_dt=round(accuracy_score(y_test,dt_pred),3)
F1_dt=round(f1_score(y_test,dt_pred,average='weighted'),3)
prec_dt=round(precision_score(y_test,dt_pred,average='weighted'),3)
rec_dt=round(recall_score(y_test,dt_pred,average='weighted'),3)
print('Accuracy:',acc_dt)
print('F1_Score:',F1_dt)
print('Recall_Score:',rec_dt)
print('Precision_Score:',prec_dt)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(dt, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=dt_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, dt_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Gaussian naive Bayes on the stress split: fit on the scaled training rows
# and evaluate on the held-out test rows.
Gb=GaussianNB().fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Gb_pred=Gb.predict(X_test_scaled)
acc_gb=round(accuracy_score(y_test,Gb_pred),3)
F1_gb=round(f1_score(y_test,Gb_pred,average='weighted'),3)
prec_gb=round(precision_score(y_test,Gb_pred,average='weighted'),3)
rec_gb=round(recall_score(y_test,Gb_pred,average='weighted'),3)
print('Accuracy:',acc_gb)
print('F1_Score:',F1_gb)
print('Recall_Score:',rec_gb)
print('Precision_Score:',prec_gb)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Gb, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Gb_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Gb_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# k-nearest neighbours (k=15) on the stress split: fit on the scaled training
# rows and evaluate on the held-out test rows.
Knn=KNeighborsClassifier(n_neighbors=15).fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Knn_pred=Knn.predict(X_test_scaled)
acc_knn=round(accuracy_score(y_test,Knn_pred),3)
F1_knn=round(f1_score(y_test,Knn_pred,average='weighted'),3)
prec_knn=round(precision_score(y_test,Knn_pred,average='weighted'),3)
rec_knn=round(recall_score(y_test,Knn_pred,average='weighted'),3)
print('Accuracy:',acc_knn)
print('F1_Score:',F1_knn)
print('Recall_Score:',rec_knn)
print('Precision_Score:',prec_knn)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Knn, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Knn_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Knn_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Sigmoid-kernel SVM on the stress split: fit on the scaled training rows and
# evaluate on the held-out test rows.
Svm=SVC(C=10, gamma=0.01, kernel='sigmoid').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Svm_pred=Svm.predict(X_test_scaled)
acc_svm=round(accuracy_score(y_test,Svm_pred),3)
F1_svm=round(f1_score(y_test,Svm_pred,average='weighted'),3)
prec_svm=round(precision_score(y_test,Svm_pred,average='weighted'),3)
rec_svm=round(recall_score(y_test,Svm_pred,average='weighted'),3)
print('Accuracy:',acc_svm)
print('F1_Score:',F1_svm)
print('Recall_Score:',rec_svm)
print('Precision_Score:',prec_svm)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Svm, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Svm_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Svm_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Summary table for the stress models: one row per model, every metric
# converted from a fraction to a percentage.
Result_1 = {
    'Model': ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM'],
    'Accuracy(%)': [score*100 for score in (acc_ran, acc_dt, acc_gb, acc_knn, acc_svm)],
    'F1_Score(%)': [score*100 for score in (F1_ran, F1_dt, F1_gb, F1_knn, F1_svm)],
    'Precision(%)': [score*100 for score in (prec_ran, prec_dt, prec_gb, prec_knn, prec_svm)],
    'Recall(%)': [score*100 for score in (rec_ran, rec_dt, rec_gb, rec_knn, rec_svm)],
}
Result_Stress = pd.DataFrame(data=Result_1)

In [None]:
# Anxiety features: the 14 anxiety answer columns followed by the remaining
# respondent columns, in this fixed order.
item_cols = ['Q2A', 'Q4A', 'Q7A', 'Q9A', 'Q15A', 'Q19A', 'Q20A', 'Q23A', 'Q25A',
             'Q28A', 'Q30A', 'Q36A', 'Q40A', 'Q41A']
extra_cols = ['Extraverted-enthusiastic', 'Critical-quarrelsome',
              'Dependable-self_disciplined', 'Anxious-easily upset',
              'Open to new experiences-complex', 'Reserved-quiet', 'Sympathetic-warm',
              'Disorganized-careless', 'Calm-emotionally_stable',
              'Conventional-uncreative', 'education', 'urban', 'gender', 'engnat',
              'screensize', 'uniquenetworklocation', 'hand', 'religion',
              'orientation', 'race', 'voted', 'married', 'familysize',
              'Age_Groups']
X = Anxiety.loc[:, item_cols + extra_cols]
# The target is kept as a one-column DataFrame.
y = Anxiety.loc[:, ['Condition']]

# 90/10 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=0)

# Learn the min/max range on the training rows only, then apply it to both parts.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Random forest (entropy criterion) on the anxiety split: fit on the scaled
# training rows and evaluate on the held-out test rows.
rand=RandomForestClassifier(criterion='entropy').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
rand_pred=rand.predict(X_test_scaled)
accu_ran=round(accuracy_score(y_test,rand_pred),3)
f1_score_ran=round(f1_score(y_test,rand_pred,average='weighted'),3)
Precision_ran=round(precision_score(y_test,rand_pred,average='weighted'),3)
Recall_ran=round(recall_score(y_test,rand_pred,average='weighted'),3)
print('Accuracy:',accu_ran)
print('F1_Score:',f1_score_ran)
# Each label is printed next to the metric it names.
print('Recall_Score:',Recall_ran)
print('Precision_Score:',Precision_ran)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(rand, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=rand_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, rand_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Decision tree on the anxiety split: fit on the scaled training rows and
# evaluate on the held-out test rows.
Dt=DecisionTreeClassifier(criterion='entropy',max_depth=100,min_samples_leaf=9,min_samples_split=3).fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Dt_pred=Dt.predict(X_test_scaled)
accu_dt=round(accuracy_score(y_test,Dt_pred),3)
f1_score_dt=round(f1_score(y_test,Dt_pred,average='weighted'),3)
Precision_dt=round(precision_score(y_test,Dt_pred,average='weighted'),3)
Recall_dt=round(recall_score(y_test,Dt_pred,average='weighted'),3)
print('Accuracy:',accu_dt)
print('F1_Score:',f1_score_dt)
# Each label is printed next to the metric it names.
print('Recall_Score:',Recall_dt)
print('Precision_Score:',Precision_dt)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Dt, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Dt_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Dt_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Gaussian naive Bayes on the anxiety split: fit on the scaled training rows
# and evaluate on the held-out test rows.
GB=GaussianNB().fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
GB_pred=GB.predict(X_test_scaled)
accu_gb=round(accuracy_score(y_test,GB_pred),3)
f1_score_gb=round(f1_score(y_test,GB_pred,average='weighted'),3)
Precision_gb=round(precision_score(y_test,GB_pred,average='weighted'),3)
Recall_gb=round(recall_score(y_test,GB_pred,average='weighted'),3)
print('Accuracy:',accu_gb)
print('F1_Score:',f1_score_gb)
# Each label is printed next to the metric it names.
print('Recall_Score:',Recall_gb)
print('Precision_Score:',Precision_gb)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(GB, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=GB_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, GB_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Distance-weighted k-nearest neighbours (k=19) on the anxiety split: fit on
# the scaled training rows and evaluate on the held-out test rows.
Knnr=KNeighborsClassifier(n_neighbors=19,weights='distance').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Knnr_pred=Knnr.predict(X_test_scaled)
accu_knn=round(accuracy_score(y_test,Knnr_pred),3)
f1_score_knn=round(f1_score(y_test,Knnr_pred,average='weighted'),3)
Precision_knn=round(precision_score(y_test,Knnr_pred,average='weighted'),3)
Recall_knn=round(recall_score(y_test,Knnr_pred,average='weighted'),3)
print('Accuracy:',accu_knn)
print('F1_Score:',f1_score_knn)
# Each label is printed next to the metric it names.
print('Recall_Score:',Recall_knn)
print('Precision_Score:',Precision_knn)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Knnr, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Knnr_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Knnr_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# RBF-kernel SVM on the anxiety split: fit on the scaled training rows and
# evaluate on the held-out test rows.
Svmc=SVC(C=10,gamma=0.1,kernel='rbf').fit(X_train_scaled,y_train)
# Predict once and reuse the result for every metric below.
Svmc_pred=Svmc.predict(X_test_scaled)
accu_svm=round(accuracy_score(y_test,Svmc_pred),3)
f1_score_svm=round(f1_score(y_test,Svmc_pred,average='weighted'),3)
Precision_svm=round(precision_score(y_test,Svmc_pred,average='weighted'),3)
Recall_svm=round(recall_score(y_test,Svmc_pred,average='weighted'),3)
print('Accuracy:',accu_svm)
print('F1_Score:',f1_score_svm)
# Each label is printed next to the metric it names.
print('Recall_Score:',Recall_svm)
print('Precision_Score:',Precision_svm)
# 6-fold cross-validation on the training split.
print('Cross Validation Score:',(np.mean(cross_val_score(Svmc, X_train_scaled, y_train, cv = 6))))
classification=classification_report(
    digits=4,
    y_true=y_test,
    y_pred=Svmc_pred)
print(classification)
# Build the confusion matrix from this model's own predictions so the plot
# cannot show a stale matrix left behind by another cell.
confusion = confusion_matrix(y_test, Svmc_pred)
fig, ax = plt.subplots(figsize=(10, 10))
im = ax.imshow(confusion, cmap=plt.cm.Blues)
# Write each cell's count on top of the heat map (rows = actual, cols = predicted).
for i in range(len(confusion)):
    for j in range(len(confusion[i])):
        text = ax.text(j, i, str(confusion[i, j]),
                       ha="center", va="center", color="black")

plt.colorbar(im)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Summary table for the anxiety models: one row per model, every metric
# converted from a fraction to a percentage.
Result_2 = {
    'Model': ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM'],
    'Accuracy(%)': [score*100 for score in (accu_ran, accu_dt, accu_gb, accu_knn, accu_svm)],
    'F1_Score(%)': [score*100 for score in (f1_score_ran, f1_score_dt, f1_score_gb, f1_score_knn, f1_score_svm)],
    'Precision(%)': [score*100 for score in (Precision_ran, Precision_dt, Precision_gb, Precision_knn, Precision_svm)],
    'Recall(%)': [score*100 for score in (Recall_ran, Recall_dt, Recall_gb, Recall_knn, Recall_svm)],
}
Result_Anxiety = pd.DataFrame(data=Result_2)

In [None]:
# Display the per-model metric table for the depression models.
Result_Depression

In [None]:
# Display the per-model metric table for the stress models.
Result_Stress

In [None]:
# Display the per-model metric table for the anxiety models.
Result_Anxiety

In [None]:
# Grouped bar chart of the four metrics for every depression model.
splot = Result_Depression.plot(
    x='Model',
    y=['Accuracy(%)', 'F1_Score(%)', 'Precision(%)', 'Recall(%)'],
    kind='bar',
    figsize=(15, 10),
    cmap='Spectral',
    width=0.9,
)
# Put the rounded value just above the top of each bar.
for p in splot.patches:
    bar_top = p.get_height()
    bar_mid = p.get_x() + p.get_width() / 2.
    splot.annotate(format(round(bar_top), '.0f'),
                   (bar_mid, bar_top),
                   ha='center', va='center',
                   xytext=(0,7),
                   textcoords='offset points')
splot.set_title('Evaluation Scores For Depression Data for various Models')
splot.set_ylabel('Percentage')
splot.set_xlabel('Models')
splot.legend(loc='upper left')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    splot.spines[pos].set_visible(False)
plt.show()

In [None]:
# Grouped bar chart of the four metrics for every stress model.
slot = Result_Stress.plot(
    x='Model',
    y=['Accuracy(%)', 'F1_Score(%)', 'Precision(%)', 'Recall(%)'],
    kind='bar',
    figsize=(15, 10),
    width=0.9,
)
# Put the rounded value just above the top of each bar.
for p in slot.patches:
    bar_top = p.get_height()
    bar_mid = p.get_x() + p.get_width() / 2.
    slot.annotate(format(round(bar_top), '.0f'),
                  (bar_mid, bar_top),
                  ha='center', va='center',
                  xytext=(0, 7),
                  textcoords='offset points')
slot.set_title('Evaluation Scores For Stress Data for various Models')
slot.set_ylabel('Percentage')
slot.set_xlabel('Models')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    slot.spines[pos].set_visible(False)
plt.show()

In [None]:
# Grouped bar chart of the four metrics for every anxiety model.
slot = Result_Anxiety.plot(
    x='Model',
    y=['Accuracy(%)', 'F1_Score(%)', 'Precision(%)', 'Recall(%)'],
    kind='bar',
    figsize=(15, 10),
    cmap='Accent',
    width=0.9,
)
# Put the rounded value just above the top of each bar.
for p in slot.patches:
    bar_top = p.get_height()
    bar_mid = p.get_x() + p.get_width() / 2.
    slot.annotate(format(round(bar_top), '.0f'),
                  (bar_mid, bar_top),
                  ha='center', va='center',
                  xytext=(0, 7),
                  textcoords='offset points')
slot.set_title('Evaluation Scores For Anxiety Data for various Models')
slot.set_ylabel('Percentage')
slot.set_xlabel('Models')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    slot.spines[pos].set_visible(False)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Side-by-side accuracy bars for the depression and anxiety models.
# NOTE(review): `X` below rebinds the name used for the feature matrix in the
# modelling cells; rerun the split cell before refitting any model.
x = ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM']
width = 0.35
X = np.arange(len(x))
plt.figure(figsize=(10, 10))

# Define different colors for Depression and Anxiety datasets
depression_color = 'r'
anxiety_color = 'b'

# Plot the accuracy scores for Depression and Anxiety with different colors
bar1 = plt.bar(X, Result_Depression['Accuracy(%)'], width, color=depression_color, label='Depression')
bar2 = plt.bar(X + width, Result_Anxiety['Accuracy(%)'], width, color=anxiety_color, label='Anxiety', alpha=0.7)

# Annotate the bars with the corresponding values. The value is rounded to
# the nearest whole number; int() would truncate (95.7 -> 95) and would turn
# float artefacts such as 56.99999999999999 into 56.
for bar in [*bar1, *bar2]:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, format(bar.get_height(), '.0f'),
             ha='center', color='m', fontsize=11)

# Tick labels sit at the centre of each pair of bars.
plt.xticks(X + width / 2, x)
plt.legend(loc='upper left')
plt.title('Accuracy Scores for Different Models for the Depression and Anxiety Datasets')
plt.ylabel('Percentage', fontsize=17, color='red')
plt.xlabel('Models', fontsize=15, color='red')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    plt.gca().spines[pos].set_visible(False)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Side-by-side F1-score bars for the depression and anxiety models.
# NOTE(review): `X` below rebinds the name used for the feature matrix in the
# modelling cells; rerun the split cell before refitting any model.
x = ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM']
width = 0.25
X = np.arange(len(x))
plt.figure(figsize=(10, 10))

# Define different colors for Depression and Anxiety datasets
depression_color = 'mistyrose'
anxiety_color = 'plum'

# Plot the F1 Score for Depression and Anxiety with different colors
bar1 = plt.bar(X, Result_Depression['F1_Score(%)'], width, color=depression_color, label='Depression')
bar2 = plt.bar(X - width, Result_Anxiety['F1_Score(%)'], width, color=anxiety_color, label='Anxiety')

# Annotate the bars with the corresponding values. The value is rounded to
# the nearest whole number; int() would truncate (95.7 -> 95) and would turn
# float artefacts such as 56.99999999999999 into 56.
for bar in [*bar1, *bar2]:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, format(bar.get_height(), '.0f'),
             ha='center', color='crimson', fontsize=11)

# The pair of bars spans X - 1.5*width .. X + 0.5*width, so its centre is
# X - width / 2; put the model name there rather than under one bar.
plt.xticks(X - width / 2, x)
plt.legend(loc='upper left')
plt.title('F1 Scores for Different Models for the Depression and Anxiety Datasets')
plt.ylabel('Percentage', fontsize=17, color='red')
plt.xlabel('Models', fontsize=15, color='red')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    plt.gca().spines[pos].set_visible(False)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Side-by-side precision bars for the depression and anxiety models.
# NOTE(review): `X` below rebinds the name used for the feature matrix in the
# modelling cells; rerun the split cell before refitting any model.
x = ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM']
width = 0.25
X = np.arange(len(x))
plt.figure(figsize=(10, 10))

# Define different colors for Depression and Anxiety datasets
depression_color = 'black'
anxiety_color = 'pink'

# Plot the Precision Score for Depression and Anxiety with different colors
bar1 = plt.bar(X, Result_Depression['Precision(%)'], width, color=depression_color, label='Depression')
bar2 = plt.bar(X - width, Result_Anxiety['Precision(%)'], width, color=anxiety_color, label='Anxiety')

# Annotate the bars with the corresponding values. The value is rounded to
# the nearest whole number; int() would truncate (95.7 -> 95) and would turn
# float artefacts such as 56.99999999999999 into 56.
for bar in [*bar1, *bar2]:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, format(bar.get_height(), '.0f'),
             ha='center', color='gray', fontsize=11)

# The pair of bars spans X - 1.5*width .. X + 0.5*width, so its centre is
# X - width / 2; put the model name there rather than under one bar.
plt.xticks(X - width / 2, x)
plt.legend(loc='upper left')
plt.title('Precision Scores for Different Models for the Depression and Anxiety Datasets')
plt.ylabel('Percentage', fontsize=17, color='red')
plt.xlabel('Models', fontsize=15, color='red')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    plt.gca().spines[pos].set_visible(False)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Side-by-side recall bars for the depression and anxiety models.
# NOTE(review): `X` below rebinds the name used for the feature matrix in the
# modelling cells; rerun the split cell before refitting any model.
x = ['Random-Forest', 'Decision-Tree', 'GaussianNB', 'Nearest-Neighbors', 'SVM']
width = 0.25
X = np.arange(len(x))
plt.figure(figsize=(10, 10))

# Define different colors for Depression and Anxiety datasets
depression_color = 'indigo'
anxiety_color = 'violet'

# Plot the Recall Score for Depression and Anxiety with different colors
bar1 = plt.bar(X, Result_Depression['Recall(%)'], width, color=depression_color, label='Depression')
bar2 = plt.bar(X - width, Result_Anxiety['Recall(%)'], width, color=anxiety_color, label='Anxiety')

# Annotate the bars with the corresponding values. The value is rounded to
# the nearest whole number; int() would truncate (95.7 -> 95) and would turn
# float artefacts such as 56.99999999999999 into 56.
for bar in [*bar1, *bar2]:
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1, format(bar.get_height(), '.0f'),
             ha='center', color='darkorange', fontsize=11)

# The pair of bars spans X - 1.5*width .. X + 0.5*width, so its centre is
# X - width / 2; put the model name there rather than under one bar.
plt.xticks(X - width / 2, x)
plt.legend(loc='upper left')
plt.title('Recall Scores for Different Models for the Depression and Anxiety Datasets')
plt.ylabel('Percentage', fontsize=17, color='red')
plt.xlabel('Models', fontsize=15, color='red')
# Hide the frame around the plot.
for pos in ['right', 'top', 'bottom', 'left']:
    plt.gca().spines[pos].set_visible(False)
plt.show()