In [5]:
import tensorflow as tf

In [6]:
def get_lines(filepath):
    with open(filepath, 'r') as f:
        return f.readlines()


import csv

def preprocess_text_with_line_numbers(filename):
    abstract_samples = []  # List to store preprocessed abstract samples

    with open(filename, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            line_data = {}  # Dictionary to store information about the line
            line_data["target"] = int(row["is_depression"])  # Convert target to integer (0 or 1)
            line_data["text"] = row["clean_text"].lower()  # Store text (converted to lowercase)
            abstract_samples.append(line_data)  # Append the line data to the list of abstract samples

    # Add line numbers and total lines for each sample
    for abstract_sample in abstract_samples:
        abstract_sample["line_number"] = abstract_samples.index(abstract_sample)
        abstract_sample["total_lines"] = len(abstract_samples)

    return abstract_samples



def split_chars(text):
    return " ".join(list(text))


In [7]:
import os

data_file = '/content/depression_dataset_reddit_cleaned.csv'

if os.path.exists(data_file):
    filenames = [data_file]
    print(filenames)

['/content/depression_dataset_reddit_cleaned.csv']


In [8]:
get_lines(filenames[0])[:20]

['clean_text,is_depression\n',
 'we understand that most people who reply immediately to an op with an invitation to talk privately mean only to help but this type of response usually lead to either disappointment or disaster it usually work out quite differently here than when you say pm me anytime in a casual social context we have huge admiration and appreciation for the goodwill and good citizenship of so many of you who support others here and flag inappropriate content even more so because we know that so many of you are struggling yourselves we re hard at work behind the scene on more information and resource to make it easier to give and get quality help here this is just a small start our new wiki page explains in detail why it s much better to respond in public comment at least until you ve gotten to know someone it will be maintained at r depression wiki private contact and the full text of the current version is below summary anyone who while acting a a helper invite or acc

In [9]:

all_samples = preprocess_text_with_line_numbers(filenames[0])

import pandas as pd
from sklearn.model_selection import train_test_split

# Split the data into train, validation, and test sets
train_samples, test_samples = train_test_split(all_samples, test_size=0.2, random_state=42)







In [10]:
# Convert each set into DataFrames
train_df = pd.DataFrame(train_samples)
test_df = pd.DataFrame(test_samples)

# Display a preview of the train DataFrame
train_df.head(10)

Unnamed: 0,target,text,line_number,total_lines
0,1,my throat is always closed up and today it fee...,1226,7731
1,1,just realized that urdu word for depression is...,3771,7731
2,1,yay it s time to fuck depressed depressed adje...,2934,7731
3,1,i need someone to calm me down i have a stutte...,1512,7731
4,1,i have been suffering from an eating disorder ...,1882,7731
5,0,bout to take my dog for a walk kinda tired asw...,7711,7731
6,1,i never had illusion of grandeur growing up i ...,303,7731
7,0,kwesidei not the whole crew,3836,7731
8,0,boagworld the profile picture make a happy ret...,6793,7731
9,1,i got a new job two week ago it s going amazin...,1656,7731


In [11]:
train_df_shuffled=train_df.sample(frac=1, random_state = 42)
train_df_shuffled.head()

Unnamed: 0,target,text,line_number,total_lines
5485,0,khqrightnow i heard them making announcement t...,5073,7731
4094,1,welcome to r depression s check in post a plac...,1,7731
5180,1,anapata depression juu ya nike sneaker,2832,7731
5683,1,i m only year ago i wa really one of the famou...,2368,7731
4753,0,is back in byron bay cafe fresh missing all my...,6990,7731


In [12]:
import random
random_index = random.randint(0,len(train_df)-1)
for row in train_df_shuffled[["text", 'target']][random_index: random_index+5].itertuples():
  index,text,target = row
  print(f"Target: {target}; {'is depression' if target!=0 else 'not depression'}")
  print("Text:",text,end='\n')
  print("--"*20)

Target: 0; not depression
Text: just had a tonne of sad news today
----------------------------------------
Target: 0; not depression
Text: weebeedee run wa great thanks is very windy today so bike ride not an option this morning
----------------------------------------
Target: 0; not depression
Text: making a short video window movie maker ha crashed for 0th time but my middle name is perseverence p should finish the video soon
----------------------------------------
Target: 1; is depression
Text: tired of detail just going to get to the point i have attempted before and lately i ve been feeling like attempting again might try to think of a plan or talk myself out of it idk yet we ll see how life treat me the next few day
----------------------------------------
Target: 1; is depression
Text: making plan jotting them down laugh emoji response my friend don t know my sentence end in an unspoken if by then i m still alive tightness in the chest with each breath eating and hating myself

In [13]:
from sklearn.model_selection import train_test_split
train_sentences,val_sentences,train_labels,val_labels=train_test_split(train_df_shuffled['text'].to_numpy(),
                                                                       train_df_shuffled['target'].to_numpy(),
                                                                       test_size=0.7,
                                                                       random_state=42)

In [14]:
max_vocab_length = 10000
max_length = round(sum([len(i.split()) for i in train_sentences])/len(train_sentences))
max_length

73

In [15]:
#convert text to numbers
text_vectorizer = tf.keras.layers.TextVectorization(max_tokens=max_vocab_length,
                                                    output_sequence_length=max_length)

In [16]:
text_vectorizer.adapt(train_sentences) #done for  building a vocabulary, handling OOV words, ensuring consistency, and optimizing processing efficiency.

In [17]:
words=text_vectorizer.get_vocabulary()
words[:5]

['', '[UNK]', 'i', 'to', 'and']

In [18]:
#embedding
embedding=tf.keras.layers.Embedding(input_dim=max_vocab_length,
                                    output_dim=128,
                                    input_length=max_length)

In [19]:
import tensorflow_hub as hub     #USE

embed = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")
embeddings = embed([
    "The quick brown fox jumps over the lazy dog.",
    "I am a sentence for which I would like to get its embedding"])

print(embeddings)

tf.Tensor(
[[-0.03133019 -0.06338634 -0.01607501 ... -0.03242779 -0.0457574
   0.05370456]
 [ 0.0508086  -0.01652431  0.01573776 ...  0.00976659  0.03170118
   0.01788118]], shape=(2, 512), dtype=float32)


In [20]:
sentence_encoder=hub.KerasLayer(embed,input_shape=[],dtype=tf.string,trainable=False)

In [21]:
model=tf.keras.models.Sequential([
    sentence_encoder,
    tf.keras.layers.Dense(64),
    tf.keras.layers.Dense(32),
    tf.keras.layers.Dense(1,activation='sigmoid')

])

model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=['accuracy'])
history=model.fit(train_sentences, train_labels, epochs=5,validation_data=(val_sentences,val_labels))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [22]:
model.predict(['i feel sad and want to kill myself'])



array([[0.9996624]], dtype=float32)

In [23]:
model.predict(['i feel a bit off so ill go take a jolly walk'])



array([[3.542017e-05]], dtype=float32)

In [27]:
model.predict(['i feel like im a burden to everyone around me'])



array([[0.9680723]], dtype=float32)

In [25]:
model.predict(['im so happy with the gift i got i could cry'])



array([[4.748793e-05]], dtype=float32)

In [26]:
# Reshape test data to match expected input shape
test_text_data = test_df['text'].values.reshape(-1)

# Evaluate the model on test data
test_loss, test_accuracy = model.evaluate(test_text_data, test_df['target'])

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


Test Loss: 0.09844008833169937
Test Accuracy: 0.9689722061157227
