In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset archive can be read.
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [2]:
import zipfile

import pandas as pd

# Read the cleaned Reddit depression CSV directly out of the zip archive on Drive.
# Both the archive and the member stream are opened with context managers so their
# file handles are released as soon as the CSV has been parsed, instead of staying
# open for the lifetime of the kernel.
with zipfile.ZipFile('/content/drive/MyDrive/COMP723Data/Depression.zip') as zf:
    with zf.open('depression_dataset_reddit_cleaned.csv') as csv_file:
        data = pd.read_csv(csv_file)

In [None]:
# Per-column tally of missing entries in the loaded DataFrame.
data.isna().sum()

clean_text       0
is_depression    0
dtype: int64

In [None]:
# Dimensions of the loaded DataFrame as (rows, columns).
data.shape

(7731, 2)

In [None]:
# Preview the top five rows of the DataFrame to sanity-check the load.
data.head(5)

Unnamed: 0,clean_text,is_depression
0,we understand that most people who reply immed...,1
1,welcome to r depression s check in post a plac...,1
2,anyone else instead of sleeping more when depr...,1
3,i ve kind of stuffed around a lot in my life d...,1
4,sleep is my greatest and most comforting escap...,1


In [None]:
from sklearn.model_selection import train_test_split  # splits a dataset into train and test parts

# Features (X) come from the 'clean_text' column, labels (y) from 'is_depression'.
X, y = data["clean_text"], data["is_depression"]

# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# random_state is pinned to 42 so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count encoder with default settings.
counter = CountVectorizer()

# Fit the encoder on the training text and encode it in one step ...
X_train_matrix = counter.fit_transform(X_train)

# ... then encode the test text with the already-fitted encoder (no refitting).
X_test_matrix = counter.transform(X_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit a multinomial Naive Bayes model (default settings) on the token counts.
# fit() returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(X_train_matrix, y_train)

# Predict labels for the held-out test matrix.
y_pred = clf.predict(X_test_matrix)

# Report the fraction of test rows classified correctly.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8067226890756303


In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with a single hidden layer of 100 units, trained for at
# most 1000 iterations; random_state is fixed so the fit is repeatable.
clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_matrix, y_train)

# Predict labels for the held-out test matrix.
y_pred = clf.predict(X_test_matrix)

# Report the fraction of test rows classified correctly.
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9153199741435035




In [None]:
from sklearn.tree import DecisionTreeClassifier

# Defining the Decision Tree model.
# random_state is pinned (same seed as the train/test split and the MLP) because
# scikit-learn's tree builder randomly permutes features when searching for splits,
# so an unseeded tree can give a different accuracy on every run.
model = DecisionTreeClassifier(random_state=42)

# Training the model on the token-count matrix
model.fit(X_train_matrix, y_train)

# Making predictions on the held-out test matrix
y_pred = model.predict(X_test_matrix)

# Evaluating the model
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.929541047188106


In [None]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Per-class precision / recall / F1 for the current predictions.
# NOTE(review): y_pred is whatever the most recently executed model cell produced,
# so this report describes that model - confirm which one before quoting the numbers.
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred),
)

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.92      0.91       783
           1       0.92      0.90      0.91       764

    accuracy                           0.91      1547
   macro avg       0.91      0.91      0.91      1547
weighted avg       0.91      0.91      0.91      1547



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Candidate values for the Naive Bayes 'alpha' hyper-parameter.
param_grid = {'alpha': [0.1, 0.5, 1, 1.5, 2]}

# Exhaustive search over the grid, scored by accuracy with 3-fold cross-validation
# on the training matrix.
model = MultinomialNB()
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_matrix, y_train)

# Show the winning setting and its mean cross-validated accuracy.
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'alpha': 0.1}
Best score:  0.9084738150490228


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Hyper-parameter grid for the MLP: 3 * 3 * 2 * 4 * 3 = 216 combinations.
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (200,)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

model = MLPClassifier(max_iter=1000, random_state=42)

# With cv=3 this search trains 648 networks (plus a final refit). n_jobs=-1 spreads
# the independent fits across all available CPU cores; the selected parameters and
# scores are unchanged because every candidate uses the same fixed random_state.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
)

grid_search.fit(X_train_matrix, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Candidate tree depths and minimum split sizes to evaluate.
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 5, 10, 20]
}

# random_state is pinned so every candidate tree is built deterministically;
# without it the cross-validated scores (and therefore the "best" parameters)
# can change between runs.
model = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

# 3-fold cross-validated search on the training matrix
grid_search.fit(X_train_matrix, y_train)

print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

Best parameters:  {'max_depth': 9, 'min_samples_split': 5}
Best score:  0.9416234369982587


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Label value treated as the positive class for the metrics below.
chosen_category = 1

# NOTE(review): y_pred comes from whichever model cell was executed last - confirm
# which model these figures belong to.
precision = precision_score(
    y_test, y_pred, pos_label=chosen_category
)  # precision for the chosen class
recall = recall_score(
    y_test, y_pred, pos_label=chosen_category
)  # recall for the chosen class
f1 = f1_score(
    y_test, y_pred, pos_label=chosen_category
)  # F1 score for the chosen class

print(f"Precision for {chosen_category}: {precision}")
print(f"Recall for {chosen_category}: {recall}")
print(f"F1 score for {chosen_category}: {f1}")

Precision for 1: 0.9395973154362416
Recall for 1: 0.9162303664921466
F1 score for 1: 0.9277667329357191


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords

# Stop-word setting handed to CountVectorizer: the plain string 'english'.
# NOTE(review): the nltk imports above are not used anywhere in this cell.
stop_words = 'english'

# Token-count encoder that skips stop words. This rebinds `counter`,
# `X_train_matrix` and `X_test_matrix` from the earlier vectorizer cell.
counter = CountVectorizer(stop_words=stop_words)

# Fit on the training text and encode it; then encode the test text with the
# fitted encoder.
X_train_matrix = counter.fit_transform(X_train)
X_test_matrix = counter.transform(X_test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Pre-processing: lowercase the text, drop common English stop words and cap the
# number of features at 5000.
vectorizer = CountVectorizer(lowercase=True, stop_words='english', max_features=5000)

# Fit the vectorizer on the training text, then encode both splits with it.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train Naive Bayes on the reduced features (fit() returns the estimator).
clf = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict on the held-out split and score the result.
y_pred = clf.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8972204266321914


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Pre-processing using CountVectorizer
vectorizer = CountVectorizer(
    lowercase=True,           # Converting text to lowercase
    stop_words='english',     # Removing common English stopwords
    max_features=5000         # Limiting the number of features
)

# Vectorising the data: fit on the training text, then encode both splits
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Training the Decision Tree classifier.
# random_state is pinned (same seed as the train/test split) so the reported
# accuracy is reproducible; an unseeded tree can vary from run to run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_vectorized, y_train)

# Making predictions
y_pred = clf.predict(X_test_vectorized)

# Evaluating the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9114414996767938


In [None]:
# Re-check the DataFrame's (rows, columns) dimensions at the end of the notebook.
data.shape

(7731, 2)