In [None]:
folder_path = '/content/drive/MyDrive/College/NLP'

# Approaches followed

## Feature Extractions

- Unigram
- Bigrams
- Bigrams + LIWC(Empath Library) [Final]

## Models Used

- MLP
- SVM
- LSTM

In [None]:
import os
import glob

folders = glob.glob(os.path.join(folder_path, '*'))

In [None]:
folders

['/content/drive/MyDrive/College/NLP/mixed_depression',
 '/content/drive/MyDrive/College/NLP/blogs_non_depression',
 '/content/drive/MyDrive/College/NLP/blogs_depression',
 '/content/drive/MyDrive/College/NLP/mixed_non_depression',
 '/content/drive/MyDrive/College/NLP/reddit_non_depression',
 '/content/drive/MyDrive/College/NLP/reddit_depression']

In [None]:
X_train, y_train = [],[]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2),max_features=100)

In [None]:
import re

def preprocess(text):
  text = re.sub(r'\n+', ' ', text)
  text = re.sub(r'\.{2,}', ' ', text)

  # Remove other punctuation and non-alphanumeric characters
  text = re.sub(r'[^A-Za-z0-9\s]+', '', text)

  # Remove leading and trailing spaces
  text = text.strip()

  text = re.sub(r'\s+', ' ', text)

  return text

In [None]:
corpus = []

for folder in folders:
  files =  glob.glob(os.path.join(folder, '*'))
  for file_path in files:
      try:
          with open(file_path, 'r', encoding='utf-8') as file:

              # Read the file contents, preprocess it by removing stopwords and only having lemmatized words

              corpus.append(preprocess("\n".join(file.readlines())))

              # Add the TF-IDF for each document in X_train and their respective label in y_train

              if "non_depression" in folder:
                y_train.append("N")
              else:
                y_train.append("Y")

      except FileNotFoundError:
          print(f"The file '{file_path}' was not found.")
      except Exception as e:
          print(f"An error occurred while reading {file_path}: {str(e)}")

  print("done")

An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depression/greg_e_1_b_1_can.txt: 'utf-8' codec can't decode byte 0x96 in position 153: invalid start byte
An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depression/beenbroken_e_1_b_1_UK.txt: 'utf-8' codec can't decode byte 0x85 in position 198: invalid start byte
An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depression/justin_e_1_b_1_us.txt: 'utf-8' codec can't decode byte 0x85 in position 808: invalid start byte
An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depression/blugh_e_2_b_1_xx.txt: 'utf-8' codec can't decode byte 0xd5 in position 1776: invalid continuation byte
An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depression/krwingwthyou_e_2_a_1_us.txt: 'utf-8' codec can't decode byte 0xe9 in position 121: invalid continuation byte
An error occurred while reading /content/drive/MyDrive/College/NLP/mixed_depr

In [None]:
corpus

In [None]:
!pip install empath

Collecting empath
  Downloading empath-0.89.tar.gz (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: empath
  Building wheel for empath (setup.py) ... [?25l[?25hdone
  Created wheel for empath: filename=empath-0.89-py3-none-any.whl size=57801 sha256=c0d488efe8308c80b7d503fafa62aa6a23682385588e194b4b814513896fb3d2
  Stored in directory: /root/.cache/pip/wheels/92/b3/83/9eb2c6199881e2385a59d99bd911363475060ebeb4bdb27242
Successfully built empath
Installing collected packages: empath
Successfully installed empath-0.89


In [None]:
# LIWC method for analyzing the number of words in each concept related to depression vs non depression
from empath import Empath
lexicon = Empath()

In [None]:
# Defining categories important related to depression vs non depression
categories = ["help", "violence", "valuable", "medical_emergency", "cold", "hate", "cheerfulness", "aggression", "envy", "crime", "dispute", "nervousness", "weakness", "swearing_terms", "suffering", "irritability",
              "confusion", "death", "healing", "celebration", "neglect", "love", "sympathy", "trust", "deception", "fight", "disgust", "injury", "optimism", "warmth", "sadness", "emotional",
              "joy", "affection", "shame", "torment", "breaking", "ugliness", "pain", "negative_emotion", "alcohol", "disappointment", "contentment", "positive_emotion"]

In [None]:
import numpy as np

# Combining both TF-IDF of top 100 features and LIWC for the related categories of depression

X_train = np.concatenate((tfidf_vectorizer.fit_transform(corpus).toarray(), [list(lexicon.analyze(x).values()) for x in corpus]),axis=1)

In [None]:
X_train.shape, len(y_train)

((4864, 294), 4864)

In [None]:
tfidf_vectorizer.get_feature_names_out()

array(['able to', 'about it', 'all of', 'all the', 'and dont', 'and have',
       'and im', 'and it', 'and just', 'and my', 'and she', 'and that',
       'and the', 'and then', 'at the', 'back to', 'because of', 'but im',
       'but it', 'dont have', 'dont know', 'dont want', 'feel like',
       'for me', 'for the', 'go to', 'going to', 'had to', 'have been',
       'have no', 'have to', 'he was', 'how to', 'if you', 'im not',
       'in my', 'in the', 'is the', 'it is', 'it was', 'ive been',
       'kind of', 'know how', 'know what', 'lot of', 'me and', 'me to',
       'my family', 'my friends', 'my life', 'my mom', 'my parents',
       'need to', 'of my', 'of the', 'on my', 'on the', 'one of',
       'out of', 'right now', 'she is', 'she was', 'so much', 'talk to',
       'that im', 'that she', 'that was', 'the first', 'the last',
       'the only', 'the same', 'the time', 'this is', 'to be', 'to do',
       'to get', 'to go', 'to have', 'to her', 'to make', 'to me',
       'to my',

In [None]:
from sklearn.model_selection import train_test_split

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=0.3, random_state=0)

# MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier

mlp_classifier = MLPClassifier(hidden_layer_sizes=(64,32))
model = mlp_classifier.fit(X_train, y_train)

In [None]:
predictions = model.predict(X_test)

In [None]:
predictions

array(['N', 'Y', 'N', ..., 'N', 'Y', 'N'], dtype='<U1')

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(predictions, y_test)

0.9212328767123288

In [None]:
  from sklearn.model_selection import GridSearchCV

param_grid = {
    'hidden_layer_sizes': [(100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh'],
    'max_iter': [500, 1000],
}

grid_search = GridSearchCV(mlp_classifier, param_grid, cv=3, verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_estimator = grid_search.best_estimator_

y_pred = best_estimator.predict(X_test)

accuracy_score(y_pred, y_test)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


0.923972602739726

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_pred,y_test, labels = ['Y','N']))

              precision    recall  f1-score   support

           Y       0.93      0.92      0.92       715
           N       0.92      0.93      0.93       745

    accuracy                           0.92      1460
   macro avg       0.92      0.92      0.92      1460
weighted avg       0.92      0.92      0.92      1460



# SVM Classifier

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

svm_classifier = make_pipeline(StandardScaler(), SVC(gamma='auto'))

svm_classifier.fit(X_train, y_train)

In [None]:
svm_predictions = svm_classifier.predict(X_test)

print(classification_report(y_test,svm_predictions))

accuracy_score(y_test,svm_predictions)

              precision    recall  f1-score   support

           N       0.93      0.97      0.95       750
           Y       0.97      0.92      0.95       710

    accuracy                           0.95      1460
   macro avg       0.95      0.95      0.95      1460
weighted avg       0.95      0.95      0.95      1460



0.947945205479452

In [None]:
# Do LIWC Only and Unigram and reemoving stopwords