# Twitter Depression Prediction System

## Loading Libraries

In [6]:
!pip install pandas
!pip install numpy
!pip install spacy

Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp311-cp311-win_amd64.whl (11.6 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.2 pytz-2024.1 tzdata-2024.1
Collecting spacy
  Using cached spacy-3.7.4-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Using cached spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Using cached murmurhash-1.

In [18]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 435.7 kB/s eta 0:00:30
      --------------------------------------- 0.2/12.8 MB 1.7 MB/s eta 0:00:08
     - -------------------------------------- 0.6/12.8 MB 3.9 MB/s eta 0:00:04
     -- ------------------------------------- 0.9/12.8 MB 4.7 MB/s eta 0:00:03
     --- ------------------------------------ 1.1/12.8 MB 4.6 MB/s eta 0:00:03
     --- ------------------------------------ 1.1/12.8 MB 4.4 MB/s eta 0:00:03
     --- ------------------------------------ 1.1/12.8 MB 4.4 MB/s eta 0:00:03
     ---- ----------------------------------- 1.4/12.8 MB 3.6 MB/s eta 0:00:04
     ----- ---------------------------------- 1.8/12.8 MB 4.2 MB/s eta 0:00:03
     ------ ----------------------------

In [34]:
# !pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.4.2-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.13.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.4.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.4.2-cp311-cp311-win_amd64.whl (10.6 MB)
Using cached joblib-1.4.0-py3-none-any.whl (301 kB)
Using cached scipy-1.13.0-cp311-cp311-win_amd64.whl (46.2 MB)
Using cached threadpoolctl-3.4.0-py3-none-any.whl (17 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.0 scikit-learn-1.4.2 scipy-1.13.0 threadpoolctl-3.4.0


In [118]:
import numpy as np
import pandas as pd
import spacy 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix 
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
nlp = spacy.load("en_core_web_sm")

## Importing Dataset

In [119]:
# df = pd.read_csv("../dataset/new_mental_health_dataset.csv")
# df.head(2)

In [120]:
# df.drop(["Unnamed: 0", "post_id", "post_created", "user_id", "followers" ,"friends", "favourites" , "statuses", "retweets" ], axis = 1, inplace=True)

In [121]:
# df.to_csv("../dataset/new_mental_health_dataset.csv", index=False)

### Preprocessing the text

In [122]:
def preprocess(text):
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
    return "".join(filtered_tokens)

In [22]:
# df['processed_post_text'] = df['post_text'].apply(preprocess)

In [123]:
# df.drop("post_text", axis=1, inplace=True)

In [124]:
# df.head()

In [125]:
# df.to_csv("../dataset/preprocess_dataset.csv", index=False)

In [126]:
data = pd.read_csv("../dataset/preprocess_dataset.csv")
data.head()

Unnamed: 0,label,processed_post_text
0,1,2yeardiagnoseanxietydepressiontodaytakemomentr...
1,1,Sundayneedbreakplanspendlittletimepossiblea14
2,1,awaketiredneedsleepbrainidea
3,1,RT@sewhqretrobearperfectgiftgreatbeginnerstitc...
4,1,hardpackinglistmakelifeeasierreinforceneedmovi...


In [127]:
data['processed_post_text'].isna().sum()

14

In [128]:
data.dropna(inplace=True)


In [129]:
data['processed_post_text'].isna().sum()

0

## Splitting into dependent and independent set

In [130]:
X = data["processed_post_text"]
y = data["label"]

In [131]:
X.shape, y.shape

((19986,), (19986,))

## Splitting into training and test set

In [132]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.33, random_state=42, stratify=data.label)

In [133]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # TF-IDF vectorization
    ('clf', MultinomialNB())       # Naive Bayes classifier
])


In [134]:
pipeline.fit(X_train,y_train)

In [136]:
y_pred = pipeline.predict(X_test)

In [137]:

print(classification_report(y_pred,y_test))
print(confusion_matrix(y_pred,y_test))
print(accuracy_score(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.93      0.60      0.73      5118
           1       0.38      0.84      0.52      1478

    accuracy                           0.65      6596
   macro avg       0.65      0.72      0.62      6596
weighted avg       0.81      0.65      0.68      6596

[[3066 2052]
 [ 233 1245]]
0.6535779260157671


In [138]:
import joblib
joblib.dump(pipeline, '../models/pipeline.pkl')

['../models/pipeline.pkl']

# END