In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [10]:
import pandas as pd
import numpy as np
import nltk
import spacy
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from gensim import corpora
from gensim.models import LdaModel
from nltk.corpus import stopwords
import re

# Load the dataset
url = "/kaggle/input/mental-health-social-media/Mental-Health-Twitter.csv"
df = pd.read_csv(url)  # Adjust this as per the correct file access

# Inspect the dataset
df.head()





Unnamed: 0.1,Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


In [11]:
# Initialize stopwords and spaCy model
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')

# Define preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    doc = nlp(' '.join(tokens))
    lemmatized = [token.lemma_ for token in doc]
    return ' '.join(lemmatized)

# Apply preprocessing
df['cleaned_text'] = df['post_text'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Apply sentiment analysis using TextBlob
df['textblob_polarity'] = df['cleaned_text'].apply(lambda x: TextBlob(x).sentiment.polarity)

# Classify polarity into positive, negative, and neutral
def classify_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['textblob_sentiment'] = df['textblob_polarity'].apply(classify_sentiment)

# Check results
df[['cleaned_text', 'textblob_sentiment']].head()


Unnamed: 0,cleaned_text,textblob_sentiment
0,2 year since diagnose anxiety depression today...,positive
1,sunday need break I m plan spend little time p...,negative
2,awake tired need sleep brain idea,negative
3,rt sewhq retro bear make perfect gift great be...,positive
4,hard say whether pack list make life easy rein...,positive


In [18]:
# Label encoding for machine learning (use manually labeled data or rule-based labels for supervised learning)
df['label'] = df['textblob_sentiment'].map({'positive': 1, 'negative': -1, 'neutral': 0})

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['cleaned_text']).toarray()
y = df['label']

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a Logistic Regression model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))


Accuracy: 0.8776666666666667
              precision    recall  f1-score   support

          -1       0.92      0.67      0.77      1099
           0       0.84      0.99      0.91      2943
           1       0.94      0.83      0.88      1958

    accuracy                           0.88      6000
   macro avg       0.90      0.83      0.85      6000
weighted avg       0.89      0.88      0.87      6000



In [19]:
# Prepare data for LDA
texts = [text.split() for text in df['cleaned_text']]

# Create a dictionary and corpus
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA Model
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=10)

# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.010*"depression" + 0.004*"buy" + 0.004*"all" + 0.004*"president" + 0.004*"treatment"')
(1, '0.032*"not" + 0.020*"do" + 0.013*"I" + 0.013*"get" + 0.012*"like"')
(2, '0.022*"s" + 0.020*"rt" + 0.014*"m" + 0.013*"I" + 0.011*"hello"')
(3, '0.052*"rt" + 0.016*"yong" + 0.011*"_" + 0.009*"fuck" + 0.006*"foryong"')
(4, '0.029*"thank" + 0.025*"follow" + 0.017*"say" + 0.015*"twitter" + 0.014*"rt"')


In [20]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from textblob import TextBlob

# Function to classify sentiment using TextBlob
def classify_textblob_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1  # Positive sentiment
    elif polarity < 0:
        return -1  # Negative sentiment
    else:
        return 0  # Neutral sentiment

# Apply the classification to the dataset
df['textblob_pred'] = df['cleaned_text'].apply(classify_textblob_sentiment)

# Get actual labels and predicted labels
y_true = df['label']  # Assuming 'label' contains the actual sentiment labels
y_pred_textblob = df['textblob_pred']

# Calculate TextBlob performance metrics
accuracy_textblob = accuracy_score(y_true, y_pred_textblob)
precision_textblob, recall_textblob, f1_textblob, _ = precision_recall_fscore_support(y_true, y_pred_textblob, average='weighted')

# Print performance metrics
print(f"TextBlob - Accuracy: {accuracy_textblob:.2f}, Precision: {precision_textblob:.2f}, Recall: {recall_textblob:.2f}, F1 Score: {f1_textblob:.2f}")


TextBlob - Accuracy: 1.00, Precision: 1.00, Recall: 1.00, F1 Score: 1.00


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['cleaned_text']).toarray()
y = df['sentiment_label']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Logistic Regression Classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred_lr = clf.predict(X_test)

# Calculate Logistic Regression performance metrics
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr, recall_lr, f1_lr, _ = precision_recall_fscore_support(y_test, y_pred_lr, average='weighted')

print(f"Logistic Regression - Accuracy: {accuracy_lr:.2f}, Precision: {precision_lr:.2f}, Recall: {recall_lr:.2f}, F1 Score: {f1_lr:.2f}")


Logistic Regression - Accuracy: 0.88, Precision: 0.89, Recall: 0.88, F1 Score: 0.87
