# In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re


import nltk
# Fetch the NLTK data this script relies on. Recent NLTK releases load the
# 'punkt_tab' package from word_tokenize instead of the older pickled 'punkt'
# models, so both are requested to keep the script working across versions.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# Function for data preprocessing
def preprocess_text(text):
    """Normalize a raw post to lowercase letters and whitespace only.

    Steps, in order: lowercase the text, drop ``@mention`` tokens, then drop
    every character that is not ``a``-``z`` or whitespace.

    Args:
        text: The raw post. Non-string values (for example ``None`` or a
            float NaN from a missing cell) are treated as empty text.

    Returns:
        The cleaned string. Whitespace is not collapsed, so removed tokens
        can leave consecutive spaces behind.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Mentions must be stripped while the '@' is still present; removing
    # punctuation first would leave the handle behind as an ordinary word.
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    return text

# Function for removing stopwords and tokenization
_ENGLISH_STOPWORDS = None  # filled on first use, then reused by every call


def remove_stopwords_and_tokenize(text):
    """Tokenize *text* and drop English stopwords.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        The remaining tokens joined by single spaces. Tokens are compared
        against the stopword list case-insensitively but are returned with
        their original casing.
    """
    global _ENGLISH_STOPWORDS
    # Build the stopword set once instead of on every call; this function is
    # applied row by row, so rebuilding it each time is wasted work. It is
    # built lazily so the NLTK data only has to exist by the first call.
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    kept_tokens = [
        token
        for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(kept_tokens)

# Read the labelled corpus from disk
df = pd.read_csv('your_dataset.csv')

# Run both cleaning passes over the text column, one after the other
for cleaning_step in (preprocess_text, remove_stopwords_and_tokenize):
    df['text'] = df['text'].apply(cleaning_step)

# Hold out 20% of the rows for evaluation (fixed seed for repeatable splits)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vocabulary on the training split only, then reuse it as-is
# for the test split
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Fit the Naive Bayes model; fit() hands back the estimator itself
nb_classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict labels for the held-out split
y_pred = nb_classifier.predict(X_test_tfidf)

# Report how the predictions compare with the true labels
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

# Example of predicting depression level
def depression_level_from_percentage(negative_percentage):
    """Map the percentage of negative posts to a depression-level label.

    The bands are closed on the right, so every value from 0 to 100 falls in
    exactly one band, including 0 and fractional values such as 25.5:

        0-25 "Considered normal", (25-40] "Mild depression",
        (40-55] "Borderline depression", (55-70] "Moderate depression",
        (70-85] "Severe depression", above 85 "Extreme depression".

    Args:
        negative_percentage: Share of posts predicted negative, 0 to 100.

    Returns:
        The label of the band containing ``negative_percentage``.

    Raises:
        ValueError: If the value is NaN or outside the 0-100 range.
    """
    # NaN fails every comparison, so this check rejects it as well.
    if not 0 <= negative_percentage <= 100:
        raise ValueError(
            f"negative_percentage must be between 0 and 100, got {negative_percentage!r}"
        )

    # NOTE(review): the cut-offs are carried over unchanged from the original
    # if/elif chain; their origin is not documented in this file.
    bands = (
        (25, "Considered normal"),
        (40, "Mild depression"),
        (55, "Borderline depression"),
        (70, "Moderate depression"),
        (85, "Severe depression"),
    )
    for upper_bound, label in bands:
        if negative_percentage <= upper_bound:
            return label
    return "Extreme depression"


# TODO: fill this list with the posts (strings) to analyze.
user_posts = []

if not user_posts:
    # Nothing to classify: the prediction step needs at least one post.
    depression_level = None
    print("No posts to analyze.")
else:
    user_posts = [
        remove_stopwords_and_tokenize(preprocess_text(post)) for post in user_posts
    ]

    user_posts_tfidf = tfidf_vectorizer.transform(user_posts)
    user_sentiments = nb_classifier.predict(user_posts_tfidf)

    # Calculate depression level based on sentiments.
    # Assumes the training labels use the string 'negative' - TODO confirm
    # against the dataset. Count-then-divide keeps exact band edges exact
    # (e.g. 11 of 20 gives 55.0), which mean() * 100 does not guarantee.
    negative_count = (user_sentiments == 'negative').sum()
    negative_percentage = negative_count * 100 / len(user_sentiments)

    # Determine depression level
    depression_level = depression_level_from_percentage(negative_percentage)

    print("Depression Level:", depression_level)
