In [None]:
!pip install scikit-learn neattext

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import metrics
import joblib
import neattext.functions as nfx
import re


def load_csv_with_fallback(filename):
    """Read a CSV file as UTF-8, retrying as ISO-8859-1 on a decode error.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by whichever read succeeded.
    """
    primary_encoding, fallback_encoding = 'utf-8', 'ISO-8859-1'
    try:
        frame = pd.read_csv(filename, encoding=primary_encoding)
    except UnicodeDecodeError:
        # Only a decoding failure triggers the retry; any other error propagates.
        frame = pd.read_csv(filename, encoding=fallback_encoding)
    return frame

# Load datasets. Both files go through the encoding-tolerant loader so that a
# non-UTF-8 export of either one does not abort the run with UnicodeDecodeError.
mental_tweet_df = load_csv_with_fallback('mental_tweet.csv')
conversation_df = load_csv_with_fallback('Merged_Conversation.csv')

# Clean the text
def clean_text(text):
    """Normalize one raw text entry before vectorization.

    The steps run in this order: remove user handles and stop words (via
    neattext), delete URLs, delete every character that is neither a word
    character nor whitespace, then lowercase the result.

    Args:
        text: The raw text. Values that are not ``str`` (for example the float
            NaN that pandas stores for an empty CSV cell, or ``None``) are
            returned unchanged instead of raising, so the caller can discard
            them afterwards with ``dropna``.

    Returns:
        The cleaned, lowercased string, or the input itself when it is not a
        string.
    """
    if not isinstance(text, str):
        # The regex and neattext helpers below require a string; pass missing
        # values through untouched rather than crashing the whole .apply().
        return text

    text = nfx.remove_userhandles(text)  # Remove user handles
    text = nfx.remove_stopwords(text)    # Remove stop words
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text.lower()  # Convert to lowercase

# Derive a cleaned text column for each source dataset
mental_tweet_df['Clean_Text'] = mental_tweet_df['Text'].map(clean_text)
conversation_df['Clean_Text'] = conversation_df['Questions'].map(clean_text)

# The conversation data carries no emotion label, so give every row the
# placeholder label 'neutral'
conversation_df['Emotion'] = 'neutral'

# Stack the (text, label) columns of both datasets and discard any row that
# is missing either the cleaned text or the label
combined_df = pd.concat(
    [frame[['Clean_Text', 'Emotion']] for frame in (mental_tweet_df, conversation_df)]
).dropna(subset=['Clean_Text', 'Emotion'])

# Count rows per label once; the counts drive both the report and the filter
label_counts = combined_df['Emotion'].value_counts()
print("Original Class Distribution:\n", label_counts)

# Minimum number of samples a class needs in order to be retained
threshold_count = 1000

# Keep only the rows whose label reaches the threshold
frequent_labels = label_counts[label_counts >= threshold_count].index
filtered_df = combined_df[combined_df['Emotion'].isin(frequent_labels)]

# Show the distribution that remains after dropping the minority labels
print("Filtered Class Distribution:\n", filtered_df['Emotion'].value_counts())

# Split the data into features (cleaned text) and labels (emotion)
X = filtered_df['Clean_Text']
y = filtered_df['Emotion']

# Hold out 20% of the rows for testing. The classes are far from balanced, so
# stratify on the label: train and test then keep the same class proportions,
# and the per-class metrics are not skewed by an unlucky random split.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate models, keyed by display name. Each one is a TF-IDF vectorizer
# feeding a classifier; insertion order is the order they are trained in.
pipelines = dict(
    SVM=Pipeline(steps=[
        (
            'tfidf',
            TfidfVectorizer(
                stop_words='english',
                max_features=10000,
                ngram_range=(1, 2),
            ),
        ),
        ('clf', SVC(kernel='linear', C=1, probability=True)),
    ]),
    RandomForest=Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ]),
    GradientBoosting=Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ]),
)

# Fit every candidate pipeline, report its held-out metrics, then persist it
for model_name, pipeline in pipelines.items():
    print(f"Training and evaluating {model_name} on the filtered dataset...")

    # fit() returns the fitted pipeline, so the test-set prediction is chained
    y_pred = pipeline.fit(x_train, y_train).predict(x_test)

    # Overall accuracy first, then the per-class precision/recall/F1 table
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.2f}")
    report = metrics.classification_report(y_test, y_pred)
    print(f"{model_name} Classification Report:\n{report}")

    # Serialize the whole pipeline (vectorizer + classifier) under a
    # lowercase, model-specific filename
    model_file = f'{model_name.lower()}_model.pkl'
    joblib.dump(pipeline, model_file)
    print(f"{model_name} model saved as {model_file}\n")


from google.colab import files

# Download each saved model file from the Colab runtime, in training order
for model_file in (
    'svm_model.pkl',
    'randomforest_model.pkl',
    'gradientboosting_model.pkl',
):
    files.download(model_file)


Original Class Distribution:
 Emotion
neutral       14715
sadness       11887
joy           11045
worry          8459
surprise       6249
fear           5410
happiness      5209
anger          4407
love           3842
fun            1776
relief         1526
hate           1323
disgust         856
empty           827
enthusiasm      759
boredom         179
shame           146
Name: count, dtype: int64
Filtered Class Distribution:
 Emotion
neutral      14715
sadness      11887
joy          11045
worry         8459
surprise      6249
fear          5410
happiness     5209
anger         4407
love          3842
fun           1776
relief        1526
hate          1323
Name: count, dtype: int64
Training and evaluating SVM on the filtered dataset...
SVM Accuracy: 0.44
SVM Classification Report:
              precision    recall  f1-score   support

       anger       0.68      0.45      0.54       896
        fear       0.72      0.57      0.64      1055
         fun       0.08      0.01      0

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>