In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [None]:
# import statements and understand what each library/module does:
#1. import pandas as pd: This line imports the Pandas library and assigns it the alias “pd”. Pandas is a powerful data manipulation library in Python. It provides data structures (such as DataFrames) for efficient handling of structured data.
#2. from sklearn.model_selection import train_test_split, GridSearchCV: Here, we’re importing two specific functions from the sklearn.model_selection module:
       # a) train_test_split: This function splits a dataset into training and testing subsets. It’s commonly used for evaluating machine learning models.
       # b) GridSearchCV: This class performs an exhaustive search over a specified parameter grid to find the best hyperparameters for a given model.
#3. from sklearn.feature_extraction.text import TfidfVectorizer: The TfidfVectorizer is used for text feature extraction. It converts a collection of text documents into a matrix of TF-IDF features (Term Frequency-Inverse Document Frequency)
#4. from sklearn.metrics import classification_report, confusion_matrix, accuracy_score: These are essential metrics for evaluating classification models:
       # a) classification_report: Provides precision, recall, F1-score, and support for each class.
       # b) confusion_matrix: Helps visualize true positive, true negative, false positive, and false negative predictions.
       # c) accuracy_score: Calculates the accuracy of a classification model.
#5. from imblearn.over_sampling import SMOTE: SMOTE (Synthetic Minority Over-sampling Technique) is used for oversampling the minority class in imbalanced datasets. It generates synthetic samples to balance class distribution.
#6. from sklearn.naive_bayes import MultinomialNB: This line imports the Multinomial Naive Bayes classifier. It’s commonly used for text classification tasks.
#7. from sklearn.linear_model import LogisticRegression: Logistic Regression is a linear model used for binary and multiclass classification. It estimates the probability of a binary outcome.
#8. from sklearn.ensemble import RandomForestClassifier: The Random Forest classifier is an ensemble method that combines multiple decision trees to improve predictive accuracy.
#9. from sklearn.svm import SVC: The Support Vector Machine (SVM) classifier is used for both classification and regression tasks. It finds the best hyperplane that separates data points into different classes.
#10. from sklearn.neighbors import KNeighborsClassifier: The K-Nearest Neighbors (KNN) classifier assigns a class label based on the majority class among its k nearest neighbors.
#11. import nltk: The Natural Language Toolkit (NLTK) is a library for natural language processing. It provides tools for text analysis, tokenization, stemming, and more
#12. import re: The re module is used for regular expressions. It allows you to work with patterns in strings.
#13. from nltk.corpus import stopwords: NLTK provides a list of common stopwords (words like “the,” “and,” “is,” etc.) that are often removed during text preprocessing.
#14. from nltk.stem import PorterStemmer, WordNetLemmatizer: These are text normalization techniques:
       # a) PorterStemmer: Reduces words to their root form (e.g., “running” becomes “run”).
       # b) WordNetLemmatizer: Similar to stemming but produces valid words (e.g., “better” becomes “good”).

In [12]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yashu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Load dataset from CSV
data = pd.read_csv('Emotions_training.csv')

In [None]:
# data preprocessing is a crucial step in natural language processing (NLP) that involves cleaning and transforming unstructured text data to prepare it for analysis.
   # a) Lowercasing
   # b) Remove links
   # c) Remove next line
   # d) Remove words containing numbers
   # e) Remove extra spaces
   # f) Remove special characters
   # g) Remove stop words
   # h) Stemming
   # i) Lemmatization

In [20]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()  # Lower Case
    text = re.sub(r'http\S+', '', text)  # Remove links
    text = re.sub(r'\n', ' ', text)  # Remove next lines
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stop words
    
    stemmer = PorterStemmer()
    text = ' '.join([stemmer.stem(word) for word in text.split()])  # Stemming
    
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatization
    
    return text

data['text'] = data['text'].apply(preprocess_text)

In [None]:
#Feature engineering refers to the process of creating new features (variables) from existing data or transforming existing features to improve the performance of a machine learning model.

In [21]:
# Feature Engineering
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['text'])
y = data['label']

In [22]:
# Apply SMOTE to balance classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [23]:
# Train-Test-Validation Split
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.333, random_state=42)

In [24]:
# Model Building and Evaluation
def build_and_evaluate_model(model, param_grid):
    # Grid Search for Hyperparameter Tuning
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)
    y_val_pred = best_model.predict(X_val)
    
    print(f"Best Parameters: {grid_search.best_params_}")
    print("Training Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    print("Validation Classification Report:")
    print(classification_report(y_val, y_val_pred))
    print("Confusion Matrix on Test Set:")
    print(confusion_matrix(y_test, y_test_pred))
    
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    return best_model, val_accuracy

In [None]:
#1.  Multinomial Naive Bayes
#2.  LogisticRegression
#3.  RandomForestClassifier
#4.  support vector classifier
#5.  KNeighborsClassifier

In [25]:
# Define models and hyperparameters
models = [
    (MultinomialNB(), {'alpha': [0.01, 0.1, 1]}),
    (LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10]}),
    (RandomForestClassifier(), {'n_estimators': [50, 100], 'max_depth': [10, 20]}),
    (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
    (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]})
]

best_models = []
best_accuracy = 0
final_model = None

for model, param_grid in models:
    print(f"Evaluating {model.__class__.__name__}")
    best_model, val_accuracy = build_and_evaluate_model(model, param_grid)
    best_models.append((best_model, val_accuracy))
    
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        final_model = best_model
    
    print("\n" + "="*80 + "\n")

print("Final Model Selected based on Validation Performance:")
print(final_model)

Evaluating MultinomialNB
Best Parameters: {'alpha': 1}
Training Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      3716
           1       0.95      0.88      0.91      3753
           2       0.93      0.97      0.95      3752
           3       0.95      0.95      0.95      3752
           4       0.95      0.93      0.94      3763
           5       0.92      1.00      0.96      3784

    accuracy                           0.94     22520
   macro avg       0.94      0.94      0.94     22520
weighted avg       0.94      0.94      0.94     22520

Test Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.86      0.89      1088
           1       0.89      0.82      0.85      1089
           2       0.89      0.95      0.92      1087
           3       0.92      0.92      0.92      1050
           4       0.93      0.90      0.92      1059
           5     