In [9]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, SpatialDropout1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, GlobalMaxPool1D
from transformers import BertTokenizer, TFBertModel
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt

ImportError: cannot import name 'ExtraTreesClassifier' from 'sklearn.tree' (D:\anni\Lib\site-packages\sklearn\tree\__init__.py)

In [11]:
# Load the datasets
reviews_df = pd.read_csv('reviews.csv')
Emotions_traning_df = pd.read_csv('Emotions_traning.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'Emotions_traning.csv'

In [None]:
# Preprocessing function
def preprocess_data(data):
    # Convert text to lowercase
    data['text'] = data['text'].str.lower()

    # Remove special characters
    data['text'] = data['text'].str.replace('[^\w\s]','')

    # Remove URLs
    data['text'] = data['text'].str.replace('http\S+', '')

    # Remove numbers
    data['text'] = data['text'].str.replace('\d+', '')

    # Remove punctuations
    data['text'] = data['text'].str.replace('[%s]' % re.escape(string.punctuation), '')

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    data['text'] = data['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    data['text'] = data['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    return data

In [None]:
# Preprocess the datasets
reviews_df = preprocess_data(reviews_df)
emotional_reviews_df = preprocess_data(emotional_reviews_df)


In [None]:
# Split the datasets into train and test sets
X_train, X_test, y_train, y_test = train_test_split(reviews_df['text'], reviews_df['sentiment'], test_size=0.2, random_state=42)
X_train_emotional, X_test_emotional, y_train_emotional, y_test_emotional = train_test_split(emotional_reviews_df['text'], emotional_reviews_df['sentiment'], test_size=0.2, random_state=42)


In [None]:
# Define the text classification models for balanced datasets
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(probability=True),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(),
    'XGBoost': XGBClassifier(),
    'LSTM': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        SpatialDropout1D(0.4),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ]),
    'BERT': TFBertModel.from_pretrained('bert-base-uncased', num_labels=1),
    'CNN': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    'RNN': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
}


In [None]:
# Define the text classification models for imbalanced datasets
imbalanced_models = {
    'Decision Tree': DecisionTreeClassifier(),
    'Light GBM': LGBMClassifier(),
    'XGBoost': XGBClassifier(),
    'LSTM': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        SpatialDropout1D(0.4),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ]),
    'BERT': TFBertModel.from_pretrained('bert-base-uncased', num_labels=1),
    'CNN': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        Conv1D(filters=32, kernel_size=3, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    'RNN': Sequential([
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation='sigmoid')
    ])
}

In [None]:
# Train the models for balanced datasets
for model_name, model in models.items():
    # Fit the model
    model.fit(X_train, y_train)

    # Evaluate the model
    y_pred = model.predict(X_test)
    print(f"Model: {model_name}")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print()

In [None]:
# Train the models for imbalanced datasets
for model_name, model in imbalanced_models.items():
    # Fit the model
    model.fit(X_train_emotional, y_train_emotional)

    # Evaluate the model
    y_pred = model.predict(X_test_emotional)
    print(f"Model: {model_name}")
    print(classification_report(y_test_emotional, y_pred))
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_emotional, y_pred)}")
    print()

In [None]:
# Data visualization for reviews dataset
plt.figure(figsize=(10, 6))
sns.countplot(x='sentiment', data=reviews_df)
plt.title('Sentiment Distribution in Reviews Dataset')
plt.show()

plt.figure(figsize=(10, 6))
sns.distplot(reviews_df['text'].str.len(), bins=50)
plt.title('Length Distribution of Text in Reviews Dataset')
plt.show()

# Data visualization for emotional reviews dataset
plt.figure(figsize=(10, 6))
sns.countplot(x='sentiment', data=emotional_reviews_df)
plt.title('Sentiment Distribution in Emotional Reviews Dataset')
plt.show()

plt.figure(figsize=(10, 6))
sns.distplot(emotional_reviews_df['text'].str.len(), bins=50)
plt.title('Length Distribution of Text in Emotional Reviews Dataset')
plt.show()