In [33]:
import re
import warnings

import optuna
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score, ConfusionMatrixDisplay

# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides library deprecation warnings
# (e.g. for estimator options used further down) — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the training and test CSV files (paths are relative to the working directory)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('Train Data.csv', 'Test Data.csv')
)

In [3]:
# Shared NLP resources used by preprocess_text
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """
    Clean and normalise a single review string.

    Steps: lowercase, remove every character other than a-z and whitespace,
    collapse whitespace runs, drop English stopwords and lemmatise the rest.

    Parameters
    ----------
    text : str or missing value
        Raw review text. Missing values (None, NaN, pd.NA) are treated as an
        empty review; any other non-string value is converted with ``str``.

    Returns
    -------
    str
        Space-separated lemmatised tokens, or "" when nothing survives cleaning.
    """
    # Missing cells arrive as float NaN, which has no .lower(); map them to an
    # empty document instead of raising AttributeError halfway through .apply().
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    # Convert to lowercase so the a-z character class below is sufficient
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stopwords and lemmatise
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]

    # Join tokens back to text
    return " ".join(tokens)

# Add a 'Cleaned_Review' column to both frames by cleaning each raw 'Review'
for frame in (train_data, test_data):
    frame['Cleaned_Review'] = frame['Review'].apply(preprocess_text)

In [4]:
# The raw 'Review' text is no longer needed once 'Cleaned_Review' exists
for frame in (train_data, test_data):
    frame.drop(columns='Review', inplace=True)

In [None]:
# Show the first and last two rows of each frame
display(
    train_data.head(2),
    train_data.tail(2),
    test_data.head(2),
    test_data.tail(2),
)

In [9]:
# Features: cleaned review text plus the title; target: the 'Rating' column
X = train_data.loc[:, ['Cleaned_Review', 'Review_Title']]
y = train_data.loc[:, 'Rating']

In [10]:
# Hold out 20% of the labelled rows for evaluation (shuffled, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [None]:
# Number of missing values in each training feature column
X_train.isnull().sum()

In [12]:
# Combine 'Review_Title' and 'Cleaned_Review' into a single text column.
# Both splits must be built the same way; otherwise the model is evaluated on
# different text (title only) than it was trained on (title + review body).
X_train = X_train['Review_Title'] + " " + X_train['Cleaned_Review']
X_test = X_test['Review_Title'] + " " + X_test['Cleaned_Review']

In [None]:
# First two combined training documents
X_train.iloc[:2]

In [14]:
# Fit a TF-IDF vocabulary (unigrams and bigrams, at most 5000 features) on the
# training text only, then reuse it to vectorise the held-out split
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Compare the two TF-IDF matrices: row counts follow the 80/20 split and the
# column counts must match because both come from the same fitted vectorizer
X_train_tfidf.shape, X_test_tfidf.shape

In [16]:
# Build the submission text as title + " " + cleaned review, then vectorise it
# with the already-fitted TF-IDF vocabulary
test_X = test_data['Review_Title'].add(" ").add(test_data['Cleaned_Review'])
test_data_tfidf = vectorizer.transform(test_X)

In [None]:
# Shape of the submission frame next to the shape of its TF-IDF matrix
(test_data.shape, test_data_tfidf.shape)

In [None]:
# 5-fold stratified splitter with a seeded shuffle, reused by the hyperparameter searches
stratified_k_fold_cv = StratifiedKFold(
    random_state=0,
    shuffle=True,
    n_splits=5,
)

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """
    Optuna objective: mean cross-validated F1 of an AdaBoost classifier.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample `n_estimators`, `learning_rate` and `algorithm`.

    Returns
    -------
    float
        Mean F1 over the folds of `stratified_k_fold_cv`, computed on
        `X_train_tfidf` / `y_train`.
    """
    # Define hyperparameter search space
    n_estimators = trial.suggest_int('n_estimators', 100, 1000, step=100)
    learning_rate = trial.suggest_categorical('learning_rate', [0.5, 1.0, 1.1, 1.2, 1.5, 2.0, 3.0])
    # NOTE(review): 'SAMME.R' appears to be deprecated in recent scikit-learn
    # releases — confirm the installed version still accepts it.
    algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])

    # Create the model; a fixed random_state makes the score of a given
    # parameter set repeatable, so trials are compared on equal footing
    model = AdaBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        algorithm=algorithm,
        random_state=0
    )

    # Evaluate the model using cross-validation
    scores = cross_val_score(model, X_train_tfidf, y_train, cv=stratified_k_fold_cv, scoring='f1')
    return scores.mean()

# Run the optimisation with a seeded sampler so the 20 trials are repeatable
# across kernel restarts (TPE is Optuna's default sampler; only the seed is new)
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=0),
)
study.optimize(objective, n_trials=20)

# Train the best configuration on the full training split, seeded for
# reproducibility
best_params = study.best_params
best_adaboost_model = AdaBoostClassifier(**best_params, random_state=0)
best_adaboost_model.fit(X_train_tfidf, y_train)

In [None]:
# Define the AdaBoost model; seeding the estimator makes every candidate's
# cross-validated score (and the refit best model) reproducible
adaboost_model = AdaBoostClassifier(random_state=0)

# Parameter distributions for AdaBoost
param_grid_adaboost = {
    'n_estimators': range(100, 1000, 100),  # 100, 200, ..., 900 estimators
    'learning_rate': [0.01, 0.1, 0.5, 1.0],  # Learning rate
    # NOTE(review): 'SAMME.R' appears to be deprecated in recent scikit-learn
    # releases — confirm the installed version still accepts it.
    'algorithm': ['SAMME', 'SAMME.R']  # Algorithm variants
}

# RandomizedSearchCV for AdaBoost
adaboost_random_search = RandomizedSearchCV(
    estimator=adaboost_model,
    param_distributions=param_grid_adaboost,
    n_iter=20,                         # Number of parameter settings to try
    scoring='f1',                      # Use F1 score for binary classification
    cv=stratified_k_fold_cv,           # Stratified K-Fold
    random_state=0,                    # Seeds which parameter settings are sampled
    n_jobs=-1                          # Use all available processors
)

# Fit the search; this rebinds best_adaboost_model, replacing any model that
# an earlier cell stored under the same name
adaboost_random_search.fit(X_train_tfidf, y_train)
best_adaboost_model = adaboost_random_search.best_estimator_

In [None]:
# Best parameters and cross-validated score found by the randomised search
print("Best Parameters:", adaboost_random_search.best_params_)
print("Best F1 Score:", adaboost_random_search.best_score_)

# Score the tuned model on the held-out split
y_pred_adaboost = best_adaboost_model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_adaboost))
print("F1 Score:", f1_score(y_test, y_pred_adaboost))

# Print the raw counts and draw the plot separately: printing the return value
# of .plot() only emits the display object's repr, not the matrix itself.
conf_matrix = confusion_matrix(y_test, y_pred_adaboost)
print("Confusion Matrix")
print(conf_matrix)
ConfusionMatrixDisplay(conf_matrix).plot();

In [None]:
# Predict ratings for the unlabelled submission set
test_pred_custom = best_adaboost_model.predict(X=test_data_tfidf)

In [None]:
# Pair each test ID with its predicted rating
df = test_data[['ID']].assign(Rating=test_pred_custom)

# Show the first and last two rows of the submission
display(df.head(2), df.tail(2))

In [28]:
# Write the submission file without the index column
df.to_csv(path_or_buf='NLP_Hackathon_JN_.csv', index=False)