In [1]:
%reload_ext autoreload
%autoreload 2 

In [2]:
import os
import sys

# Get the directory of the script being run
current_dir = os.path.dirname(os.path.abspath("__file__"))

# Get the parent directory of the current directory
parent_dir = os.path.dirname(current_dir)
# Add the parent directory to sys.path to make the preprocessing module discoverable
sys.path.append(parent_dir)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import optuna
from preprocessing.text_preprocessing import preprocess_data
from utils.model_utils import save_model



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
df = pd.read_csv('../data/train.csv', encoding='ISO-8859-1')
columns_to_remove = ['textID', 'Time of Tweet', 'selected_text', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
df_processed = preprocess_data(df, text_column_name='text', columns_to_remove=columns_to_remove)


In [5]:
x_train, x_test, y_train, y_test = train_test_split(df_processed['text'], df_processed['sentiment'], test_size=0.2, random_state=1)

# Define the objective function for hyperparameter tuning
def objective(trial):
    # Hyperparameters to tune
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 10, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    
    # Setup the TfidfVectorizer and RandomForestClassifier within a pipeline
    tfidf_vectorizer = TfidfVectorizer(
        lowercase=trial.suggest_categorical('lowercase', [True, False]),
        ngram_range=trial.suggest_categorical('ngram_range', [(1, 1), (1, 2)]),
        max_df=trial.suggest_float('max_df', 0.5, 1.0),
        min_df=trial.suggest_int('min_df', 1, 5),
        max_features=trial.suggest_categorical('max_features', [None, 5000, 10000, 20000])
    )
    model_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=1
    )
    
    pipeline = make_pipeline(tfidf_vectorizer, model_rf)
    pipeline.fit(x_train, y_train)
    
    # Predict and calculate accuracy
    predictions = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-03-30 22:24:09,289] A new study created in memory with name: no-name-573bc1de-055b-4c16-bcd6-eb7058e26435


[I 2024-03-30 22:24:15,908] Trial 0 finished with value: 0.4522466800072767 and parameters: {'n_estimators': 400, 'max_depth': 20, 'min_samples_split': 9, 'min_samples_leaf': 6, 'lowercase': False, 'ngram_range': (1, 2), 'max_df': 0.5925090864756992, 'min_df': 1, 'max_features': None}. Best is trial 0 with value: 0.4522466800072767.
[I 2024-03-30 22:24:20,065] Trial 1 finished with value: 0.5952337638712025 and parameters: {'n_estimators': 222, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 4, 'lowercase': False, 'ngram_range': (1, 1), 'max_df': 0.8085367467973505, 'min_df': 2, 'max_features': 5000}. Best is trial 1 with value: 0.5952337638712025.
[I 2024-03-30 22:24:27,156] Trial 2 finished with value: 0.5222848826632709 and parameters: {'n_estimators': 506, 'max_depth': 11, 'min_samples_split': 6, 'min_samples_leaf': 1, 'lowercase': False, 'ngram_range': (1, 1), 'max_df': 0.7393123783054124, 'min_df': 5, 'max_features': 10000}. Best is trial 1 with value: 0.595233763871

In [6]:
best_params = study.best_params
best_accuracy = study.best_value
print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")

Best parameters: {'n_estimators': 475, 'max_depth': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'lowercase': True, 'ngram_range': (1, 1), 'max_df': 0.9973866428109031, 'min_df': 2, 'max_features': 10000}
Best accuracy: 0.6934691649990904


In [7]:
# Train the final model using the best parameters found by Optuna
best_pipeline = make_pipeline(
    TfidfVectorizer(
        lowercase=best_params['lowercase'],
        ngram_range=best_params['ngram_range'],
        max_df=best_params['max_df'],
        min_df=best_params['min_df'],
        max_features=best_params['max_features']
    ),
    RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        min_samples_split=best_params['min_samples_split'],
        min_samples_leaf=best_params['min_samples_leaf'],
        random_state=1
    )
)
best_pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.9973866428109031, max_features=10000,
                                 min_df=2)),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=100, min_samples_split=10,
                                        n_estimators=475, random_state=1))])

In [8]:
# Save the best model
save_model(best_pipeline, '../models/randomForest_classifier.pkl')
