In [1]:
%reload_ext autoreload
%autoreload 2 

In [2]:
import os
import sys

# Get the directory of the script being run
current_dir = os.path.dirname(os.path.abspath("__file__"))

# Get the parent directory of the current directory
parent_dir = os.path.dirname(current_dir)
# Add the parent directory to sys.path to make the preprocessing module discoverable
sys.path.append(parent_dir)

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import optuna
from preprocessing.text_preprocessing import preprocess_data
from utils.model_utils import save_model



  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/rakshitgupta/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
df = pd.read_csv('../data/train.csv', encoding='ISO-8859-1')
columns_to_remove = ['textID', 'Time of Tweet', 'selected_text', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
df_processed = preprocess_data(df, text_column_name='text', columns_to_remove=columns_to_remove)


In [5]:
x_train, x_test, y_train, y_test = train_test_split(df_processed['text'], df_processed['sentiment'], test_size=0.2, random_state=1)


In [6]:
vectorization = TfidfVectorizer()
XV_train = vectorization.fit_transform(x_train)
XV_test = vectorization.transform(x_test)

In [7]:
lr = LogisticRegression(n_jobs=-1)
lr.fit(XV_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(n_jobs=-1)

In [8]:
pred_lr=lr.predict(XV_test)


In [9]:
# get accuracy score
score_lr = accuracy_score(y_test, pred_lr)
score_lr

0.6980171002364927

In [10]:
# Define the objective function for hyperparameter tuning
def objective(trial):
    # Hyperparameters to tune for TfidfVectorizer
    max_df = trial.suggest_float('max_df', 0.5, 1.0)
    min_df = trial.suggest_int('min_df', 1, 5)
    max_features = trial.suggest_categorical('max_features', [None, 5000, 10000, 20000])
    
    # Hyperparameters to tune for LogisticRegression
    C = trial.suggest_float('C', 1e-4, 10.0)
    l1_ratio = trial.suggest_float('l1_ratio', 0, 1)  
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet'])
    
    # Setup the TfidfVectorizer and LogisticRegression within a pipeline
    tfidf_vectorizer = TfidfVectorizer(
        lowercase=True, 
        ngram_range=(1, 2),  
        max_df=max_df,
        min_df=min_df,
        max_features=max_features
    )
    model_lr = LogisticRegression(
        C=C,
        penalty=penalty,
        l1_ratio=l1_ratio if penalty == 'elasticnet' else None,
        solver='saga',
        multi_class='multinomial',
        random_state=1
    )
    
    pipeline = make_pipeline(tfidf_vectorizer, model_lr)
    pipeline.fit(x_train, y_train)
    
    # Predict and calculate accuracy
    predictions = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [11]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

best_params = study.best_params
best_accuracy = study.best_value
print(f"Best parameters: {best_params}")
print(f"Best accuracy: {best_accuracy}")


[I 2024-03-30 22:50:18,247] A new study created in memory with name: no-name-d2955699-d2e6-416b-b450-96dff6523ae8
[I 2024-03-30 22:51:38,107] Trial 0 finished with value: 0.704929961797344 and parameters: {'max_df': 0.900683805823834, 'min_df': 2, 'max_features': 5000, 'C': 2.57001010455713, 'l1_ratio': 0.7630220829433977, 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.704929961797344.
[I 2024-03-30 22:51:39,960] Trial 1 finished with value: 0.6851009641622703 and parameters: {'max_df': 0.6329438122455562, 'min_df': 5, 'max_features': None, 'C': 0.2730036828352817, 'l1_ratio': 0.010462947281870805, 'penalty': 'l1'}. Best is trial 0 with value: 0.704929961797344.
[I 2024-03-30 22:51:41,596] Trial 2 finished with value: 0.674185919592505 and parameters: {'max_df': 0.9108562912898472, 'min_df': 3, 'max_features': 20000, 'C': 7.532346393691134, 'l1_ratio': 0.5104469573115292, 'penalty': 'l2'}. Best is trial 0 with value: 0.704929961797344.
[I 2024-03-30 22:55:16,886] Trial 3 finis

Best parameters: {'max_df': 0.7765368470565337, 'min_df': 4, 'max_features': 20000, 'C': 1.6274446783998893, 'l1_ratio': 0.17244200389236075, 'penalty': 'l1'}
Best accuracy: 0.7132981626341641


In [12]:
best_pipeline = make_pipeline(
    TfidfVectorizer(
        lowercase=True,  
        ngram_range=(1, 2),  
        max_df=best_params['max_df'],
        min_df=best_params['min_df'],
        max_features=best_params['max_features']
    ),
    LogisticRegression(
        C=best_params['C'],
        penalty=best_params['penalty'],
        l1_ratio=best_params['l1_ratio'] if best_params['penalty'] == 'elasticnet' else None,
        solver='saga',
        multi_class='multinomial',
        random_state=1
    )
)
best_pipeline.fit(x_train, y_train)



Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.7765368470565337, max_features=20000,
                                 min_df=4, ngram_range=(1, 2))),
                ('logisticregression',
                 LogisticRegression(C=1.6274446783998893,
                                    multi_class='multinomial', penalty='l1',
                                    random_state=1, solver='saga'))])

In [13]:
# Save the best model
save_model(best_pipeline, '../models/logisticRegression_classifier.pkl')