## Importing Libraries

In [12]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import ssl
import os

In [10]:
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.metrics import classification_report, accuracy_score

## Config

In [4]:
# CSV read by pd.read_csv in the data-preparation section (path is relative to the working directory).
DATA_FILE_PATH = './data/train.csv'
# Directory joined with each '<name>.pkl' file name in the inference cells.
BASE_MODEL_PATH = './models/'

# Seed handed to train_test_split so the train/test partition is reproducible.
RANDOM_STATE = 98

## Utils

In [5]:
# Fetch the NLTK resources the preprocessing helpers rely on
def download_nltk_resources():
    """Download the NLTK packages used by this notebook.

    Before downloading, the default HTTPS context factory of ``ssl`` is
    swapped for the unverified one when the interpreter provides it.
    NOTE(review): that override is process-wide, so certificate checks are
    skipped for every later request that uses the default HTTPS context,
    not only for the NLTK downloads.
    """
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory is not None:
        ssl._create_default_https_context = unverified_factory

    # Tokenizer model, stopword lists, WordNet and its multilingual extension
    for resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
        nltk.download(resource)

# Normalise raw text into lowercase word characters separated by spaces
def clean_text(text):
    """Strip markup, links, mentions, hashtags and punctuation, then lowercase.

    Parameters:
    - text: str, the raw text to clean

    Returns:
    - str, the cleaned text. Every removed non-word character leaves a space
      behind, so runs of whitespace are not collapsed.
    """
    # (pattern, replacement, flags) triples, applied strictly in this order
    substitutions = (
        (r'<.*?>', '', 0),                               # HTML-like tags
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'@\w+', '', 0),                                # @mentions
        (r'#\w+', '', 0),                                # #hashtags
        (r'\W', ' ', 0),                                 # any non-word character -> space
    )
    cleaned = text
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Of string.punctuation only '_' is a word character, so this pass
    # removes the underscores that survived the \W substitution.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return cleaned.translate(punctuation_table).lower()

# Function to remove stopwords
# Cache of stopword sets keyed by language, filled lazily on first use.
_STOPWORDS_BY_LANGUAGE = {}


def _get_stopwords(language='english'):
    """Return the NLTK stopword set for `language`, building it at most once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def remove_stopwords(text):
    """
    Drop English stopwords from a text.

    The stopword set is cached after the first call; previously it was rebuilt
    from the corpus on every call, which is wasteful when this function is
    applied to every row of a DataFrame.

    Parameters:
    - text: str, the text to filter (matching is case-sensitive, so the text
      should already be lowercased)

    Returns:
    - str, the remaining tokens joined by single spaces
    """
    stop_words = _get_stopwords()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Replace every token of the text with its WordNet lemma
def lemmatize_words(text):
    """
    Tokenize a text, lemmatize each token and join the results with spaces.

    Parameters:
    - text: str, the text to lemmatize

    Returns:
    - str, the lemmatized tokens joined by single spaces
    """
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        # No part-of-speech argument is passed, so the lemmatizer's default applies
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

# Main preprocessing function
def preprocess_data(df, text_column_name='text', target_column_name='sentiment', columns_to_remove=None):
    """
    Clean the text column, encode the sentiment labels and drop unused columns.

    The input frame is left untouched: all work happens on a copy. Calling the
    function again on the same frame (e.g. re-running the notebook cell)
    therefore gives the same result, instead of mapping already-encoded labels
    to NaN and dropping every row.

    Parameters:
    - df: pandas.DataFrame containing the text and target columns
    - text_column_name: str, name of the column holding the raw text
    - target_column_name: str, name of the column holding the labels
      "negative" / "neutral" / "positive"
    - columns_to_remove: column name(s) to drop, or None to drop nothing;
      names that are not present in the frame are ignored

    Returns:
    - A new pandas.DataFrame with cleaned text, labels encoded as 0/1/2 and
      every row that contains a missing value removed
    """
    label_encoding = {"negative": 0, "neutral": 1, "positive": 2}

    # Work on a copy so the caller's frame is never modified.
    df = df.copy()

    # astype(str) turns missing text into the literal string 'nan', so such
    # rows are kept rather than removed by dropna() below.
    df[text_column_name] = (
        df[text_column_name]
        .astype(str)
        .apply(clean_text)
        .apply(remove_stopwords)
        .apply(lemmatize_words)
    )
    # Labels outside the mapping become NaN and are removed by dropna() below.
    df[target_column_name] = df[target_column_name].map(label_encoding)

    # Remove unnecessary columns
    if columns_to_remove is not None:
        df = df.drop(columns=columns_to_remove, errors='ignore')
    return df.dropna()

In [11]:
def load_model(model_path):
    """
    Deserialize a previously saved model from disk.

    Parameters:
    - model_path: str, location of the serialized model (.pkl file)

    Returns:
    - The object stored in the file

    Note: joblib files are pickle-based, so only load files from a trusted source.
    """
    loaded_model = joblib.load(model_path)
    return loaded_model

def evaluate_model(model, X_test, y_test):
    """
    Print the classification report and the accuracy of a model on test data.

    Parameters:
    - model: fitted model exposing a predict() method
    - X_test: test-set inputs, passed unchanged to model.predict
    - y_test: true labels belonging to X_test

    Returns:
    - None; the results are only printed
    """
    y_pred = model.predict(X_test)

    print("Classification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")

## Data Preparation

In [6]:
# Fetch the tokenizer, stopword and WordNet resources before any preprocessing runs
download_nltk_resources()

[nltk_data] Downloading package punkt to /Users/raoofmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
# Load data
# The encoding is given explicitly — presumably the CSV is not valid UTF-8 (TODO confirm)
df = pd.read_csv(DATA_FILE_PATH, encoding='ISO-8859-1')
# Preview the first rows
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [8]:
# Columns dropped before modelling; only the remaining columns reach the split below
columns_to_remove = [
    'textID',
    'Time of Tweet',
    'selected_text',
    'Age of User',
    'Country',
    'Population -2020',
    'Land Area (Km²)',
    'Density (P/Km²)',
]

# Preprocess data
df_processed = preprocess_data(
    df,
    text_column_name='text',
    columns_to_remove=columns_to_remove,
)
df_processed.head()

Unnamed: 0,text,sentiment
0,responded going,1
1,sooo sad miss san diego,0
2,bos bullying,0
3,interview leave alone,0
4,son put release already bought,0


In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): assumes the saved models were fitted on the matching x_train produced
# with this same seed — otherwise test rows may overlap their training data; confirm.
x_train, x_test, y_train, y_test = train_test_split(
    df_processed['text'],
    df_processed['sentiment'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

## Inference with Logistic Regression

In [14]:
# Load the saved logistic-regression model and score it on the held-out split
lr_path = os.path.join(BASE_MODEL_PATH, 'lr_model.pkl')
print("Evaluating model: Logistic Regression")
model = load_model(lr_path)
evaluate_model(model, x_test, y_test)
print("-" * 80)

Evaluating model: Logistic Regression
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.60      0.66      1525
           1       0.65      0.76      0.70      2225
           2       0.79      0.74      0.76      1747

    accuracy                           0.71      5497
   macro avg       0.72      0.70      0.71      5497
weighted avg       0.72      0.71      0.71      5497

Accuracy: 0.7111
--------------------------------------------------------------------------------


## Inference with Multinomial Naive Bayes

In [15]:
# Load the saved Multinomial Naive Bayes model and score it on the held-out split
nb_path = os.path.join(BASE_MODEL_PATH, 'multinomialNB_model.pkl')
print("Evaluating model: Multinomial Naive Bayes")
model = load_model(nb_path)
evaluate_model(model, x_test, y_test)
print("-" * 80)

Evaluating model: Multinomical Naive Bayes
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.49      0.59      1525
           1       0.57      0.79      0.66      2225
           2       0.76      0.60      0.67      1747

    accuracy                           0.65      5497
   macro avg       0.69      0.63      0.64      5497
weighted avg       0.67      0.65      0.64      5497

Accuracy: 0.6460
--------------------------------------------------------------------------------


## Inference with Random Forest Classifier

In [18]:
# Load the saved random-forest model and score it on the held-out split
rf_path = os.path.join(BASE_MODEL_PATH, 'random_forestmodel.pkl')
print("Evaluating model: Random Forest Model")
model = load_model(rf_path)
evaluate_model(model, x_test, y_test)
print("-" * 80)

Evaluating model: Random Forest Model
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.53      0.62      1525
           1       0.64      0.79      0.70      2225
           2       0.76      0.73      0.74      1747

    accuracy                           0.70      5497
   macro avg       0.72      0.68      0.69      5497
weighted avg       0.71      0.70      0.69      5497

Accuracy: 0.6967
--------------------------------------------------------------------------------
