## Importing Libraries

In [25]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import ssl

In [67]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import optuna
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

### Configurations

In [40]:
DATA_FILE_PATH = './data/train.csv'
BASE_MODEL_PATH = './models/'

RANDOM_STATE = 98

### Utility Functions

In [28]:
# Download necessary NLTK datasets
def download_nltk_resources():
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context
    
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
    nltk.download('omw-1.4')

# Text cleaning function
def clean_text(text):
    text = re.sub(r'<.*?>', '', text) 
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) 
    text = re.sub(r'@\w+', '', text)  
    text = re.sub(r'#\w+', '', text)  
    text = re.sub(r'\W', ' ', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text

# Function to remove stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return ' '.join(filtered_text)

# Function for lemmatizing words in the text
def lemmatize_words(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokens])
    return lemmatized_text

# Main preprocessing function
def preprocess_data(df, text_column_name='text', target_column_name='sentiment', columns_to_remove=[]):
    df[text_column_name] = df[text_column_name].astype(str)
    df[text_column_name] = df[text_column_name].apply(clean_text)
    df[text_column_name] = df[text_column_name].apply(remove_stopwords)
    df[text_column_name] = df[text_column_name].apply(lemmatize_words)
    df[target_column_name] = df[target_column_name].map({"negative": 0, "neutral": 1, "positive": 2})

    # Remove unnecessary columns
    df = df.drop(columns=columns_to_remove, errors='ignore')
    df = df.dropna()
    return df

### Data Preparation

In [29]:
download_nltk_resources()

[nltk_data] Downloading package punkt to /Users/raoofmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/raoofmac/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
# Load data
df = pd.read_csv(DATA_FILE_PATH, encoding='ISO-8859-1')
df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [31]:
columns_to_remove = ['textID', 'Time of Tweet', 'selected_text', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']

# Preprocess data
df_processed = preprocess_data(df, text_column_name='text', columns_to_remove=columns_to_remove)
df_processed.head()

Unnamed: 0,text,sentiment
0,responded going,1
1,sooo sad miss san diego,0
2,bos bullying,0
3,interview leave alone,0
4,son put release already bought,0


In [32]:
x_train, x_test, y_train, y_test = train_test_split(df_processed['text'], df_processed['sentiment'], test_size=0.2, random_state=RANDOM_STATE)

In [33]:
# Initializing TF-IDF Vectorizer with all parameters defined
tfidf_vectorizer = TfidfVectorizer(
    input='content',               # Expected input type
    encoding='utf-8',              # Character encoding for input
    decode_error='strict',         # Action on decoding errors
    strip_accents=None,            # Accent stripping method
    lowercase=True,                # Convert all characters to lowercase
    preprocessor=None,             # Custom preprocessor function
    tokenizer=None,                # Custom tokenizer function
    analyzer='word',               # Analyzer type ('word', 'char', or 'char_wb')
    stop_words=None,               # Stop words list
    token_pattern=r'(?u)\b\w\w+\b',# Token pattern for word tokenization
    ngram_range=(1, 1),            # Range for n-grams to be extracted
    max_df=1.0,                    # Max document frequency for filtering terms
    min_df=1,                      # Min document frequency for filtering terms
    max_features=None,             # Max number of terms to consider
    vocabulary=None,               # Custom vocabulary
    binary=False,                  # Binary output (term presence/absence)
    dtype=float,                   # Data type of the matrix elements
    norm='l2',                     # Norm used for normalization
    use_idf=True,                  # Enable IDF weighting
    smooth_idf=True,               # Apply smoothing to IDF weights
    sublinear_tf=False             # Apply sublinear TF scaling
)

# Transforming the training and test text data into TF-IDF vectors
train_text_tfidf = tfidf_vectorizer.fit_transform(x_train)
test_text_tfidf = tfidf_vectorizer.transform(x_test)



### Logistic Regression

In [34]:
# Initializing the Logistic Regression model with explicit parameters
logistic_regression_model = LogisticRegression(n_jobs=-1, random_state=RANDOM_STATE)
# Fitting the model on the training data
logistic_regression_model.fit(train_text_tfidf, y_train)

In [35]:
# Making predictions on the test data
predictions_lr = logistic_regression_model.predict(test_text_tfidf)

# Calculating the accuracy score of the logistic regression model
accuracy_lr = accuracy_score(y_test, predictions_lr)
print(f"Logistic Regression Model Accuracy: {accuracy_lr}")

Logistic Regression Model Accuracy: 0.694742586865563


In [38]:
def objective(trial):
    # Hyperparameters to tune for TfidfVectorizer
    max_df = trial.suggest_float('max_df', 0.5, 1.0, log=True)
    min_df = trial.suggest_int('min_df', 1, 5)
    max_features = trial.suggest_categorical('max_features', [None, 5000, 10000, 20000])
    ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3)])
    
    # Hyperparameters to tune for LogisticRegression
    C = trial.suggest_loguniform('C', 1e-4, 10.0)
    l1_ratio = trial.suggest_float('l1_ratio', 0, 1)
    penalty = trial.suggest_categorical('penalty', ['l2', 'elasticnet'])
    class_weight = trial.suggest_categorical('class_weight', [None, 'balanced'])

    # Adjust solver based on penalty
    if penalty == 'elasticnet':
        solver = 'saga'
    else:
        solver = 'saga'  # 'saga' is compatible with 'l2' and 'elasticnet' penalties

    # Setup the TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer(
        lowercase=True, 
        ngram_range=ngram_range,
        max_df=max_df,
        min_df=min_df,
        max_features=max_features
    )

    # Setup the LogisticRegression model
    model_lr = LogisticRegression(
        C=C,
        penalty=penalty,
        l1_ratio=l1_ratio if penalty == 'elasticnet' else None,
        solver=solver,
        multi_class='multinomial',
        class_weight=class_weight,
        random_state=42,
        max_iter=1000
    )

    pipeline = make_pipeline(tfidf_vectorizer, model_lr)
    pipeline.fit(x_train, y_train)

    # Predict and calculate accuracy
    predictions = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-04-02 23:23:11,336] A new study created in memory with name: no-name-520045c1-102b-4526-a219-92c54ab21cd7
  C = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2024-04-02 23:23:11,826] Trial 0 finished with value: 0.6814626159723486 and parameters: {'max_df': 0.5300704002676773, 'min_df': 5, 'max_features': None, 'ngram_range': (1, 2), 'C': 0.0672074906558378, 'l1_ratio': 0.5637524004825664, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6814626159723486.
  C = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2024-04-02 23:23:12,041] Trial 1 finished with value: 0.42059305075495723 and parameters: {'max_df': 0.6989547168412739, 'min_df': 3, 'max_features': 20000, 'ngram_range': (1, 1), 'C': 0.0016220069792380679, 'l1_ratio': 0.6842675600370931, 'penalty': 'l2', 'class_weight': None}. Best is trial 0 with value: 0.6814626159723486.
  C = trial.suggest_loguniform('C', 1e-4, 10.0)
[I 2024-04-02 23:24:27,584] Trial 2 finished with value: 0.6872839730762234 and

Best Trial:
 Accuracy: 0.7111151537202111
 Params: {'max_df': 0.9322612407035106, 'min_df': 3, 'max_features': None, 'ngram_range': (1, 2), 'C': 1.6554365358749108, 'l1_ratio': 0.968405717098253, 'penalty': 'elasticnet', 'class_weight': None}


In [43]:
print("Best Trial:")
print(" Accuracy:", study.best_value)
print(" Params:", study.best_params)

Best Trial:
 Accuracy: 0.7111151537202111
 Params: {'max_df': 0.9322612407035106, 'min_df': 3, 'max_features': None, 'ngram_range': (1, 2), 'C': 1.6554365358749108, 'l1_ratio': 0.968405717098253, 'penalty': 'elasticnet', 'class_weight': None}


In [45]:
best_params = study.best_params

In [47]:
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True, 
    ngram_range=best_params['ngram_range'],
    max_df=best_params['max_df'],
    min_df=best_params['min_df'],
    max_features=best_params['max_features']
)

model_lr = LogisticRegression(
    C=best_params['C'],
    penalty=best_params['penalty'],
    l1_ratio=best_params['l1_ratio'] if best_params['penalty'] == 'elasticnet' else None,
    solver='saga',  # Assuming 'saga' solver for compatibility
    multi_class='multinomial',
    class_weight=best_params['class_weight'],
    random_state=42,
    max_iter=1000
)

pipeline = make_pipeline(tfidf_vectorizer, model_lr)

# Train the pipeline on the full dataset
pipeline.fit(x_train, y_train)

In [50]:
joblib.dump(pipeline, f"{BASE_MODEL_PATH}lr_model.pkl")

['./models/ + lr_model.pkl']

### Multinomial Naive Bayes Classifier

In [52]:
# Initialize the MultinomialNB model with full parameters specified
model_mnb = MultinomialNB(
    alpha=1.0, # Additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
    fit_prior=True, # Whether to learn class prior probabilities or not. If false, a uniform prior will be used
    class_prior=None # Prior probabilities of the classes. If specified, the priors are not adjusted according to the data
)

# Fit the model to the transformed training data
model_mnb.fit(train_text_tfidf, y_train)

# Make predictions on the transformed test data
predictions = model_mnb.predict(test_text_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.6265235583045298


In [60]:
def objective(trial):
    # Assuming X_train and X_test are your original text data, and y_train and y_test are the labels
    
    # TfidfVectorizer hyperparameters
    tfidf_max_df = trial.suggest_float('tfidf_max_df', 0.7, 1.0)
    tfidf_min_df = trial.suggest_int('tfidf_min_df', 1, 5)
    ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3)])
    
    # MultinomialNB hyperparameters
    alpha = trial.suggest_float('alpha', 1e-3, 1.0)
    
    # Create the TfidfVectorizer and MultinomialNB within a pipeline
    vectorizer = TfidfVectorizer(max_df=tfidf_max_df, min_df=tfidf_min_df, ngram_range=ngram_range)
    model_mnb = MultinomialNB(alpha=alpha)
    
    pipeline = make_pipeline(vectorizer, model_mnb)
    pipeline.fit(x_train, y_train)

    # Predict and calculate accuracy
    predictions = pipeline.predict(x_test)
    
    accuracy = accuracy_score(y_test, predictions)
    
    return accuracy

In [61]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

print("Best trial:")
print(" Accuracy:", study.best_trial.value)
print(" Best hyperparameters:", study.best_trial.params)

[I 2024-04-03 00:51:16,386] A new study created in memory with name: no-name-2a2eb105-01ca-43f6-a06b-6e5d4053b1ce
[I 2024-04-03 00:51:16,821] Trial 0 finished with value: 0.6376205202837911 and parameters: {'tfidf_max_df': 0.8999306032406962, 'tfidf_min_df': 4, 'ngram_range': (1, 1), 'alpha': 0.21600901488063343}. Best is trial 0 with value: 0.6376205202837911.
[I 2024-04-03 00:51:17,413] Trial 1 finished with value: 0.6019647080225577 and parameters: {'tfidf_max_df': 0.924175062980853, 'tfidf_min_df': 1, 'ngram_range': (1, 3), 'alpha': 0.0847045094388592}. Best is trial 0 with value: 0.6376205202837911.
[I 2024-04-03 00:51:17,663] Trial 2 finished with value: 0.6428961251591777 and parameters: {'tfidf_max_df': 0.9930087141070105, 'tfidf_min_df': 4, 'ngram_range': (1, 2), 'alpha': 0.8126356570194263}. Best is trial 2 with value: 0.6428961251591777.
[I 2024-04-03 00:51:17,914] Trial 3 finished with value: 0.6347098417318537 and parameters: {'tfidf_max_df': 0.9061636249345315, 'tfidf_min

Best trial:
 Accuracy: 0.6459887211206112
 Best hyperparameters: {'tfidf_max_df': 0.7087869134447049, 'tfidf_min_df': 4, 'ngram_range': (1, 3), 'alpha': 0.9738973276185401}


In [62]:
best_params = study.best_params

In [64]:
best_params = study.best_trial.params
vectorizer = TfidfVectorizer(
    max_df=best_params['tfidf_max_df'], 
    min_df=best_params['tfidf_min_df'], 
    ngram_range=best_params['ngram_range']
)
model_mnb = MultinomialNB(alpha=best_params['alpha'])

pipeline = make_pipeline(vectorizer, model_mnb)
pipeline.fit(x_train, y_train)

In [65]:
joblib.dump(pipeline, f"{BASE_MODEL_PATH}multinomialNB_model.pkl")

['multinomialNB_model.pkl']

### Random Forest Classifier

In [70]:
# Initialize the RandomForestClassifier model with full parameters specified
model_rf = RandomForestClassifier(
    n_estimators=100,  # Number of trees in the forest
    criterion='gini',  # Function to measure the quality of a split
    max_depth=None,  # Maximum depth of the tree
    min_samples_split=2,  # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.0,  # Minimum weighted fraction of the sum total of weights required to be at a leaf node
    max_features='sqrt',  # Number of features to consider when looking for the best split; 'sqrt' is equivalent to 'auto'
    max_leaf_nodes=None,  # Grow trees with max_leaf_nodes in best-first fashion
    min_impurity_decrease=0.0,  # A node will be split if this split induces a decrease of the impurity greater than or equal to this value
    bootstrap=True,  # Whether bootstrap samples are used when building trees
    oob_score=False,  # Whether to use out-of-bag samples to estimate the generalization accuracy
    n_jobs=None,  # Number of jobs to run in parallel
    random_state=42,  # Controls both the randomness of the bootstrapping of the samples used when building trees
    verbose=0,  # Controls the verbosity when fitting and predicting
    warm_start=False,  # When set to True, reuse the solution of the previous call to fit and add more estimators to the ensemble
    class_weight=None,  # Weights associated with classes in the form {class_label: weight}
    ccp_alpha=0.0,  # Complexity parameter used for Minimal Cost-Complexity Pruning
    max_samples=None  # If bootstrap is True, the number of samples to draw from X to train each base estimator
)

# Fit the model to the transformed training data
model_rf.fit(train_text_tfidf, y_train)

# Make predictions on the transformed test data
predictions = model_rf.predict(test_text_tfidf)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f'Random Forest Accuracy: {accuracy}')

Random Forest Accuracy: 0.7060214662543205


In [72]:
def objective(trial):
    # Define the hyperparameters to tune for TfidfVectorizer
    tfidf_max_df = trial.suggest_float('tfidf_max_df', 0.7, 1.0)
    tfidf_min_df = trial.suggest_int('tfidf_min_df', 1, 5)
    ngram_range = trial.suggest_categorical('ngram_range', [(1, 1), (1, 2), (1, 3)])

    # Define the hyperparameters to tune for RandomForestClassifier
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 10, 100, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    
    # Create the pipeline with TfidfVectorizer and RandomForestClassifier
    vectorizer = TfidfVectorizer(max_df=tfidf_max_df, min_df=tfidf_min_df, ngram_range=ngram_range)
    classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
        n_jobs=-1  # Use all available CPUs
    )

    pipeline = make_pipeline(vectorizer, classifier)
    pipeline.fit(x_train, y_train)

    # Evaluate the pipeline on the validation set
    predictions = pipeline.predict(x_test)
    accuracy = accuracy_score(y_test, predictions)

    return accuracy

# Initialize Optuna study and optimize the objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
print(" Accuracy:", study.best_trial.value)
print(" Best hyperparameters:", study.best_trial.params)

[I 2024-04-03 01:06:18,857] A new study created in memory with name: no-name-7da743b6-0f71-4d0c-af16-c2c81facc1fd
[I 2024-04-03 01:06:19,479] Trial 0 finished with value: 0.4049481535382936 and parameters: {'tfidf_max_df': 0.9876726140906857, 'tfidf_min_df': 2, 'ngram_range': (1, 3), 'n_estimators': 79, 'max_depth': 37, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_features': 'log2'}. Best is trial 0 with value: 0.4049481535382936.
[I 2024-04-03 01:06:30,535] Trial 1 finished with value: 0.6629070402037475 and parameters: {'tfidf_max_df': 0.7130712377362936, 'tfidf_min_df': 5, 'ngram_range': (1, 2), 'n_estimators': 228, 'max_depth': 49, 'min_samples_split': 19, 'min_samples_leaf': 17, 'max_features': None}. Best is trial 1 with value: 0.6629070402037475.
[I 2024-04-03 01:06:31,138] Trial 2 finished with value: 0.41076951064216843 and parameters: {'tfidf_max_df': 0.8698572372754085, 'tfidf_min_df': 4, 'ngram_range': (1, 2), 'n_estimators': 258, 'max_depth': 70, 'min_samples_split

Best trial:
 Accuracy: 0.69674367837002
 Best hyperparameters: {'tfidf_max_df': 0.7488259253506983, 'tfidf_min_df': 1, 'ngram_range': (1, 2), 'n_estimators': 143, 'max_depth': 99, 'min_samples_split': 14, 'min_samples_leaf': 1, 'max_features': None}


In [73]:
best_params = study.best_params

In [76]:
# Create the pipeline using the best parameters
best_vectorizer = TfidfVectorizer(
    max_df=best_params['tfidf_max_df'],
    min_df=best_params['tfidf_min_df'],
    ngram_range=best_params['ngram_range']
)

best_classifier = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42,
    n_jobs=-1  # Use all available CPUs
)

final_pipeline = make_pipeline(best_vectorizer, best_classifier)

# Assuming X and y contain all your data (combining training and testing data if applicable)
# Note: Be cautious about combining train and test sets; this is only suggested if you're preparing your final model for deployment.
final_pipeline.fit(x_train, y_train)

In [79]:
joblib.dump(final_pipeline, f"{BASE_MODEL_PATH}random_forestmodel.pkl")

['./models/random_forestmodel.pkl']