### Step-1 : Importing Libraries

In [113]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from scipy.stats import uniform
import warnings
warnings.filterwarnings('ignore')

np.random_seed = 9012023

### Step-2 : Loading Data

In [114]:
data = pd.read_csv('data.csv')

In [115]:
data.head()

Unnamed: 0,_unit_id,_golden,_canary,_unit_state,_trusted_judgments,_last_judgment_at,is_the_category_correct_for_this_tweet,is_the_category_correct_for_this_tweet:confidence,orig__canary,orig__last_judgment_at,orig__trusted_judgments,orig__unit_id,orig__unit_state,gold_answer,sentiment,tweet_id,tweet_text,what_emotion_does_the_author_express_specifically_about_the_weatherconfidence
0,315120690,True,,golden,98,,Yes,1.0,,8/24/13 0:21,20,1,finalized,Yes,Positive,81990560,Grilling kabobs on the grill last night was am...,0.8439
1,315120691,True,,golden,78,,Yes,1.0,,8/24/13 0:49,20,1,finalized,Yes,Negative,84314377,The slowest day ever !! And the weather makes ...,0.6963
2,315120692,True,,golden,62,,Yes,1.0,,8/24/13 0:55,20,1,finalized,Yes,Neutral / author is just sharing information,82846118,Fire Weather Watch issued May 17 at 4:21PM CDT...,0.8802
3,315120693,True,,golden,68,,Yes,1.0,,8/24/13 0:48,20,1,finalized,Yes,Positive,82843785,Im going to lunch early today. The weather i...,0.6897
4,315120694,True,,golden,64,,Yes,0.9239,,8/24/13 1:19,20,1,finalized,Yes,Neutral / author is just sharing information,82840144,Weekend Weather Causes Delays In I-270 Bridge ...,0.6153


### Step-3: Data Exploration

In [116]:
# Assuming df is the existing DataFrame
df = data.loc[:, ['sentiment', 'tweet_text']]

In [117]:
df.head()

Unnamed: 0,sentiment,tweet_text
0,Positive,Grilling kabobs on the grill last night was am...
1,Negative,The slowest day ever !! And the weather makes ...
2,Neutral / author is just sharing information,Fire Weather Watch issued May 17 at 4:21PM CDT...
3,Positive,Im going to lunch early today. The weather i...
4,Neutral / author is just sharing information,Weekend Weather Causes Delays In I-270 Bridge ...


In [118]:
df['sentiment'].value_counts()

sentiment
Negative                                        271
Neutral / author is just sharing information    261
Tweet not related to weather condition          235
Positive                                        231
I can't tell                                      2
Name: count, dtype: int64

In [119]:
df = df[(df['sentiment'] != "I can't tell") & (df['sentiment'] != 'Tweet not related to weather condition')]

In [120]:
df['sentiment'].value_counts()

sentiment
Negative                                        271
Neutral / author is just sharing information    261
Positive                                        231
Name: count, dtype: int64

In [121]:
df['sentiment'] = df['sentiment'].replace('Neutral / author is just sharing information', 'Neutral')

In [122]:
df['sentiment'].value_counts()

sentiment
Negative    271
Neutral     261
Positive    231
Name: count, dtype: int64

In [123]:
df.head()

Unnamed: 0,sentiment,tweet_text
0,Positive,Grilling kabobs on the grill last night was am...
1,Negative,The slowest day ever !! And the weather makes ...
2,Neutral,Fire Weather Watch issued May 17 at 4:21PM CDT...
3,Positive,Im going to lunch early today. The weather i...
4,Neutral,Weekend Weather Causes Delays In I-270 Bridge ...


In [124]:
df[['tweet_text']].isna().sum()

tweet_text    0
dtype: int64

In [125]:
X = df['tweet_text']

In [126]:
y = df['sentiment']
y.unique()

array(['Positive', 'Negative', 'Neutral'], dtype=object)

#### Encoding Categorical Labels with LabelEncoder

In [127]:
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(y)
print(labelencoder.classes_)
y = labelencoder.transform(y)

['Negative' 'Neutral' 'Positive']


In [128]:
y

array([2, 0, 1, 2, 1, 2, 1, 0, 0, 2, 0, 0, 1, 0, 1, 2, 0, 0, 1, 0, 0, 2,
       1, 1, 0, 1, 2, 0, 1, 2, 1, 1, 2, 1, 1, 2, 0, 2, 0, 0, 1, 2, 1, 0,
       0, 1, 1, 1, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 1, 0, 0, 1, 2,
       0, 2, 1, 2, 0, 2, 2, 0, 0, 2, 0, 1, 2, 0, 1, 0, 0, 1, 1, 1, 2, 1,
       1, 2, 1, 0, 2, 0, 0, 0, 0, 0, 2, 2, 1, 0, 1, 1, 0, 2, 0, 2, 2, 2,
       0, 1, 1, 1, 0, 1, 1, 2, 2, 2, 2, 0, 1, 0, 0, 2, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 2, 0, 1, 2, 1, 2, 2, 0,
       0, 2, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 2, 2, 2, 1, 1, 2, 0, 1, 1, 1,
       0, 0, 2, 0, 1, 2, 2, 0, 1, 0, 2, 1, 0, 1, 2, 0, 2, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 2, 1, 0, 1, 1, 2, 2, 1, 2, 2,
       2, 0, 2, 2, 0, 2, 0, 0, 2, 0, 1, 0, 1, 0, 2, 2, 2, 0, 1, 2, 1, 1,
       2, 1, 2, 2, 1, 1, 2, 2, 0, 2, 2, 0, 1, 2, 1, 1, 2, 1, 1, 0, 1, 1,
       2, 2, 1, 0, 0, 1, 0, 0, 1, 0, 1, 2, 2, 2, 2, 2, 1, 0, 0, 0, 1, 1,
       2, 1, 0, 1, 2, 2, 1, 0, 1, 1, 2, 1, 1, 2, 2,

### Splitting the data

In [129]:
train_data, valid_data = train_test_split(df, test_size=0.25, random_state=9012023, stratify = y)
print('Training   : ', train_data.shape)
print('Validation : ', valid_data.shape)

Training   :  (572, 2)
Validation :  (191, 2)


In [130]:
train_data.head()

Unnamed: 0,sentiment,tweet_text
897,Negative,Idk y she got all this COLD air blowing out on...
918,Neutral,New event. Severe Thunderstorm Warning from 5/...
1,Negative,The slowest day ever !! And the weather makes ...
503,Positive,Tomorrows 82 degrees ? thats what im fuckin ta...
229,Positive,Headed to Maker Faire this weekend. Weather's...


### WordNet Lemmatization

In [131]:
# Initialize the WordNet Lemmatizer
wnl = WordNetLemmatizer()

In [132]:
# Function to perform lemmatization on tweet_text
def lemmatize_tweet_text(text):
    lemmatized_text = ""
    for word, tag in pos_tag(word_tokenize(text)):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = wnl.lemmatize(word, wntag)
        lemmatized_text += lemma + " "
    return lemmatized_text.strip()

In [133]:
train_data['tweet_text'] = train_data['tweet_text'].apply(lemmatize_tweet_text)

In [134]:
train_data.head()

Unnamed: 0,sentiment,tweet_text
897,Negative,Idk y she get all this COLD air blow out on th...
918,Neutral,New event . Severe Thunderstorm Warning from 5...
1,Negative,The slowest day ever ! ! And the weather make ...
503,Positive,Tomorrows 82 degree ? thats what im fuckin tal...
229,Positive,Headed to Maker Faire this weekend . Weather '...


In [135]:
valid_data['tweet_text'] = valid_data['tweet_text'].apply(lemmatize_tweet_text)

In [136]:
valid_data.head()

Unnamed: 0,sentiment,tweet_text
392,Neutral,It 's windy here too . RT @ mention : Rustling...
156,Negative,"Just a head up to everyone , weather be crazy ..."
302,Positive,"Call me crazy , but I be enjoy the cool weathe..."
733,Neutral,New : : Tsunami warn EAS TSW on tv and weather...
750,Negative,My gosh ! Its too hot outside ... { link }


### Data Preparation

In [137]:
X_train = train_data['tweet_text']
y_train = train_data['sentiment']
X_test = valid_data['tweet_text']
y_test = valid_data['sentiment']

In [138]:
# the use of the TfidfVectorizer from scikit-learn to transform a collection of text documents into a matrix of TF-IDF features.

tfidf_vect = TfidfVectorizer(stop_words='english', lowercase=True, token_pattern="[^\W\d_]+")

X_train = tfidf_vect.fit_transform(X_train)

In [139]:
X_test = tfidf_vect.transform(X_test)

In [140]:
X_train.shape, X_test.shape

((572, 1683), (191, 1683))

In [141]:
## These data sets are "sparse matrix". We can't see them unless we convert using toarray()
X_train

<572x1683 sparse matrix of type '<class 'numpy.float64'>'
	with 4801 stored elements in Compressed Sparse Row format>

In [142]:
df_X_train = pd.DataFrame(X_train.toarray(), columns=tfidf_vect.get_feature_names_out())
df_X_train.head()

Unnamed: 0,aback,abc,ability,able,absolutely,abt,ac,according,accumulation,active,...,yoohoo,york,yourheart,yu,yup,yw,z,zebra,zebraprint,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.422475,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [143]:
df_X_test = pd.DataFrame(X_train.toarray(), columns=tfidf_vect.get_feature_names_out())
df_X_test.head()

Unnamed: 0,aback,abc,ability,able,absolutely,abt,ac,according,accumulation,active,...,yoohoo,york,yourheart,yu,yup,yw,z,zebra,zebraprint,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.422475,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Latent Semantic Analysis (Singular Value Decomposition)

#### Apply SVD for dimension reduction

Let's apply SVD to reduce the dimensionality of our data. 

In [144]:
svd = TruncatedSVD(n_components=50, n_iter=10) #n_components is the number of topics, which should be less than the number of features

X_train= svd.fit_transform(X_train)
X_test = svd.transform(X_test)

In [145]:
X_train.shape, X_test.shape

((572, 50), (191, 50))

## Model Fitting

### Logistic Regresion

In [146]:
lr_model = LogisticRegression()
_ = lr_model.fit(X_train, y_train)
lr_confusion_matrix = confusion_matrix(y_test, lr_model.predict(X_test))

### Evaluating Model Performance

In [147]:
y_pred_train = lr_model.predict(X_train)
lr_train_acc = accuracy_score(y_train, lr_model.predict(X_train))
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.7430


In [148]:
y_pred_test = lr_model.predict(X_test)
lr_test_acc = accuracy_score(y_test, lr_model.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6702


In [149]:
confusion_matrix(y_test, y_pred_test)

array([[40, 14, 14],
       [ 8, 50,  7],
       [14,  6, 38]])

In [150]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.65      0.59      0.62        68
     Neutral       0.71      0.77      0.74        65
    Positive       0.64      0.66      0.65        58

    accuracy                           0.67       191
   macro avg       0.67      0.67      0.67       191
weighted avg       0.67      0.67      0.67       191



### Hyperparameter Tuning

#### Randomised Search

In [151]:
param_dist = {
    'C': uniform(0.1, 10),             
    'penalty': ['l1', 'l2'],           
    'solver': ['liblinear', 'saga']
}

# Initialize logistic regression model
log_reg = LogisticRegression()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=log_reg,
    param_distributions=param_dist,
    n_iter=100,  # Number of iterations
    scoring='accuracy',  # Evaluation metric
    cv=5,  # Cross-validation folds
    random_state=42
)

# Perform random search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

print("Best Parameters:", best_params)
print("Best Score (Accuracy):", best_score)

Best Parameters: {'C': 9.182658859666537, 'penalty': 'l1', 'solver': 'saga'}
Best Score (Accuracy): 0.7063157894736841


In [152]:
y_pred_test = random_search.predict(X_test)
lr_rs_test_acc = accuracy_score(y_test, random_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6387


In [153]:
param_grid = {
    'C': [7,7.2,7.4,7.6,7.8],
    'penalty': [best_params['penalty']],
    'solver': [best_params['solver']]
}

# Initialize logistic regression model
log_reg = LogisticRegression()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'C': 7.8, 'penalty': 'l1', 'solver': 'saga'}
Best Score (Accuracy) (Grid Search): 0.7028070175438595


In [154]:
y_pred_test = grid_search.predict(X_test)
lr_gs_test_acc = accuracy_score(y_test, grid_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6545


### KNN Model

In [155]:
knn_model = KNeighborsClassifier()
_ = knn_model.fit(X_train, y_train)

### Evaluating Model Performance

In [156]:
y_pred_train = knn_model.predict(X_train)
knn_train_acc = accuracy_score(y_train, knn_model.predict(X_train))
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.7220


In [157]:
y_pred_test = knn_model.predict(X_test)
knn_test_acc = accuracy_score(y_test, knn_model.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.5654


In [158]:
confusion_matrix(y_test, y_pred_test)

array([[39, 22,  7],
       [13, 49,  3],
       [27, 11, 20]])

In [159]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.49      0.57      0.53        68
     Neutral       0.60      0.75      0.67        65
    Positive       0.67      0.34      0.45        58

    accuracy                           0.57       191
   macro avg       0.59      0.56      0.55       191
weighted avg       0.58      0.57      0.55       191



### Hyperparameter Tuning

#### Randomised Search

In [160]:
from random import randint;

param_dist = {
    'n_neighbors': [randint(10,50)],      # Number of neighbors
    'weights': ['uniform', 'distance'],   # Weight function
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm for computing nearest neighbors
    'leaf_size': [randint(10, 50)],         # Leaf size for tree-based algorithms
    'p': [1, 2],                          # Power parameter for Minkowski distance
    'metric': ['euclidean', 'manhattan']  # Distance metric
}

# Initialize KNN model
knn = KNeighborsClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=knn,
    param_distributions=param_dist,
    n_iter=100,                         # Number of iterations
    scoring='accuracy',                 # Evaluation metric
    cv=5,                               # Cross-validation folds
    random_state=42
)

# Perform random grid search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (Accuracy) (Random Search):", best_score_random)

Best Parameters (Random Search): {'weights': 'distance', 'p': 1, 'n_neighbors': 35, 'metric': 'euclidean', 'leaf_size': 22, 'algorithm': 'auto'}
Best Score (Accuracy) (Random Search): 0.6118993135011441


In [161]:
y_pred_test = random_search.predict(X_test)
lr_rs_test_acc = accuracy_score(y_test, random_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.5497


#### Grid Search

In [162]:
# Define grid search parameters based on best parameters from random search
param_grid = {
    'n_neighbors': [17, 18, 19],           # Number of neighbors
    'weights': ['uniform', 'distance'],    # Weight function
    'algorithm': ['brute'],                 # Algorithm for computing nearest neighbors
    'leaf_size': [19,20,21],                      # Leaf size for tree-based algorithms
    'p': [1],                               # Power parameter for Minkowski distance
    'metric': ['manhattan']                 # Distance metric
}

# Initialize KNN model
knn = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric
    cv=5                   # Cross-validation folds
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'algorithm': 'brute', 'leaf_size': 19, 'metric': 'manhattan', 'n_neighbors': 18, 'p': 1, 'weights': 'distance'}
Best Score (Accuracy) (Grid Search): 0.6241037376048817


In [163]:
y_pred_test = grid_search.predict(X_test)
knn_test_acc = accuracy_score(y_test, grid_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.5654


### Support Vector Classifier

In [164]:
svm_model = SVC()
_ = svm_model.fit(X_train, y_train)

### Evaluating Model Performance

In [165]:
y_pred_train = svm_model.predict(X_train)
svm_train_acc = accuracy_score(y_train, svm_model.predict(X_train))
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.8514


In [166]:
y_pred_test = svm_model.predict(X_test)
svm_test_acc = accuracy_score(y_test, svm_model.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6702


In [167]:
confusion_matrix(y_test, y_pred_test)

array([[42, 15, 11],
       [11, 48,  6],
       [15,  5, 38]])

In [168]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.62      0.62      0.62        68
     Neutral       0.71      0.74      0.72        65
    Positive       0.69      0.66      0.67        58

    accuracy                           0.67       191
   macro avg       0.67      0.67      0.67       191
weighted avg       0.67      0.67      0.67       191



### Hyperparameter Tuning

#### Randomised Search

In [169]:
param_dist = {
    'C': uniform(0.1, 10),                 # Regularization parameter
    'kernel': ['linear', 'poly', 'rbf'],   # Kernel type
    'degree': [randint(2, 6)],                # Degree of polynomial kernel
    'gamma': ['scale', 'auto'],            # Kernel coefficient
    'coef0': uniform(0, 1),                 # Independent term in polynomial and sigmoid kernels
}

# Initialize SVC model
svc = SVC()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_dist,
    n_iter=100,                           # Number of iterations
    scoring='accuracy',                   # Evaluation metric
    cv=5,                                 # Cross-validation folds
    random_state=42
)

# Perform random grid search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (Accuracy) (Random Search):", best_score_random)

Best Parameters (Random Search): {'C': 0.616817211686077, 'coef0': 0.531354631568148, 'degree': 6, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score (Accuracy) (Random Search): 0.7028222730739893


In [170]:
y_pred_test = random_search.predict(X_test)
svm_rs_test_acc = accuracy_score(y_test, random_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6806


In [171]:
# Define grid search parameters based on best parameters from random search
param_grid = {
    'C': [1.0,1.1,1.2,1.3,1.4],                   # Regularization parameter
    'kernel': ['rbf','poly'],                      # Kernel type
    'degree': [1, 2, 3],                    # Degree of polynomial kernel
    'gamma': ['scale', 'auto'],             # Kernel coefficient
    'coef0': [0.2,0.3,0.4,0.5,0.6]                # Independent term in polynomial and sigmoid kernels
}

# Initialize SVC model
svc = SVC()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric
    cv=5                   # Cross-validation folds
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'C': 1.3, 'coef0': 0.2, 'degree': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score (Accuracy) (Grid Search): 0.6957894736842105


In [172]:
y_pred_test = grid_search.predict(X_test)
svm_gs_test_acc = accuracy_score(y_test, grid_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6649


### Decision Tree

In [173]:
dt_model = DecisionTreeClassifier()
_ = dt_model.fit(X_train, y_train)

### Evaluating Model Performance

In [174]:
y_pred_train = dt_model.predict(X_train)
dt_train_acc = accuracy_score(y_train, dt_model.predict(X_train))
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.9983


In [175]:
y_pred_test = dt_model.predict(X_test)
dt_test_acc = accuracy_score(y_test, dt_model.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.5497


In [176]:
confusion_matrix(y_test, y_pred_test)

array([[31, 18, 19],
       [10, 44, 11],
       [12, 16, 30]])

In [177]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.58      0.46      0.51        68
     Neutral       0.56      0.68      0.62        65
    Positive       0.50      0.52      0.51        58

    accuracy                           0.55       191
   macro avg       0.55      0.55      0.55       191
weighted avg       0.55      0.55      0.55       191



### Hyperparameter Tuning

#### Randomised Search

In [178]:
# Define hyperparameter distributions for random search
param_dist = {
    'criterion': ['gini', 'entropy'],               # Split criterion
    'splitter': ['best', 'random'],                 # Strategy for splitting nodes
    'max_depth': [randint(2, 20)],                     # Maximum depth of the tree
    'min_samples_split': [randint(2, 20)],             # Minimum number of samples required to split a node
    'min_samples_leaf': [randint(1, 10)],              # Minimum number of samples required at each leaf node
    'max_features': ['auto', 'sqrt', 'log2', None], # Number of features to consider for best split
    'max_leaf_nodes': [randint(2, 50)],                # Maximum number of leaf nodes
}

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=dt_classifier,
    param_distributions=param_dist,
    n_iter=100,             # Number of iterations
    scoring='accuracy',     # Evaluation metric
    cv=5,                   # Cross-validation folds
    random_state=42
)

# Perform random grid search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (Accuracy) (Random Search):", best_score_random)

Best Parameters (Random Search): {'splitter': 'best', 'min_samples_split': 16, 'min_samples_leaf': 8, 'max_leaf_nodes': 38, 'max_features': None, 'max_depth': 20, 'criterion': 'gini'}
Best Score (Accuracy) (Random Search): 0.5804729214340199


In [179]:
y_pred_test = random_search.predict(X_test)
dt_rs_test_acc = accuracy_score(y_test, random_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.5812


In [180]:
# Define grid search parameters around the best parameters
param_grid = {
    'criterion': ['entropy'],
    'splitter': ['best'],
    'max_depth': [10, 11, 12],
    'min_samples_split': [7, 8, 9],
    'min_samples_leaf': [2, 3, 4],
    'max_features': [None],
    'max_leaf_nodes': [13, 14, 15]
}

# Initialize Decision Tree classifier
dt_classifier = DecisionTreeClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric
    cv=5                   # Cross-validation folds
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': 14, 'min_samples_leaf': 2, 'min_samples_split': 7, 'splitter': 'best'}
Best Score (Accuracy) (Grid Search): 0.5996491228070175


In [181]:
y_pred_test = grid_search.predict(X_test)
dt_gs_test_acc = accuracy_score(y_test, grid_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6440


### Random Forest

In [182]:
rnd_clf = RandomForestClassifier(n_jobs = -1) 
_ = rnd_clf.fit(X_train, y_train)

### Evaluating Model Performance

In [183]:
y_pred_train = rnd_clf.predict(X_train)
acc = accuracy_score(y_train, y_pred_train)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.9983


In [184]:
y_pred_test = rnd_clf.predict(X_test)
acc = accuracy_score(y_test, y_pred_test)
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6126


In [185]:
confusion_matrix(y_test, y_pred_test)

array([[39, 15, 14],
       [11, 49,  5],
       [18, 11, 29]])

In [186]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.57      0.57      0.57        68
     Neutral       0.65      0.75      0.70        65
    Positive       0.60      0.50      0.55        58

    accuracy                           0.61       191
   macro avg       0.61      0.61      0.61       191
weighted avg       0.61      0.61      0.61       191



### Hyperparameter Tuning

#### Randomised Search

In [187]:
# Define hyperparameter distributions for random search
param_dist = {
    'max_features': ['auto', 'sqrt', 'log2'],      # Number of features to consider at every split
    'max_depth': [None, 10, 20, 30, 40, 50],       # Maximum depth of the trees
    'min_samples_split': [randint(2, 10)],            # Minimum number of samples required to split a node
    'min_samples_leaf': [randint(1, 10)],             # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]                      # Bootstrap samples when building trees
}

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_classifier,
    param_distributions=param_dist,
    n_iter=100,                           # Number of iterations
    scoring='accuracy',                   # Evaluation metric
    cv=5,                                 # Cross-validation folds
    random_state=42
)

# Perform random search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (Accuracy) (Random Search):", best_score_random)


Best Parameters (Random Search): {'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 30, 'bootstrap': False}
Best Score (Accuracy) (Random Search): 0.7010221205186881


In [188]:
y_pred_test = random_search.predict(X_test)
rm_rs_test_acc = accuracy_score(y_test, random_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6492


In [189]:
# Define grid search parameters around the best parameters
param_grid = {
    'bootstrap': [False],                    # Bootstrap samples when building trees
    'max_depth': [30, 35, 40, 45, 50],       # Maximum depth of the trees
    'max_features': ['sqrt'],                # Number of features to consider at every split
    'min_samples_leaf': [2, 3, 4],           # Minimum number of samples required at each leaf node
    'min_samples_split': [7, 8, 9]           # Minimum number of samples required to split a node
}

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric
    cv=5                   # Cross-validation folds
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'bootstrap': False, 'max_depth': 45, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 8}
Best Score (Accuracy) (Grid Search): 0.7010526315789474


In [190]:
y_pred_test = grid_search.predict(X_test)
rm_gs_test_acc = accuracy_score(y_test, grid_search.predict(X_test))
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6126


### AdaBoost Model Training

In [191]:
adaboost_model = AdaBoostClassifier()
_ = adaboost_model.fit(X_train, y_train)

In [192]:
y_pred_train = adaboost_model.predict(X_train)
ada_acc = accuracy_score(y_train, y_pred_train)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.7850


In [193]:
y_pred_test = adaboost_model.predict(X_test)
ada_acc = accuracy_score(y_test, y_pred_test)
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6073


In [194]:
confusion_matrix(y_test, y_pred_test)

array([[37, 18, 13],
       [10, 50,  5],
       [21,  8, 29]])

In [195]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.54      0.54      0.54        68
     Neutral       0.66      0.77      0.71        65
    Positive       0.62      0.50      0.55        58

    accuracy                           0.61       191
   macro avg       0.61      0.60      0.60       191
weighted avg       0.60      0.61      0.60       191



### Hyperparameter Tuning

#### Randomised Search

In [196]:
# Define hyperparameter distributions for random search
param_dist = {
    'learning_rate': uniform(0.1, 1.0),      # Learning rate shrinks the contribution of each classifier
    'algorithm': ['SAMME', 'SAMME.R']         # Algorithm to use for boosting
}

# Initialize AdaBoost Classifier
ada_classifier = AdaBoostClassifier()

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=ada_classifier,
    param_distributions=param_dist,
    n_iter=100,                           # Number of iterations
    scoring='accuracy',                   # Evaluation metric
    cv=5,                                 # Cross-validation folds
    random_state=42
)

# Perform random search
random_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_random = random_search.best_params_
best_score_random = random_search.best_score_

print("Best Parameters (Random Search):", best_params_random)
print("Best Score (Accuracy) (Random Search):", best_score_random)

Best Parameters (Random Search): {'algorithm': 'SAMME', 'learning_rate': 0.9287375091519293}
Best Score (Accuracy) (Random Search): 0.6608237986270022


In [197]:
y_pred_test = random_search.predict(X_test)
ada_rs_acc = accuracy_score(y_test, y_pred_test)
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6335


#### Grid Search

In [198]:
# Define grid search parameters around the best parameters
param_grid = {
    'algorithm': ['SAMME'],                  # Algorithm to use for boosting
    'learning_rate': [0.7, 0.75, 0.8, 0.85],  # Learning rate shrinks the contribution of each classifier
}

# Initialize AdaBoost Classifier
ada_classifier = AdaBoostClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=ada_classifier,
    param_grid=param_grid,
    scoring='accuracy',    # Evaluation metric
    cv=5                   # Cross-validation folds
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get best parameters and best score
best_params_grid = grid_search.best_params_
best_score_grid = grid_search.best_score_

print("Best Parameters (Grid Search):", best_params_grid)
print("Best Score (Accuracy) (Grid Search):", best_score_grid)

Best Parameters (Grid Search): {'algorithm': 'SAMME', 'learning_rate': 0.75}
Best Score (Accuracy) (Grid Search): 0.6433104500381388


In [199]:
y_pred_test = grid_search.predict(X_test)
ada_gs_acc = accuracy_score(y_test, y_pred_test)
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6230


### XG Boost

In [200]:
xgb_model = xgb.XGBClassifier()
_ = xgb_model.fit(X_train, y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2], got ['Negative' 'Neutral' 'Positive']

In [None]:
y_pred_train = xgb_model.predict(X_train)
xg_acc = accuracy_score(y_train, y_pred_train)
print(f"Train acc: {accuracy_score(y_train, y_pred_train):.4f}")

Train acc: 0.9983


In [None]:
y_pred_test = xgb_model.predict(X_test)
xg_acc = accuracy_score(y_test, y_pred_test)
print(f"Test acc: {accuracy_score(y_test, y_pred_test):.4f}")

Test acc: 0.6335


In [None]:
confusion_matrix(y_test, y_pred_test)

array([[39, 16, 13],
       [10, 47,  8],
       [14,  9, 35]], dtype=int64)

In [None]:
report = classification_report(y_test, y_pred_test)
print(report)

              precision    recall  f1-score   support

    Negative       0.62      0.57      0.60        68
     Neutral       0.65      0.72      0.69        65
    Positive       0.62      0.60      0.61        58

    accuracy                           0.63       191
   macro avg       0.63      0.63      0.63       191
weighted avg       0.63      0.63      0.63       191

