In [44]:
import pandas as pd
from pandas import option_context
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Data Loading and Prep

In [5]:
df = pd.read_csv("/kaggle/input/zindi-to-vaccinate-or-not/Train (1).csv")
text_col = "safe_text"
label_col = "label"

# Later, simpletransformers won't work with str() classes. So let's convert them
# to integers as follows.
class_dict = {'negative': -1, 'neutral': 0 ,'positive': 1}


#df[label_col] = df["airline_sentiment"].apply(lambda x: class_dict.get(x))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10001 entries, 0 to 10000
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   tweet_id   10001 non-null  object 
 1   safe_text  10001 non-null  object 
 2   label      10000 non-null  float64
 3   agreement  9999 non-null   float64
dtypes: float64(2), object(2)
memory usage: 312.7+ KB


In [6]:
df["label"].value_counts()

label
 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: count, dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
df[label_col].value_counts()

label
 0.0    4908
 1.0    4053
-1.0    1038
Name: count, dtype: int64

In [9]:
with option_context('display.max_colwidth', None):
  display(df[[text_col, label_col]].head(10))

Unnamed: 0,safe_text,label
0,Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St. <url>,0.0
1,I'm 100% thinking of devoting my career to proving autism isn't caused by vaccines due to the IDIOTIC posts I've seen about World Autism Day,1.0
2,"#whatcausesautism VACCINES, DO NOT VACCINATE YOUR CHILD",-1.0
3,"I mean if they immunize my kid with something that won't secretly kill him years down the line then I'm all for it, but I don't trust that",-1.0
4,Thanks to <user> Catch me performing at La Nuit NYC 1134 1st ave. Show starts at 6! #jennifair #mmr… <url>,0.0
5,<user> a nearly 67 year old study when mental health studies and vaccines were relatively in their infancies that has been refuted?,1.0
6,"Study of more than 95,000 kids finds no link between MMR vaccine and autism <url>",1.0
7,psa: VACCINATE YOUR FUCKING KIDS,1.0
8,Coughing extra on the shuttle and everyone thinks I have the measles. 😂 #VaccinateYourKids,1.0
9,AIDS vaccine created at Oregon Health &amp; Science University may clear virus from body - <url> <url>,1.0


In [10]:
X = df[text_col]
y = df[label_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [11]:
X_train.info()
X_test.info()

<class 'pandas.core.series.Series'>
Index: 6699 entries, 9827 to 3328
Series name: safe_text
Non-Null Count  Dtype 
--------------  ----- 
6699 non-null   object
dtypes: object(1)
memory usage: 104.7+ KB
<class 'pandas.core.series.Series'>
Index: 3300 entries, 8163 to 9476
Series name: safe_text
Non-Null Count  Dtype 
--------------  ----- 
3300 non-null   object
dtypes: object(1)
memory usage: 51.6+ KB


In [12]:
X_test.info()

<class 'pandas.core.series.Series'>
Index: 3300 entries, 8163 to 9476
Series name: safe_text
Non-Null Count  Dtype 
--------------  ----- 
3300 non-null   object
dtypes: object(1)
memory usage: 51.6+ KB


In [13]:
y_test.value_counts()

label
 0.0    1620
 1.0    1338
-1.0     342
Name: count, dtype: int64

# Shallow ML Sentiment Analysis with TF-IDF and RF

In [14]:
vectorizer = TfidfVectorizer(min_df=.01, max_df=.8, ngram_range=(1,3), max_features=200, stop_words=None)
clf = RandomForestClassifier(max_depth=None, n_estimators=1000, min_samples_leaf=10, random_state=0)

In [15]:
vectorizer = vectorizer.fit(X_train)
X_vec_train = vectorizer.transform(X_train)

In [16]:
clf.fit(X_vec_train, y_train)

In [17]:
X_vec_test = vectorizer.transform(X_test)
y_preds_train_2 = clf.predict(X_vec_train)
y_preds_test_2 = clf.predict(X_vec_test)

In [18]:
print(confusion_matrix(y_train, y_preds_train_2))
print(classification_report(y_train, y_preds_train_2))

[[  23  148  525]
 [   4 2752  532]
 [   1  422 2292]]
              precision    recall  f1-score   support

        -1.0       0.82      0.03      0.06       696
         0.0       0.83      0.84      0.83      3288
         1.0       0.68      0.84      0.76      2715

    accuracy                           0.76      6699
   macro avg       0.78      0.57      0.55      6699
weighted avg       0.77      0.76      0.72      6699



In [19]:
print(confusion_matrix(y_test, y_preds_test_2))
print(classification_report(y_test, y_preds_test_2))

[[   4   88  250]
 [   0 1284  336]
 [   1  280 1057]]
              precision    recall  f1-score   support

        -1.0       0.80      0.01      0.02       342
         0.0       0.78      0.79      0.78      1620
         1.0       0.64      0.79      0.71      1338

    accuracy                           0.71      3300
   macro avg       0.74      0.53      0.51      3300
weighted avg       0.73      0.71      0.68      3300



# Shallow ML Sentiment Analysis with TF-IDF and Xgboost

In [21]:
import xgboost as xgb 

In [22]:
from sklearn.model_selection import cross_val_score

In [23]:
%pip install optuna
import optuna

Note: you may need to restart the kernel to use updated packages.


In [26]:
#xgb with default parameters
clf_xgb=xgb.XGBClassifier()

In [28]:
# mapping the labels. -1:negative:0, 0:neutral:1,1:positive:2
clf_dict={-1:0,0:1,1:2}

In [29]:
y_train_2=y_train.map(clf_dict)

In [39]:
y_test_2=y_test.map(clf_dict)

In [31]:
y_train_2.value_counts()

label
1    3288
2    2715
0     696
Name: count, dtype: int64

In [32]:
clf_xgb.fit(X_vec_train, y_train_2)

In [35]:
y_preds_train_xgb = clf_xgb.predict(X_vec_train)
y_preds_test_xgb = clf_xgb.predict(X_vec_test)

In [36]:
print(confusion_matrix(y_train_2, y_preds_train_xgb))
print(classification_report(y_train_2, y_preds_train_xgb))

[[ 543   53  100]
 [   5 3139  144]
 [   4  223 2488]]
              precision    recall  f1-score   support

           0       0.98      0.78      0.87       696
           1       0.92      0.95      0.94      3288
           2       0.91      0.92      0.91      2715

    accuracy                           0.92      6699
   macro avg       0.94      0.88      0.91      6699
weighted avg       0.92      0.92      0.92      6699



In [40]:
print(confusion_matrix(y_test_2, y_preds_test_xgb))
print(classification_report(y_test_2, y_preds_test_xgb))

[[  53   98  191]
 [  37 1280  303]
 [  65  301  972]]
              precision    recall  f1-score   support

           0       0.34      0.15      0.21       342
           1       0.76      0.79      0.78      1620
           2       0.66      0.73      0.69      1338

    accuracy                           0.70      3300
   macro avg       0.59      0.56      0.56      3300
weighted avg       0.68      0.70      0.68      3300



In [41]:
# hypertuning
def objective_dt(trial, X, y):

  # Now, define all the hyperparams we want to vary, and what values they are allowed
  # to take.
  #
  # Each trial, optuna will automatically choose values for each hyperparam.
  hyper_params = {
        "objective":'multi:softmax',
        "n_estimators":1500,
        "max_depth":trial.suggest_int("max_depth",1,100),
        "max_bin":trial.suggest_int("max_bin",10,500),
        "gamma":trial.suggest_float("gamma",0,100),
        "min_child_weight":trial.suggest_float("min_child_weight",0,100),
        'lambda': trial.suggest_float('lambda', 0, 100),
        'alpha': trial.suggest_float('alpha', 0, 100 ),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01,0.1),
        "subsample":trial.suggest_float('subsample', 0, 1.0),
        "grow_policy":trial.suggest_categorical("grow_policy",["depthwise","lossguide"]),
        "max_delta_step":trial.suggest_float('max_delta_step', 0, 100 ),
        #"max_leaves":trial.suggest_int("max_leaves",0,100,step=1),
        "random_state": 70,

  }


  # Use the hyperparams that optuna has chosen for this trial to create a DecisionTreeClassifier
  clf_xgb = xgb.XGBClassifier(**hyper_params,n_jobs=10)

  # Run CV to see how well these hyper_params do
  cv_scores = cross_val_score(clf_xgb, X, y, cv=5, scoring="accuracy")
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score

In [45]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective_dt(trial, X_vec_train, y_train_2), n_trials=100,  gc_after_trial=True)

[I 2024-04-07 03:02:47,626] A new study created in memory with name: no-name-fc626a6b-edac-4c72-8bc3-c5b56663bccb
[I 2024-04-07 03:02:57,854] Trial 0 finished with value: 0.49081950219031806 and parameters: {'max_depth': 72, 'max_bin': 243, 'gamma': 42.133102483496266, 'min_child_weight': 87.35879980661674, 'lambda': 93.78558299254219, 'alpha': 24.077694105923143, 'colsample_bytree': 0.8454177925097529, 'colsample_bylevel': 0.05300375137476809, 'colsample_bynode': 0.5030981443559415, 'learning_rate': 0.08483081173969445, 'subsample': 0.13095532477664173, 'grow_policy': 'lossguide', 'max_delta_step': 37.49785805059328}. Best is trial 0 with value: 0.49081950219031806.
[I 2024-04-07 03:03:05,897] Trial 1 finished with value: 0.513506069354497 and parameters: {'max_depth': 10, 'max_bin': 28, 'gamma': 80.25349314764631, 'min_child_weight': 39.48541935907512, 'lambda': 53.060511314072436, 'alpha': 60.444725752026784, 'colsample_bytree': 0.1521232601629534, 'colsample_bylevel': 0.09115052669

In [46]:
study.best_params

{'max_depth': 47,
 'max_bin': 472,
 'gamma': 0.002484719284343573,
 'min_child_weight': 0.1831800231653462,
 'lambda': 70.6201285187804,
 'alpha': 86.08858906649738,
 'colsample_bytree': 0.9726389134266825,
 'colsample_bylevel': 0.23346244044307618,
 'colsample_bynode': 0.668848379047956,
 'learning_rate': 0.0549077283839125,
 'subsample': 0.9128580482478246,
 'grow_policy': 'lossguide',
 'max_delta_step': 5.507941870477165}

In [47]:
clf_xgb_tuned=xgb.XGBClassifier(**study.best_params)

In [48]:
clf_xgb_tuned.fit(X_vec_train, y_train_2)

In [49]:
y_preds_train_xgb_tuned = clf_xgb_tuned.predict(X_vec_train)
y_preds_test_xgb_tuned = clf_xgb_tuned.predict(X_vec_test)

In [50]:
print(confusion_matrix(y_train_2, y_preds_train_xgb_tuned))
print(classification_report(y_train_2, y_preds_train_xgb_tuned))

[[   9  133  554]
 [   1 2702  585]
 [   0  325 2390]]
              precision    recall  f1-score   support

           0       0.90      0.01      0.03       696
           1       0.86      0.82      0.84      3288
           2       0.68      0.88      0.77      2715

    accuracy                           0.76      6699
   macro avg       0.81      0.57      0.54      6699
weighted avg       0.79      0.76      0.72      6699



In [51]:
print(confusion_matrix(y_test_2, y_preds_test_xgb_tuned))
print(classification_report(y_test_2, y_preds_test_xgb_tuned))

[[   1   73  268]
 [   0 1237  383]
 [   1  242 1095]]
              precision    recall  f1-score   support

           0       0.50      0.00      0.01       342
           1       0.80      0.76      0.78      1620
           2       0.63      0.82      0.71      1338

    accuracy                           0.71      3300
   macro avg       0.64      0.53      0.50      3300
weighted avg       0.70      0.71      0.67      3300



## LightGBM Classifier

In [52]:
from lightgbm import LGBMClassifier

In [53]:
clf_lgbm=LGBMClassifier()

In [54]:
clf_lgbm.fit(X_vec_train, y_train_2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16661
[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 200
[LightGBM] [Info] Start training from score -2.264364
[LightGBM] [Info] Start training from score -0.711679
[LightGBM] [Info] Start training from score -0.903166


In [55]:
y_preds_train_lgbm = clf_lgbm.predict(X_vec_train)
y_preds_test_lgbm = clf_lgbm.predict(X_vec_test)

In [56]:
print(confusion_matrix(y_train_2, y_preds_train_lgbm))
print(classification_report(y_train_2, y_preds_train_lgbm))

[[ 525   68  103]
 [  22 3036  230]
 [  10  250 2455]]
              precision    recall  f1-score   support

           0       0.94      0.75      0.84       696
           1       0.91      0.92      0.91      3288
           2       0.88      0.90      0.89      2715

    accuracy                           0.90      6699
   macro avg       0.91      0.86      0.88      6699
weighted avg       0.90      0.90      0.90      6699



In [57]:
print(confusion_matrix(y_test_2, y_preds_test_lgbm))
print(classification_report(y_test_2, y_preds_test_lgbm))

[[  72   90  180]
 [  40 1283  297]
 [  63  284  991]]
              precision    recall  f1-score   support

           0       0.41      0.21      0.28       342
           1       0.77      0.79      0.78      1620
           2       0.68      0.74      0.71      1338

    accuracy                           0.71      3300
   macro avg       0.62      0.58      0.59      3300
weighted avg       0.70      0.71      0.70      3300



In [58]:
def objective_dt2(trial, X, y):

  # Now, define all the hyperparams we want to vary, and what values they are allowed
  # to take.
  #
  # Each trial, optuna will automatically choose values for each hyperparam.
  hyper_params = {
        "objective":'multiclass',
        "n_estimators":1000,
       #"max_depth":trial.suggest_int("max_depth",8,16,step=2),
        "learning_rate":trial.suggest_float("learning_rate",0.01,.19),
        "max_bin":trial.suggest_int("max_bin",2,500),
        "min_data_in_bin":trial.suggest_int("min_data_in_bin",1,200),
        #"data_sample_strategy":trial.suggest_categorical("data_sample_strategy",["bagging","goss"]),
        "num_leaves":trial.suggest_int("num_leaves",1,10000),
        "num_leaves":trial.suggest_int("num_leaves",1,10000),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        #"tree_learner":trial.suggest_categorical("tree_learner",["serial","feature","data","voting"]),
        #"min_sum_hessian_in_leaf":trial.suggest_float("min_sum_hessian_in_leaf",1e-3,1),
        #"feature_fraction_bynode":trial.suggest_float("feature_fraction_bynode",0.1,1),
        #"max_delta_step":trial.suggest_float("max_delta_step",0,1),
        'lambda_l1': trial.suggest_float('lambda_l1', 0, 10),
        'lambda_l2': trial.suggest_float('lambda_l2', 0, 10),
        'linear_lambda': trial.suggest_float('linear_lambda', 0, 10),
        #"cegb_tradeoff":trial.suggest_float("cegb_tradeoff", 0, 10),
        #"cegb_penalty_split":trial.suggest_float("cegb_penalty_split", 0, 10),
        "verbose":-1,
        "num_threads":2,
        "random_state": 60,

    }


  # Use the hyperparams that optuna has chosen for this trial to create a DecisionTreeClassifier
  clf_lfgm = LGBMClassifier(**hyper_params)

  # Run CV to see how well these hyper_params do
  cv_scores = cross_val_score(clf_lfgm, X, y, cv=5, scoring="accuracy")
  score = np.mean(cv_scores)

  # Whatever we return here tells optuna how well these parameters did
  return score


In [59]:
study2 = optuna.create_study(direction="maximize")
study2.optimize(lambda trial: objective_dt(trial, X_vec_train, y_train_2), n_trials=100,  gc_after_trial=True)

[I 2024-04-07 03:56:19,506] A new study created in memory with name: no-name-1a5e079b-ae1d-4f88-a373-5401184bf8d2
[I 2024-04-07 03:56:32,471] Trial 0 finished with value: 0.49081950219031806 and parameters: {'max_depth': 51, 'max_bin': 491, 'gamma': 52.101242575826255, 'min_child_weight': 74.61227595596193, 'lambda': 93.13944205969989, 'alpha': 16.38939177930293, 'colsample_bytree': 0.7258420683255027, 'colsample_bylevel': 0.19686889232794025, 'colsample_bynode': 0.8668674351426264, 'learning_rate': 0.024271974641515336, 'subsample': 0.031115548900598644, 'grow_policy': 'lossguide', 'max_delta_step': 52.23634280704134}. Best is trial 0 with value: 0.49081950219031806.
[I 2024-04-07 03:56:46,254] Trial 1 finished with value: 0.6608475917648502 and parameters: {'max_depth': 85, 'max_bin': 397, 'gamma': 32.73178156273276, 'min_child_weight': 18.20540707195235, 'lambda': 29.09711104923072, 'alpha': 22.151036744258214, 'colsample_bytree': 0.24522976328784596, 'colsample_bylevel': 0.44661523

In [60]:
study2.best_params

{'max_depth': 58,
 'max_bin': 415,
 'gamma': 7.787810284984652,
 'min_child_weight': 29.719076900252325,
 'lambda': 4.903053632780221,
 'alpha': 5.674442708228039,
 'colsample_bytree': 0.21574228652948663,
 'colsample_bylevel': 0.23640247488240732,
 'colsample_bynode': 0.7501142112323974,
 'learning_rate': 0.05003405863802013,
 'subsample': 0.6992675236928173,
 'grow_policy': 'lossguide',
 'max_delta_step': 27.18284421339491}

In [61]:
clf_lgbm_tuned=LGBMClassifier(**study2.best_params)

In [62]:
clf_lgbm_tuned.fit(X_vec_train, y_train_2)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 18098
[LightGBM] [Info] Number of data points in the train set: 6699, number of used features: 200
[LightGBM] [Info] Start training from score -2.264364
[LightGBM] [Info] Start training from score -0.711679
[LightGBM] [Info] Start training from score -0.903166


In [63]:
y_preds_train_lgbm_tuned = clf_lgbm_tuned.predict(X_vec_train)
y_preds_test_lgbm_tuned = clf_lgbm_tuned.predict(X_vec_test)



In [64]:
print(confusion_matrix(y_train_2, y_preds_train_lgbm_tuned))
print(classification_report(y_train_2, y_preds_train_lgbm_tuned))

[[  32  206  458]
 [   7 2809  472]
 [   4  584 2127]]
              precision    recall  f1-score   support

           0       0.74      0.05      0.09       696
           1       0.78      0.85      0.82      3288
           2       0.70      0.78      0.74      2715

    accuracy                           0.74      6699
   macro avg       0.74      0.56      0.55      6699
weighted avg       0.74      0.74      0.71      6699



In [65]:
print(confusion_matrix(y_test_2, y_preds_test_lgbm_tuned))
print(classification_report(y_test_2, y_preds_test_lgbm_tuned))

[[   9  111  222]
 [   3 1337  280]
 [   5  338  995]]
              precision    recall  f1-score   support

           0       0.53      0.03      0.05       342
           1       0.75      0.83      0.79      1620
           2       0.66      0.74      0.70      1338

    accuracy                           0.71      3300
   macro avg       0.65      0.53      0.51      3300
weighted avg       0.69      0.71      0.68      3300

