# XGBoost

## Modelo 1 - Frequency of words

##### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import feather

from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif

#### Read FTR

In [2]:
#read ftr
train_m1 = pd.read_feather( '../dataset/train_m1.ftr', columns=None, use_threads=True)

In [3]:
train_m1 = train_m1.drop(columns=['level_0'])
train_m1.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute,polarityClass
0,3,0,2,4,0,0,0,11,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,4,0,0,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [4]:
#read ftr
test_m1 = pd.read_feather( '../dataset/test_m1.ftr', columns=None, use_threads=True)

In [5]:
test_m1 = test_m1.drop(columns=['level_0'])
test_m1.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute,polarityClass
0,1,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,2
1,1,0,1,1,0,0,1,0,2,0,...,0,0,0,0,0,0,0,0,0,2
2,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,2


In [6]:
#get target
train_m1_target = train_m1['polarityClass']
train_m1 = train_m1.drop(columns=['polarityClass']) 

In [7]:
#get target
test_m1_target = test_m1['polarityClass']
test_m1 = test_m1.drop(columns=['polarityClass']) 

### Check for lower errors

In [None]:
error_rate = []

for i in tqdm(range(1,80)):
    clf = XGBClassifier(max_depth=i, random_state=0)
    clf.fit(train_m1,train_m1_target)
    predictions = clf.predict(test_m1)
    error_rate.append(np.mean(predictions != test_m1_target))

In [None]:
plt.figure(figsize=(20,6))
plt.plot(range(1,80),error_rate,color='blue',linestyle='dashed',marker='o',markerfacecolor='red',markersize=10)
plt.title('Error Rate vs DepthValue')
plt.xlabel("depth")
plt.ylabel("Error Rate")

### Hyperparameter tuning using GridSearchCV

In [None]:
# Create the parameter grid based on the results of random search 
parameters = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# Create the model
xgboost_model = XGBClassifier()

# Best model
opt_model_xgb = GridSearchCV(xgboost_model, parameters,  scoring='accuracy', verbose=1)

# Fit the model
opt_model_xgb.fit(train_m1, train_m1_target)

print (opt_model_xgb.best_estimator_)

### Create and fit the model

In [None]:
# Create model with best parameters
xgboost_model = XGBClassifier()

# Fit the best model
xgboost_model.fit(train_m1, train_m1_target)
xgboost_model





In [9]:
predictions = xgboost_model.predict(test_m1)# Calculate the absolute errors

# Confusion matrix
confusion = pd.crosstab(test_m1_target,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion )

Predicted     0    1      2    All
Actual                            
0          1177  101    763   2041
1           211  128    748   1087
2           388  258  11099  11745
All        1776  487  12610  14873


In [10]:
# Accuracy
score = accuracy_score(test_m1_target,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.833994


In [11]:
# Report
print(classification_report(test_m1_target,predictions))

              precision    recall  f1-score   support

           0       0.66      0.58      0.62      2041
           1       0.26      0.12      0.16      1087
           2       0.88      0.94      0.91     11745

    accuracy                           0.83     14873
   macro avg       0.60      0.55      0.56     14873
weighted avg       0.81      0.83      0.82     14873



In [12]:
# Validate model using best model and cross validation
pecc_xgb = cross_val_score(xgboost_model, train_m1, train_m1_target, cv = 5).mean()
pecc_xgb

0.8355183354805655

## Modelo 2 - Existence of words

#### Read FTR

In [23]:
#read ftr
train_m2 = pd.read_feather( '../dataset/train_m2.ftr', columns=None, use_threads=True)

In [24]:
train_m2 = train_m2.drop(columns=['level_0'])
train_m2.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute,polarityClass
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,1,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [25]:
#read ftr
test_m2 = pd.read_feather( '../dataset/test_m2.ftr', columns=None, use_threads=True)

In [26]:
test_m2 = test_m2.drop(columns=['level_0'])
test_m2.head()

Unnamed: 0,like,good,one,taste,great,coffee,flavor,tea,product,love,...,pliers,offshoot,distinctively,brilliance,auspices,synonyms,cowards,lofted,parachute,polarityClass
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
1,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
3,0,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
4,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2


In [27]:
#get target
train_m2_target = train_m2['polarityClass']
train_m2 = train_m2.drop(columns=['polarityClass']) 

In [28]:
#get target
test_m2_target = test_m2['polarityClass']
test_m2 = test_m2.drop(columns=['polarityClass']) 

### Check for lower errors

In [None]:
error_rate = []

for i in tqdm(range(1,80)):
    clf = XGBClassifier(max_depth=i, random_state=0)
    clf.fit(train_m2,train_m2_target)
    predictions = clf.predict(test_m2)
    error_rate.append(np.mean(predictions != test_m2_target))

In [None]:
plt.figure(figsize=(20,6))
plt.plot(range(1,80),error_rate,color='blue',linestyle='dashed',marker='o',markerfacecolor='red',markersize=10)
plt.title('Error Rate vs DepthValue')
plt.xlabel("depth")
plt.ylabel("Error Rate")

### Hyperparameter tuning using GridSearchCV

In [None]:
# Create the parameter grid based on the results of random search 
parameters = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# Create the model
xgboost_model = XGBClassifier()

# Best model
opt_model_xgb = GridSearchCV(xgboost_model, parameters,  scoring='accuracy', verbose=1)

# Fit the model
opt_model_xgb.fit(train_m2, train_m2_target)

print (opt_model_xgb.best_estimator_)

### Create and fit the model

In [29]:
# Create model with best parameters
xgboost_model = XGBClassifier()

# Fit the best model
xgboost_model.fit(train_m2, train_m2_target)
xgboost_model

MultinomialNB(alpha=1)

In [30]:
predictions = xgboost_model.predict(test_m2)# Calculate the absolute errors

# Confusion matrix
confusion = pd.crosstab(test_m2_target,predictions, rownames=['Actual'], colnames=['Predicted'], margins=True)
print(confusion )

Predicted     0    1      2    All
Actual                            
0          1121   69    837   2027
1           197  106    805   1108
2           345  174  11219  11738
All        1663  349  12861  14873


In [31]:
# Accuracy
score = accuracy_score(test_m2_target,predictions)
print('Accuracy:{0:f}'.format(score))

Accuracy:0.836818


In [32]:
# Report
print(classification_report(test_m2_target,predictions))

              precision    recall  f1-score   support

           0       0.67      0.55      0.61      2027
           1       0.30      0.10      0.15      1108
           2       0.87      0.96      0.91     11738

    accuracy                           0.84     14873
   macro avg       0.62      0.53      0.56     14873
weighted avg       0.80      0.84      0.81     14873



In [33]:
# Validate model using best model and cross validation
pecc_xgb = cross_val_score(xgboost_model, train_m2, train_m2_target, cv = 5).mean()
pecc_xgb

0.8359507117835923