In [19]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [20]:
# import baseline, news data sets for classification via xgboost
baseline = pd.read_csv('../../data/baseline_features.csv')
news = pd.read_csv('../../data/news_features.csv')
baseline.head(1)

Unnamed: 0,source,headline,headline_processed,headline_length,unique_words,buzzword_frequency,word_length,word_length_processed,upper_words,sensational
0,Associated Press,Israeli airstrikes on Gaza resume after weeklo...,isra airstrik gaza resum weeklong truce hama end,11,0.693147,0.0,5.454545,5.125,0.241162,0


In [21]:
# declare feature, target variables
X = baseline.drop(columns=['source','headline','headline_processed','sensational'])
y = baseline['sensational']

In [22]:
# split data into train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [23]:
# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
# create randomized search hyperparameter options
param_grid = {'max_depth':[3, 6, 9],
              'min_child_weight':[1],
              'gamma':[0, 0.1, 0.2],
              'subsample':[0.5, 0.7, 0.9],
              'colsample_bytree':[0.5, 0.7, 0.9],
              'scale_pos_weight': [1]
           }

In [25]:
# create randomized search to find best hyper parameters
xgbc = XGBClassifier()
search = RandomizedSearchCV(xgbc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [26]:
# get best randomized search hyperparamters
best_params = search.best_params_
print(best_params)

{'subsample': 0.7, 'scale_pos_weight': 1, 'min_child_weight': 1, 'max_depth': 9, 'gamma': 0.1, 'colsample_bytree': 0.9}


In [27]:
# create grid search to find best hyper parameters
param_grid = {'max_depth':[8, 9, 10],
              'min_child_weight':[1],
              'gamma':[0.05, 0.1, 0.15],
              'subsample':[0.6, 0.7, 0.8],
              'colsample_bytree':[0.8, 0.9, 1.0],
              'scale_pos_weight': [1]
           }

In [28]:
# create randomized search to find best hyper parameters
xgbc = XGBClassifier()
search = GridSearchCV(xgbc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [29]:
# get best grid search hyperparamters
best_params = search.best_params_
print(best_params)

{'colsample_bytree': 0.8, 'gamma': 0.15, 'max_depth': 10, 'min_child_weight': 1, 'scale_pos_weight': 1, 'subsample': 0.7}


In [30]:
# fit the model with the best hyperparameters 
xgbc_best = XGBClassifier(**best_params) 
xgbc_best.fit(X_train, y_train)

# make predictions 
y_pred = xgbc_best.predict(X_test)

In [31]:
# evaluate model using accuracy, precision, recall
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

Accuracy: 0.8181818181818182
Confusion Matrix:
 [[32  5]
 [ 9 31]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.86      0.82        37
           1       0.86      0.78      0.82        40

    accuracy                           0.82        77
   macro avg       0.82      0.82      0.82        77
weighted avg       0.82      0.82      0.82        77



In [32]:
# prepare news dataset
X = news.drop(columns=['headline','headline_processed','source'])

In [33]:
# scale news dataset
X = scaler.fit_transform(X)

In [35]:
# predict sensational headlines, add class to dataset
y_pred = xgbc_best.predict(X)
news['sensational'] = y_pred
news = news[['sensational'] + [ col for col in news.columns if col != 'sensational' ] ]

In [37]:
# export results of xgboost classifier
news.to_csv('xgbc_predictions.csv',index=False)