In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [2]:
# import baseline, news data sets for classification via xgboost
baseline = pd.read_csv('../../data/baseline_features.csv')
news = pd.read_csv('../../data/news_features.csv')
baseline.head(1)

Unnamed: 0,source,headline,headline_processed,length,unique,frequency,sensational
0,Associated Press,Israeli airstrikes on Gaza resume after weeklo...,isra airstrik gaza resum weeklong truce hama end,2.484907,0.693147,0.0,0


In [3]:
# declare feature, target variables
X = baseline.drop(columns=['source','headline','headline_processed','sensational'])
y = baseline['sensational']

In [4]:
# split data into train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18)

In [6]:
# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [7]:
# create randomized search hyperparameter options
param_grid = {'max_depth':[3, 6, 9],
              'min_child_weight':[1],
              'gamma':[0, 1, 2],
              'min_samples_leaf':[1, 2, 3],
              'min_samples_split':[0.5, 2, 4],
              'random_state': [18]
           }

In [8]:
xgbc = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')

In [9]:
xgbc.fit(X_train, y_train)

In [10]:
# predict test set
y_pred = xgbc.predict(X_test)

In [11]:
# evaluate model using accuracy, precision, recall
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

Accuracy: 0.6363636363636364
Confusion Matrix:
 [[30  5]
 [23 19]]
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.86      0.68        35
           1       0.79      0.45      0.58        42

    accuracy                           0.64        77
   macro avg       0.68      0.65      0.63        77
weighted avg       0.69      0.64      0.62        77



In [12]:
# prepare news dataset
X = news.drop(columns=['headline','headline_processed','source'])

In [13]:
# predict sensational headlines, add class to dataset
y_pred = xgbc.predict(X)
news['sensational'] = y_pred
news = news[['sensational'] + [ col for col in news.columns if col != 'sensational' ] ]

In [14]:
# view predictions by news source
news.groupby(['source','sensational']).size()

source                        sensational
ABC News                      1               1518
Al Jazeera English            1               1287
AllAfrica - Top Africa News   1                 11
Android Central               1                398
BBC News                      1               1490
Boing Boing                   1                614
Business Insider              1               1718
CNA                           1                641
CNN                           1                207
Deadline                      1                708
Digital Trends                1                577
ETF Daily News                1              12290
Euronews                      1                277
Forbes                        1               1806
Gizmodo.com                   1                324
Globalsecurity.org            1               2272
GlobeNewswire                 1               3278
International Business Times  1                782
Marketscreener.com            1         