In [55]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [56]:
# import baseline, news data sets for classification via random forest
baseline = pd.read_csv('../../data/baseline_features.csv')
news = pd.read_csv('../../data/news_features.csv')
baseline.head(1)

Unnamed: 0,source,headline,headline_processed,length,unique,frequency,sensational
0,Associated Press,Israeli airstrikes on Gaza resume after weeklo...,isra airstrik gaza resum weeklong truce hama end,2.484907,0.693147,0.0,0


In [57]:
# declare feature, target variables
X = baseline.drop(columns=['source','headline','headline_processed','sensational'])
y = baseline['sensational']

In [58]:
# split data into train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [59]:
# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [61]:
# create randomized search hyperparameter options
param_grid = {'max_depth':[3, 5, 10, None],
              'n_estimators':[50, 100, 200],
              'max_features':['sqrt','log2', 1],
              'min_samples_leaf':[1, 2, 3],
              'min_samples_split':[0.5, 2, 4]
           }

In [62]:
# create randomized search to find best hyper parameters
rfc = RandomForestClassifier()

search = RandomizedSearchCV(rfc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [63]:
# get best randomized search hyperparamters
best_params = search.best_params_
print(best_params)

{'n_estimators': 50, 'min_samples_split': 0.5, 'min_samples_leaf': 3, 'max_features': 'sqrt', 'max_depth': 5}


In [66]:
# create more specific grid search hyperparameter options
param_grid = {'max_depth':[4, 5, 6],
              'n_estimators':[25, 50, 75],
              'max_features':['sqrt'],
              'min_samples_leaf':[3, 4, 5],
              'min_samples_split':[0.25, 0.5 , 0.75]
           }

In [67]:
# create grid search to find best hyper parameters
rfc = RandomForestClassifier()

search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [68]:
# get best grid search hyperparamters
best_params = search.best_params_
print(best_params)

{'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 0.5, 'n_estimators': 50}


In [69]:
# Fit the model with the best hyperparameters 
rfc_best = RandomForestClassifier(**best_params) 
rfc_best.fit(X_train, y_train)

# Make predictions 
y_pred = rfc_best.predict(X_test)

In [70]:
# evaluate model using accuracy, precision, recall
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

Accuracy: 0.6753246753246753
Confusion Matrix:
 [[39  4]
 [21 13]]
Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.91      0.76        43
           1       0.76      0.38      0.51        34

    accuracy                           0.68        77
   macro avg       0.71      0.64      0.63        77
weighted avg       0.70      0.68      0.65        77



In [71]:
# prepare news dataset
X = news.drop(columns=['headline','headline_processed','source'])

In [73]:
# predict sensational headlines, add class to dataset
y_pred = rfc_best.predict(X)
news['sensational'] = y_pred
news = news[['sensational'] + [ col for col in news.columns if col != 'sensational' ] ]



In [74]:
# view predictions by news source
news.groupby(['source','sensational']).size()

source                        sensational
ABC News                      1               1518
Al Jazeera English            1               1287
AllAfrica - Top Africa News   1                 11
Android Central               1                398
BBC News                      1               1490
Boing Boing                   1                614
Business Insider              1               1718
CNA                           1                641
CNN                           1                207
Deadline                      1                708
Digital Trends                1                577
ETF Daily News                1              12290
Euronews                      1                277
Forbes                        1               1806
Gizmodo.com                   1                324
Globalsecurity.org            1               2272
GlobeNewswire                 1               3278
International Business Times  1                782
Marketscreener.com            1         

In [None]:
# export results of random forest version 1
news.to_csv('rfc1.csv',index=False)