In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# import baseline, news data sets for classification via random forest
baseline = pd.read_csv('../../data/baseline_features.csv')
news = pd.read_csv('../../data/news_features.csv')
baseline.head(1)

Unnamed: 0,source,headline,headline_processed,headline_length,unique_words,buzzword_frequency,word_length,word_length_processed,upper_words,sensational
0,Associated Press,Israeli airstrikes on Gaza resume after weeklo...,isra airstrik gaza resum weeklong truce hama end,11,0.693147,0.0,5.454545,5.125,0.241162,0


In [3]:
# declare feature, target variables
X = baseline.drop(columns=['source','headline','headline_processed','sensational'])
y = baseline['sensational']

In [4]:
# split data into train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [5]:
# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [6]:
# create randomized search hyperparameter options
param_grid = {'max_depth':[3, 5, 10, None],
              'n_estimators':[50, 100, 200],
              'max_features':['sqrt','log2', 1],
              'min_samples_leaf':[1, 2, 3],
              'min_samples_split':[0.5, 2, 4]
           }

In [7]:
# create randomized search to find best hyper parameters
rfc = RandomForestClassifier()

search = RandomizedSearchCV(rfc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [8]:
# get best randomized search hyperparamters
best_params = search.best_params_
print(best_params)

{'n_estimators': 100, 'min_samples_split': 0.5, 'min_samples_leaf': 2, 'max_features': 1, 'max_depth': 5}


In [11]:
# create more specific grid search hyperparameter options
param_grid = {'max_depth':[4, 5, 6],
              'n_estimators':[75, 100, 125],
              'max_features':['sqrt', 1, 2],
              'min_samples_leaf':[1, 2, 3],
              'min_samples_split':[0.25, 0.5 , 0.75]
           }

In [12]:
# create grid search to find best hyper parameters
rfc = RandomForestClassifier()

search = GridSearchCV(rfc, param_grid, cv=5, scoring='accuracy') 
search.fit(X_train, y_train)

In [13]:
# get best grid search hyperparamters
best_params = search.best_params_
print(best_params)

{'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 0.25, 'n_estimators': 75}


In [14]:
# fit the model with the best hyperparameters 
rfc_best = RandomForestClassifier(**best_params) 
rfc_best.fit(X_train, y_train)

# make predictions 
y_pred = rfc_best.predict(X_test)

In [15]:
# evaluate model using accuracy, precision, recall
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

Accuracy: 0.8311688311688312
Confusion Matrix:
 [[26 10]
 [ 3 38]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.72      0.80        36
           1       0.79      0.93      0.85        41

    accuracy                           0.83        77
   macro avg       0.84      0.82      0.83        77
weighted avg       0.84      0.83      0.83        77



In [16]:
# prepare news dataset
X = news.drop(columns=['headline','headline_processed','source'])

In [17]:
# scale news data set
X = scaler.fit_transform(X)

In [18]:
# predict sensational headlines, add class to dataset
y_pred = rfc_best.predict(X)
news['sensational'] = y_pred
news = news[['sensational'] + [ col for col in news.columns if col != 'sensational' ] ]

In [19]:
# view predictions by news source
news.groupby(['source','sensational']).size()

source                        sensational
ABC News                      0                947
                              1                571
Al Jazeera English            0                802
                              1                485
AllAfrica - Top Africa News   1                 11
Android Central               0                218
                              1                180
BBC News                      0                916
                              1                574
Boing Boing                   0                383
                              1                231
Business Insider              0                882
                              1                836
CNA                           0                510
                              1                131
CNN                           0                124
                              1                 83
Deadline                      0                283
                              1         

In [20]:
# export results of random forest version 1
news.to_csv('rfc1.csv',index=False)