In [14]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler

In [15]:
baseline = pd.read_csv('../../data/baseline_features.csv')
news = pd.read_csv('../../data/news_features.csv')
baseline.head(1)

Unnamed: 0,source,headline,headline_processed,length,unique,frequency,class
0,Reuters,Germany's Kuehne examines offer for Signa's Ha...,germani kuehn examin offer signa hamburg skysc...,2.397895,0.693147,0.0,0


In [16]:
# declare feature (X), target (y) variables
X = baseline.drop(columns=['source','headline','headline_processed','class'])
y = baseline['class']

In [17]:
# split data into train, test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 18)

# check split data shape
X_train.shape, X_test.shape

((231, 3), (58, 3))

In [18]:
# standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [19]:
# create classifier 
rfc = RandomForestClassifier(n_estimators=100, random_state=18)

In [20]:
# fit model
rfc.fit(X_train, y_train)

In [21]:
# predict test set
y_pred = rfc.predict(X_test)

In [24]:
# check feature scores for feature selection
feature_scores = pd.Series(rfc.feature_importances_).sort_values(ascending=False)
feature_scores

2    0.484387
0    0.415307
1    0.100306
dtype: float64

In [25]:
# evaluate model using accuracy, precision, recall
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

Accuracy: 0.5517241379310345
Confusion Matrix:
 [[ 7 13]
 [13 25]]
Classification Report:
               precision    recall  f1-score   support

           0       0.35      0.35      0.35        20
           1       0.66      0.66      0.66        38

    accuracy                           0.55        58
   macro avg       0.50      0.50      0.50        58
weighted avg       0.55      0.55      0.55        58



In [26]:
# selecting random forest hyperparameters, model for tuning via grid search cv
param_grid = {
    'n_estimators': [10, 50, 100, 300, 500],
    'max_depth': [None,'5','20','35','50'],
    'max_features': [None,'sqrt','log2'],
    'min_samples_leaf': [0.01, 0.1, 1, 5, 10],
    'min_samples_split': [0.01, 0.1, 1, 5, 10],
    'max_leaf_nodes': [None, 10, 100, 500, 1000],
    'random_state': [18]
}

model =  RandomForestClassifier()

In [27]:
# create grid search model
grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# get the best parameters 
best_params = grid_search.best_params_

In [None]:
# fit the model with the best hyperparameters 
randfor_best = RandomForestClassifier(**best_params) 
randfor_best.fit(X_train, y_train)

In [None]:
# make new predictions 
y_pred = randfor_best.predict(X_test)

In [None]:
# evaluate tuned model 
accuracy = accuracy_score(y_test, y_pred) 
conf_matrix = confusion_matrix(y_test, y_pred) 
classification_rep = classification_report(y_test, y_pred) 
print("Accuracy:", accuracy) 
print("Confusion Matrix:\n", conf_matrix) 
print("Classification Report:\n", classification_rep)

In [None]:
# create classification model for xgboost
dtrain_clf = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(X_test, y_test, enable_categorical=True)