# Modelling

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (confusion_matrix, classification_report, plot_roc_curve, roc_auc_score, 
accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, average_precision_score)
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, 
                              AdaBoostClassifier)
from sklearn.svm import SVC

import xgboost as xgb
from xgboost import XGBClassifier

In this notebook, we'll carry out the following steps:

- Run a Train-Test-Split on our data
- Run data through a pipeline and fit various models to our training data
- Select the best model and tune hyper-parameters
- Evaluate model based on accuracy and ROC-AUC

We'll look to test a range of classification techniques including Logistic Regression, Random Forest, Extra Trees, Boosting (AdaBoost, Gradient Boosting and XGBoost) and Support Vector Machine (SVM) classification.

Accuracy and ROC-AUC will be our main metrics -- we want to minimize both false negatives and false positives, but we also want to know how good our models are at separating our positive and negative class.

It's worth noting that in previous iterations of my modelling I tried modelling with purely text-based features (headline + abstract). This returned an unsatisfactory accuracy and ROC-AUC score.

In [None]:
# Load the processed training file, then keep only the numeric columns and
# drop 'n_comments' when it is present.
# NOTE(review): `eval` executes whatever text the 'keywords' cells contain, so
# this is only acceptable for a trusted file (ast.literal_eval is the safe
# parser for literal lists).
train = pd.read_csv(
    '/kaggle/input/new-york-times-articles-feature-engineering/train_processed.csv',
    converters={'keywords': eval},
    parse_dates=['pub_date'],
)
train = train._get_numeric_data()
train = train.drop(columns='n_comments', errors='ignore')

In [None]:
# Load the processed test file with the same parsing options as the training file.
# NOTE(review): `eval` executes the raw 'keywords' text -- trusted input only.
test = pd.read_csv(
    '/kaggle/input/new-york-times-articles-feature-engineering/test_processed.csv',
    converters={'keywords': eval},
    parse_dates=['pub_date'],
)

In [None]:
# Set the true labels aside, then reduce the test frame to numeric columns
# without 'n_comments' and without the target itself.
final_actual = test['is_popular']
test = test._get_numeric_data()
test = test.drop(columns=['n_comments', 'is_popular'], errors='ignore')

## Train Test Split

In [None]:
# Split the training frame into the target vector and the feature matrix
y = train['is_popular']
X = train.drop(columns=['is_popular'])

In [None]:
# Hold out 30% of the training rows for validation; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Instantiate Models

In [None]:
# Column labels of the training feature matrix
feature_names = X_train.columns

In [None]:
# Candidate classifiers, keyed by the short name that run_model() uses both to
# look the estimator up and to name its pipeline step.
models = {
    'lr': LogisticRegression(max_iter=5_000, random_state=42, solver='saga'),
    'rf': RandomForestClassifier(random_state=42),
    'gb': GradientBoostingClassifier(random_state=42),
    'et': ExtraTreesClassifier(random_state=42),
    'ada': AdaBoostClassifier(random_state=42),
    # probability=True is required because run_model() calls predict_proba
    'svc': SVC(random_state=42, probability=True),
    # Uses the directly imported class so this cell does not depend on the
    # name `xgb` still referring to the xgboost module when it is (re-)run.
    'xgb': XGBClassifier(seed=42, use_label_encoder=False, eval_metric='auc'),
}

## Model Function Setup

To get various feature importance scores from XGBoost, I created a custom transformer that scales all features with StandardScaler and returns it as a dataframe (instead of an array).

In [None]:
class CustomTransformer:
    """Minimal pipeline step that delegates ``transform`` to a plain callable.

    ``fit`` learns nothing and simply returns the instance; all the work
    happens inside the wrapped function each time ``transform`` is called.
    """

    def __init__(self, func):
        # Callable applied to the data in transform()
        self.func = func

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return this instance."""
        return self

    def transform(self, input_df, **transform_params):
        """Return ``func(input_df)``; extra keyword arguments are ignored."""
        transformed = self.func(input_df)
        return transformed

# Standardise every column of a dataframe and hand the result back as a dataframe
def scale_df(input_df):
    """Return a standardised version of ``input_df`` as a DataFrame.

    A new ``StandardScaler`` is fitted on ``input_df`` itself on every call.
    The scaled values are wrapped in a DataFrame that carries the original
    column labels and a fresh default index.

    NOTE(review): because the scaler is re-fitted on whatever frame is passed
    in, every dataset is standardised with its own statistics; nothing learnt
    from the training data is reused at prediction time -- confirm this is
    intended.
    """
    scaled_values = StandardScaler().fit_transform(input_df)
    return pd.DataFrame(scaled_values, columns=input_df.columns)

In [None]:
# Instantiate lists to store results
# run_model() appends one metrics dict per call: fits without grid search go
# to init_list, grid-searched fits go to gs_list.
init_list = []
gs_list = []

# Function to run model -- input scaler and model
def run_model(mod, mod_params=None, grid_search=False):
    """Fit one candidate model inside a scaling pipeline and report its metrics.

    Parameters
    ----------
    mod : str
        Key into the module-level ``models`` dict. It is also used as the
        pipeline step name, so grid keys must be prefixed with ``'<mod>__'``.
    mod_params : dict or list of dict, optional
        Parameter grid passed to ``GridSearchCV`` when ``grid_search`` is
        True. ``None`` (the default) is treated as an empty grid.
    grid_search : bool, default False
        When True, run a 3-fold grid search scored by ROC-AUC on the training
        split and keep its best estimator; otherwise fit the pipeline as is.

    Returns
    -------
    Pipeline
        The fitted pipeline (the best estimator of the search when
        ``grid_search`` is True).

    Side effects
    ------------
    Appends the metrics dict to ``gs_list`` (grid search) or ``init_list``
    (plain fit), and prints/displays the metrics and confusion-matrix counts.
    Reads the module-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    """
    # Build the empty grid per call instead of sharing a mutable default
    if mod_params is None:
        mod_params = {}

    # Initial dictionary to hold model results
    results = {}

    pipe = Pipeline([
            ('ss', CustomTransformer(scale_df)),
            (mod, models[mod])
            ])

    if grid_search:
        # Search the grid with 3-fold CV ranked by ROC-AUC, then continue with
        # the best pipeline found
        gs = GridSearchCV(pipe, param_grid=mod_params, cv=3, verbose=1, scoring='roc_auc', n_jobs=-1)
        gs.fit(X_train, y_train)
        pipe = gs.best_estimator_

    else:
        pipe.fit(X_train, y_train)

    # Retrieve metrics
    predictions = pipe.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    # Second predict_proba column is used as the score for ROC-AUC
    y_test_pred_prob = pipe.predict_proba(X_test)[:,1]
    y_train_pred_prob = pipe.predict_proba(X_train)[:,1]

    results['model'] = mod
    results['train_auc'] = roc_auc_score(y_train, y_train_pred_prob)
    results['test_auc'] = roc_auc_score(y_test, y_test_pred_prob)
    results['precision'] = precision_score(y_test, predictions)
    results['specificity'] = tn / (tn + fp)
    results['recall'] = recall_score(y_test, predictions)
    results['f_score'] = f1_score(y_test, predictions)

    if grid_search:
        gs_list.append(results)
        print('### BEST PARAMS ###')
        # pipe[1] is the model step (index 0 is the scaler)
        display(pipe[1])

    else:
        init_list.append(results)

    print('### METRICS ###')
    display(results)

    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")

    return pipe

## Initial Run

In [None]:
# Baseline fit of models['lr'] (no grid search); metrics are appended to init_list
lr = run_model('lr')

In [None]:
# Baseline fit of models['rf'] (no grid search); metrics are appended to init_list
rf = run_model('rf')

In [None]:
# Baseline fit of models['et'] (no grid search); metrics are appended to init_list
et = run_model('et')

In [None]:
# Baseline fit of models['gb'] (no grid search); metrics are appended to init_list
gb = run_model('gb')

In [None]:
# Baseline fit of models['svc'] (no grid search); metrics are appended to init_list
svc = run_model('svc')

In [None]:
# Baseline fit of models['ada'] (no grid search); metrics are appended to init_list
ada = run_model('ada')

In [None]:
# Baseline fit of models['xgb'] (no grid search); metrics are appended to init_list.
# Stored as `xgb_init` so the `xgb` module alias from `import xgboost as xgb`
# is not overwritten by the fitted pipeline.
xgb_init = run_model('xgb')

In [None]:
# Baseline leaderboard: one row per entry in init_list, highest test ROC-AUC first
(
    pd.DataFrame(init_list)
    .sort_values(by='test_auc', ascending=False)
    .reset_index(drop=True)
)

## Hyperparameter Tuning

### Logistic Regression

In [None]:
# Logistic regression search space, split into two sub-grids.
# `l1_ratio` is only consulted for the elastic-net penalty, so crossing it
# with 'l1'/'l2' just repeats identical fits. An elastic net with l1_ratio 0
# or 1 is the same model as pure 'l2' or 'l1', so only the mixed value 0.5 is
# kept in the elastic-net sub-grid.
lr_params = [
    {
        # Pure ridge / lasso regularization
        'lr__penalty': ['l2', 'l1'],

        # Trying different alphas of: 10, 1, 0.1  (C = 1/alpha)
        'lr__C': [0.1, 1, 10],
    },
    {
        # Even mix of L1 and L2
        'lr__penalty': ['elasticnet'],
        'lr__l1_ratio': [0.5],

        # Trying different alphas of: 10, 1, 0.1  (C = 1/alpha)
        'lr__C': [0.1, 1, 10],
    },
]

In [None]:
# Grid-search the 'lr' pipeline over lr_params; metrics are appended to gs_list
lr_gs = run_model('lr', mod_params=lr_params, grid_search=True)

### SVC

In [None]:
# SVC search space; keys carry the 'svc__' prefix of the pipeline step
svc_params = {
    # Regularization strength
    'svc__C': [0.05, 1],
    # Kernel coefficient
    'svc__gamma': [0.05, 0.1],
    # Kernel functions to compare
    'svc__kernel': ['rbf', 'sigmoid'],
}

In [None]:
# Grid-search the 'svc' pipeline over svc_params; metrics are appended to gs_list
svc_gs =  run_model('svc', mod_params=svc_params, grid_search=True)

### Random Forest

In [None]:
# Random forest search space; keys carry the 'rf__' prefix of the pipeline step
rf_params = {
    # Number of trees
    'rf__n_estimators': [100, 200],
    # Maximum depth of each tree
    'rf__max_depth': [20, 25, 30, 40],
    # Minimum samples required in a leaf
    'rf__min_samples_leaf': [2, 3, 4],
}

In [None]:
# Grid-search the 'rf' pipeline over rf_params; metrics are appended to gs_list
rf_gs =  run_model('rf', mod_params=rf_params, grid_search=True)

### Extra Trees

In [None]:
# Extra trees search space; keys carry the 'et__' prefix of the pipeline step
et_params = {
    # Number of trees
    'et__n_estimators': [100, 200],
    # Maximum depth of each tree
    'et__max_depth': [20, 25, 30, 35, 40, 50],
    # Minimum samples required in a leaf
    'et__min_samples_leaf': [2, 3, 4],
}

In [None]:
# Grid-search the 'et' pipeline over et_params; metrics are appended to gs_list
et_gs =  run_model('et', mod_params=et_params, grid_search=True)

### Adaptive Boosting

In [None]:
# AdaBoost search space; keys carry the 'ada__' prefix of the pipeline step
ada_params = {
    # Single fixed ensemble size
    'ada__n_estimators': [500],
    # Learning rates to compare
    'ada__learning_rate': [0.05, 0.1, 0.2, 0.9],
}

In [None]:
# Grid-search the 'ada' pipeline over ada_params; metrics are appended to gs_list
ada_gs = run_model('ada', mod_params=ada_params, grid_search=True)

### Gradient Boosting

In [None]:
# Gradient boosting search space; keys carry the 'gb__' prefix of the pipeline step
gb_params = {
    # Number of boosting stages
    'gb__n_estimators': [500, 1000],
    # Learning rates to compare
    'gb__learning_rate': [0.1, 0.2, 0.3],
}

In [None]:
# Grid-search the 'gb' pipeline over gb_params; metrics are appended to gs_list
gb_gs = run_model('gb', mod_params=gb_params, grid_search=True)

### XGBoost

In [None]:
# XGBoost search space; keys carry the 'xgb__' prefix of the pipeline step
xgb_params = {
    'xgb__learning_rate': [0.05, 0.1],
    'xgb__max_depth': [6, 7],
    # Single fixed number of boosting rounds
    'xgb__n_estimators': [100],
    # L1 / L2 regularization terms, each off (0) or on (2)
    'xgb__reg_alpha': [0, 2],
    'xgb__reg_lambda': [0, 2],
    'xgb__gamma': [0, 2],
}

In [None]:
# Grid-search the 'xgb' pipeline over xgb_params; metrics are appended to gs_list
xgb_gs = run_model('xgb', mod_params=xgb_params, grid_search=True)

## ROC-AUC Evaluation

In [None]:
# One row per grid-searched model (the dicts run_model() appended to gs_list)
gs_df = pd.DataFrame(gs_list)
# Display only: the sorted copy is shown, gs_df itself keeps insertion order
gs_df.sort_values(by='test_auc', ascending=False)

In [None]:
# Maps each tuned pipeline object (used directly as the dict key) to the label
# shown in the ROC legend. Insertion order matters: with plot_top=True,
# roc_curve_plotter() greys out every entry except the first one.
gs_dict = {
    xgb_gs: 'XGBoostClassifier',
    et_gs: 'ExtraTreeClassifier',
    rf_gs: 'RandomForest',
    gb_gs: 'GradientBoostingClassifier',
    svc_gs: 'SupportVectorMachineClf',
    lr_gs: 'LogisticRegression',
    ada_gs: 'AdaBoostClassifier',
}

In [None]:
def roc_curve_plotter(model_dict, plot_top=False):
    """Draw the ROC curves of several fitted models on one shared axis.

    Parameters
    ----------
    model_dict : dict
        Maps each fitted estimator to the legend label for its curve. Curves
        are drawn in the dict's iteration order.
    plot_top : bool, default False
        When True, every curve except the first is recoloured gray with
        alpha 0.25 so that the first model stands out.

    The curves are computed on the module-level ``X_test`` / ``y_test``. A
    dashed diagonal labelled 'Random Guess' is added for reference.

    NOTE(review): ``plot_roc_curve`` was removed in scikit-learn 1.2; this
    function needs an older release -- confirm the pinned version.
    """
    _, ax = plt.subplots(1, 1, figsize=(12,10))
    curves = [
        plot_roc_curve(estimator, X_test, y_test, ax=ax, name=label)
        for estimator, label in model_dict.items()
    ]

    if plot_top:
        # Fade everything after the first curve
        for faded in curves[1:]:
            faded.line_.set_color('gray')
            faded.line_.set_alpha(0.25)

    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='--', label='Random Guess')
    plt.title('ROC-AUC Curve', fontsize=22)
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.legend(fontsize=12)

Performance is pretty close between our ensemble tree models and XGBoost -- we can see that XGBoost has a TPR of 0.8 to a FPR of roughly 0.21.

In [None]:
# Plot all tuned models; plot_top=True keeps only the first entry of gs_dict in colour
roc_curve_plotter(gs_dict, True)

## Testing against unseen data

At this point, we take our tuned models and evaluate them on "unseen" data: the separate test file we loaded at the start of the notebook, which has not been used for training or tuning.

In [None]:
def evaluate_model(model):
    """Print accuracy, precision, recall and ROC-AUC of ``model`` on the test file.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict`` and ``predict_proba``.

    Reads the module-level ``test`` (feature frame) and ``final_actual``
    (true labels). Returns nothing; the four metrics are printed with three
    decimals.
    """
    final_preds = model.predict(test)
    # Second predict_proba column is used as the score for ROC-AUC
    final_proba = model.predict_proba(test)[:, 1]
    print(f'Accuracy: {accuracy_score(final_actual, final_preds):.3f}')
    print(f'Precision: {precision_score(final_actual, final_preds):.3f}')
    print(f'Recall: {recall_score(final_actual, final_preds):.3f}')
    print(f'ROC-AUC: {roc_auc_score(final_actual, final_proba):.3f}')

In [None]:
# Test-file metrics for the tuned XGBoost pipeline
evaluate_model(xgb_gs)

In [None]:
# Test-file metrics for the tuned random forest pipeline
evaluate_model(rf_gs)

In [None]:
# Test-file metrics for the tuned extra trees pipeline
evaluate_model(et_gs)

#### Refit models on entire dataset

In [None]:
# Refit three of the tuned pipelines in place on all labelled training rows.
# X contains the rows of X_test, so from here on these objects have seen the
# validation split; the trailing semicolon suppresses the cell output.
xgb_gs.fit(X, y)
rf_gs.fit(X, y)
et_gs.fit(X, y);

Performance for our top models is pretty close -- but looks like XGBoost wins. It has the highest accuracy and AUC-ROC score.

In [None]:
# Test-file metrics for the XGBoost pipeline after the refit on X, y
evaluate_model(xgb_gs)

In [None]:
# Test-file metrics for the random forest pipeline after the refit on X, y
evaluate_model(rf_gs)

In [None]:
# Test-file metrics for the extra trees pipeline after the refit on X, y
evaluate_model(et_gs)

### Misclassification Analysis

In [None]:
# Score the test file with the random forest pipeline: hard labels plus the
# second predict_proba column as the score
final_preds = rf_gs.predict(test)
final_proba = rf_gs.predict_proba(test)[:, 1]

In [None]:
# Side-by-side table of true label, predicted label and score.
# Built column-wise so each column keeps its own dtype, instead of stacking
# the three vectors as rows and transposing (which upcasts the labels to float).
predictions = pd.DataFrame({
    'actual': final_actual,
    'predicted': final_preds,
    'proba': final_proba,
})

In [None]:
# Display the actual / predicted / proba table
predictions

In [None]:
# Keep only the rows where the predicted label disagrees with the true label
wrong_predictions = predictions.loc[predictions['actual'].ne(predictions['predicted'])]

In [None]:
# Re-read the full processed test file (all columns) to describe the
# misclassified rows.
# NOTE(review): `eval` executes the raw 'keywords' text -- trusted input only.
init_test = pd.read_csv(
    '/kaggle/input/new-york-times-articles-feature-engineering/test_processed.csv',
    converters={'keywords': eval},
    parse_dates=['pub_date'],
)

In [None]:
# Attach the original test columns to each misclassified row, matching on the row index
wrong_predictions = wrong_predictions.merge(init_test, left_index=True, right_index=True)

In [None]:
# Narrow the table to the prediction columns plus the descriptive article fields
wrong_predictions = wrong_predictions.loc[:, [
    'actual',
    'predicted',
    'proba',
    'newsdesk',
    'section',
    'subsection',
    'headline',
    'abstract',
    'keywords',
]]

In [None]:
# Every row here is misclassified, so the true label tells the error type:
# 1 = false negatives (actual 1, predicted 0), 0 = false positives (actual 0, predicted 1)
wrong_predictions['actual'].value_counts()

In [None]:
# Looking at false positives
# Ten most confident mistakes: rank by score first, then take the top ten
# (taking head(10) before sorting would only show the first ten rows in file order)
wrong_predictions[wrong_predictions['proba'] > 0.80].sort_values(by='proba', ascending=False).head(10)

In [None]:
# Looking at false negatives
# Ten lowest-scored mistakes: rank by score first, then take the top ten
# (taking head(10) before sorting would only show the first ten rows in file order)
wrong_predictions[wrong_predictions['proba'] < 0.4].sort_values(by='proba').head(10)

## Model Insights

Generally, our most important features regardless of model include newsdesk and section popularity. It goes to show that you can't ignore where an article is posted or placed when it comes to predicting popularity.

### Random Forest Feature Importance

In [None]:
# Feature importances of the whole random forest, plotted as horizontal bars.
# rf_gs.steps[1][1] is the forest itself. Indexing the pipeline twice
# (rf_gs[1][1]) would first return the forest and then pick a single tree out
# of it, so the chart would show one tree's importances instead.
plt.figure(figsize=(10,12))
rf_feature_imp = pd.Series(rf_gs.steps[1][1].feature_importances_, index=X.columns).sort_values(ascending=False)
# Ascending order for the plot so the most important feature ends up on top
rf_feature_imp.sort_values().plot(kind = 'barh')

### XGBoost Feature Importance

In [None]:
# Per-feature 'gain' scores from the XGBoost step of the pipeline.
# get_booster() is the public accessor for the fitted Booster.
scores_dict = xgb_gs.steps[1][1].get_booster().get_score(importance_type='gain')

In [None]:
# 'total_cover' importance per feature from the XGBoost step, largest first.
# get_booster() is the public accessor for the fitted Booster.
total_cover = (
    pd.Series(
        xgb_gs.steps[1][1].get_booster().get_score(importance_type='total_cover'),
        name='total cover',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='total cover', ascending=False)
)

In [None]:
# 'total_gain' importance per feature from the XGBoost step, largest first.
# get_booster() is the public accessor for the fitted Booster.
total_gain = (
    pd.Series(
        xgb_gs.steps[1][1].get_booster().get_score(importance_type='total_gain'),
        name='total gain',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='total gain', ascending=False)
)

In [None]:
# 'gain' importance per feature from the XGBoost step, largest first.
# get_booster() is the public accessor for the fitted Booster.
gain_df = (
    pd.Series(
        xgb_gs.steps[1][1].get_booster().get_score(importance_type='gain'),
        name='gain',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='gain', ascending=False)
)

In [None]:
# 'weight' importance per feature from the XGBoost step, largest first.
# get_booster() is the public accessor for the fitted Booster.
weight_df = (
    pd.Series(
        xgb_gs.steps[1][1].get_booster().get_score(importance_type='weight'),
        name='weight',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='weight', ascending=False)
)

In [None]:
# 'cover' importance per feature from the XGBoost step, largest first.
# get_booster() is the public accessor for the fitted Booster.
cover_df = (
    pd.Series(
        xgb_gs.steps[1][1].get_booster().get_score(importance_type='cover'),
        name='cover',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values(by='cover', ascending=False)
)

In [None]:
# Horizontal bar chart of gain_df: features on the y-axis, 'gain' on the x-axis
plt.figure(figsize=(8,12))
sns.barplot(data=gain_df, y='feature', x='gain', orient='h', palette='Reds_r')
plt.ylabel('');
plt.title('Gain', fontsize=18);

In [None]:
# Horizontal bar chart of weight_df: features on the y-axis, 'weight' on the x-axis
plt.figure(figsize=(8,12))
sns.barplot(data=weight_df, y='feature', x='weight', orient='h', palette='Blues_r')
plt.ylabel('');
plt.title('Weight', fontsize=18);

In [None]:
# Horizontal bar chart of cover_df: features on the y-axis, 'cover' on the x-axis
plt.figure(figsize=(8,12))
sns.barplot(data=cover_df, y='feature', x='cover', orient='h', palette='Greens_r')
plt.ylabel('');
plt.title('Cover', fontsize=18);

In [None]:
# Horizontal bar chart of total_gain: features on the y-axis, 'total gain' on the x-axis
plt.figure(figsize=(8,12))
sns.barplot(data=total_gain, y='feature', x='total gain', orient='h', palette='Oranges_r')
plt.ylabel('');
plt.xlabel('Total Gain', fontsize=12)
plt.title('Total Gain', fontsize=18)
plt.tight_layout()

In [None]:
# Horizontal bar chart of total_cover: features on the y-axis, 'total cover' on the x-axis
plt.figure(figsize=(8,12))
sns.barplot(data=total_cover, y='feature', x='total cover', orient='h', palette='Purples_r')
plt.ylabel('');
plt.xlabel('Total Cover', fontsize=12)
plt.title('Total Cover', fontsize=18);