## Importing basic tools

In [2]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

## Load the preprocessed data

In [3]:
# Read the preprocessed dataset, then discard the identifier/date columns
# that are not used for fitting (as mentioned in the report).
df = pd.read_csv('../data/data_pp.csv')
non_feature_cols = ['date_day', 'id', 'album_id', 'date_month', 'decade']
df_fit = df.drop(columns=non_feature_cols)

## Principal Component Analysis

In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Separate the 'billboard' target from the remaining feature columns.
y = df_fit['billboard']
X = df_fit.drop(columns=['billboard'])

# TimeSeriesSplit yields successive (other, test) index pairs; only the
# final pair is kept, which is what a loop over all splits ends up with.
splitter = TimeSeriesSplit(n_splits=2)
i_other, i_test = list(splitter.split(X, y))[-1]
X_other, y_other = X.iloc[i_other], y.iloc[i_other]
X_test, y_test = X.iloc[i_test], y.iloc[i_test]

In [None]:
from sklearn.decomposition import PCA

# Project the feature matrix onto its first two principal components.
n_components = 2
columns = ['principal component ' + str(k) for k in range(n_components)]
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X)
principalDf = pd.DataFrame(data=principalComponents, columns=columns)

principalDf.head()  # not the last expression of the cell, so by default nothing is shown

# Attach the target column so the components can be grouped by class.
finalDf = pd.concat([principalDf, df_fit[['billboard']]], axis=1)
finalDf.columns

### Plot the first two principal components

In [None]:
# Scatter the two principal components, one colour per 'billboard' value.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = [0.0, 1.0]
colors = ['r', 'g']
for k in range(len(targets)):
    # Boolean mask selecting the rows that belong to the current class.
    indicesToKeep = finalDf['billboard'] == targets[k]
    pc_x = finalDf.loc[indicesToKeep, 'principal component 0']
    pc_y = finalDf.loc[indicesToKeep, 'principal component 1']
    ax.scatter(pc_x, pc_y, c=colors[k], s=50)

ax.legend(targets)
ax.grid()
plt.savefig('../figures/PCA1_2.png')

---

## XGBoost

In [None]:
# Load the pickled XGBoost grid object from the results directory.
# NOTE(review): pickle.load can execute arbitrary code; only load result
# files that this project produced itself.
# The context manager closes the file even if unpickling raises.
with open('../results/grid0.xgboost', 'rb') as file:
    grid_xgboost = pickle.load(file)

In [None]:
# Display the best estimator stored on the loaded XGBoost grid object.
grid_xgboost.best_estimator_

In [None]:
# Display the column labels of the held-out test features.
X_test.columns

### Random Permutation Test

In [None]:
# Permutation test for the XGBoost grid: shuffle one feature at a time in the
# test set and record the score obtained for each of `nr_runs` repetitions.
nr_runs = 10
n_features = len(X_test.columns)
scores = np.zeros([n_features, nr_runs])

test_score = grid_xgboost.score(X_test, y_test)
print('test score = ', test_score)
# Fraction of test rows whose label equals 1.
print('test baseline = ', np.sum(y_test == 1) / len(y_test))

for i, feature in enumerate(X_test.columns):
    print('shuffling ' + str(feature))
    acc_scores = []
    for j in range(nr_runs):
        # Work on a copy so the real test set is never modified.
        X_test_shuffled = X_test.copy()
        X_test_shuffled[feature] = np.random.permutation(X_test[feature].values)
        acc_scores.append(grid_xgboost.score(X_test_shuffled, y_test))
    mean_score = np.around(np.mean(acc_scores), 8)
    spread = np.around(np.std(acc_scores), 8)
    print('   shuffled test score:', mean_score, '+/-', spread)
    scores[i] = acc_scores

In [None]:
# Box plot of the shuffled-feature scores, one box per feature, ordered by
# decreasing mean score; the vertical line marks the unshuffled test score.
mean_per_feature = np.mean(scores, axis=1)
sorted_indcs = np.argsort(mean_per_feature)[::-1]
ordered_scores = scores[sorted_indcs].T
ordered_labels = (X_test.columns)[sorted_indcs]

plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(8, 6))
plt.boxplot(ordered_scores, labels=ordered_labels, vert=False)
plt.axvline(test_score, label='test score')
plt.title("Permutation Importances (test set)")
plt.xlabel('score with perturbed feature')
plt.legend()
plt.tight_layout()
plt.savefig('../figures/feature_perm_imp_xgboost.png')
plt.show()

### Also comparing with SelectFromModel and using xgboost's plot_importance

In [None]:
from sklearn.feature_selection import SelectFromModel

from xgboost import plot_importance

# Take the entry stored under 'xgbclassifier' in the best estimator and
# plot its importances with xgboost's own helper.
xgb = grid_xgboost.best_estimator_['xgbclassifier']

plot_importance(xgb)
plt.tight_layout()
plt.savefig('../figures/feature_imp_sfm_xgboost.png')

---

## Random Forest 

In [None]:
# List the contents of the random-forest results directory.
ls ../results/randomForest/

In [4]:
# Report the best parameters, best CV score and test score of every saved
# random-forest grid.
# NOTE(review): pickle.load can execute arbitrary code; only load result
# files that this project produced itself.
for i in ['grid0.RandomForest','grid1.RandomForest','grid2.RandomForest','grid3.RandomForest']:
    # The context manager closes the file even if unpickling raises.
    with open('../results/randomForest/'+i, 'rb') as file:
        grid = pickle.load(file)
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('test score:',grid.score(X_test,y_test))

{'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_split': 5}
best CV score: 0.8950995295136275


NameError: name 'X_test' is not defined

### choosing the best score with minimum splits

In [None]:
# Load the chosen random-forest grid and pull out its classifier.
# NOTE(review): pickle.load can execute arbitrary code; only load result
# files that this project produced itself.
# The context manager closes the file even if unpickling raises.
with open('../results/randomForest/grid1.RandomForest', 'rb') as file:
    grid_randomforest = pickle.load(file)
# The best estimator is indexed by name to reach the random-forest classifier.
rndfor = grid_randomforest.best_estimator_['randomforestclassifier']

In [None]:
# Display the selected random-forest classifier.
rndfor

### Random Permutation Feature Importance

In [None]:
# Permutation test for the random-forest grid: each feature is shuffled
# `nr_runs` times in the test set and the resulting scores are stored row-wise.
nr_runs = 10
feature_names = X_test.columns
scores = np.zeros([len(feature_names), nr_runs])

test_score = grid_randomforest.score(X_test, y_test)
print('test score = ', test_score)
# Fraction of test rows whose label equals 1.
print('test baseline = ', np.sum(y_test == 1) / len(y_test))

for i in range(len(feature_names)):
    column = feature_names[i]
    print('shuffling ' + str(column))
    acc_scores = []
    for j in range(nr_runs):
        # Shuffle a single column of a fresh copy; all other columns stay intact.
        X_test_shuffled = X_test.copy()
        X_test_shuffled[column] = np.random.permutation(X_test[column].values)
        acc_scores.append(grid_randomforest.score(X_test_shuffled, y_test))
    print('   shuffled test score:', np.around(np.mean(acc_scores), 3),
          '+/-', np.around(np.std(acc_scores), 3))
    scores[i] = acc_scores

In [None]:
# Order the features by decreasing mean shuffled score and draw one
# horizontal box per feature; the vertical line is the unshuffled test score.
sorted_indcs = np.argsort(scores.mean(axis=1))[::-1]
box_data = scores[sorted_indcs].T
box_labels = (X_test.columns)[sorted_indcs]

plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(8, 6))
plt.boxplot(box_data, labels=box_labels, vert=False)
plt.axvline(test_score, label='test score')
plt.title("Permutation Importances (test set)")
plt.xlabel('score with perturbed feature')
plt.legend()
plt.tight_layout()
plt.savefig('../figures/feature_perm_imp_randfor.png')
plt.show()

### Using sklearn random forest's own `feature_importances_` metric

In [None]:
# Pair every rounded importance with its column name and print the pairs
# from most to least important.
names = X_test.columns
print("Features sorted by their score:")
ranked = sorted(
    ((round(importance, 4), name)
     for importance, name in zip(rndfor.feature_importances_, names)),
    reverse=True,
)
print(ranked)

In [None]:
# Forest-level importances, with the spread across the individual trees
# used as error bars.
importances = rndfor.feature_importances_
per_tree = [tree.feature_importances_ for tree in rndfor.estimators_]
std = np.std(per_tree, axis=0)
indices = np.argsort(importances)[::-1]

n_features = X_other.shape[1]
positions = range(n_features)

# Bar chart of the importances, most important feature first.
plt.figure()
plt.title("Feature importances")
plt.bar(positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(positions, X_other.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.savefig('../figures/feature_imp_randfor.png')
plt.show()

### Plot a single tree from the fitted forest

In [None]:
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

# Export the first tree of the forest to a Graphviz .dot file and render
# it to PNG with the external `dot` program.
dot_path = '../figures/tree_randfor.dot'
png_path = '../figures/tree_randfor.png'

estimator = rndfor.estimators_[0]

export_graphviz(
    estimator,
    out_file=dot_path,
    feature_names=X_other.columns,
    class_names=['0', '1'],
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)

# The return code of `dot` is not checked here.
call(['dot', '-Tpng', dot_path, '-o', png_path, '-Gdpi=600'])

# Display in jupyter notebook
Image(filename=png_path)

In [None]:
# Display the second estimator of the forest.
rndfor.estimators_[1]

---

## AdaBoost Classifier

In [None]:
# Report the best parameters, best CV score and test score of every saved
# AdaBoost grid.
# NOTE(review): pickle.load can execute arbitrary code; only load result
# files that this project produced itself.
for i in ['grid0.adaboost','grid1.adaboost','grid2.adaboost','grid3.adaboost']:
    # The context manager closes the file even if unpickling raises.
    with open('../results/adaboost/'+i, 'rb') as file:
        grid = pickle.load(file)
    print(grid.best_params_)
    print('best CV score:',grid.best_score_)
    print('test score:',grid.score(X_test,y_test))

### choosing the best score parameters

In [None]:
# Load the chosen AdaBoost grid and pull out its classifier.
# NOTE(review): pickle.load can execute arbitrary code; only load result
# files that this project produced itself.
# The context manager closes the file even if unpickling raises.
with open('../results/adaboost/grid1.adaboost', 'rb') as file:
    grid_adaboost = pickle.load(file)
# The best estimator is indexed by name to reach the AdaBoost classifier.
adbc = grid_adaboost.best_estimator_['adaboostclassifier']
# Last expression of the cell: display the per-estimator errors.
adbc.estimator_errors_

### Plotting the tree with the least error

In [None]:
# Display the first estimator of the AdaBoost ensemble.
adbc.estimators_[0]

In [None]:
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image

# Export the first AdaBoost estimator to a Graphviz .dot file and render
# it to PNG with the external `dot` program.
dot_path = '../figures/tree_adaboost.dot'
png_path = '../figures/tree_adaboost.png'

estimator = adbc.estimators_[0]
export_graphviz(
    estimator,
    out_file=dot_path,
    feature_names=X_other.columns,
    class_names=['0', '1'],
    rounded=True,
    proportion=False,
    precision=2,
    filled=True,
)

# The return code of `dot` is not checked here.
call(['dot', '-Tpng', dot_path, '-o', png_path, '-Gdpi=600'])

# Display in jupyter notebook
Image(filename=png_path)

### Using sklearn AdaBoost's own `feature_importances_` metric

In [None]:
# Ensemble-level importances, with the spread across the individual
# estimators used as error bars.
importances = adbc.feature_importances_
per_estimator = [tree.feature_importances_ for tree in adbc.estimators_]
std = np.std(per_estimator, axis=0)
indices = np.argsort(importances)[::-1]

n_features = X_other.shape[1]
positions = range(n_features)

# Bar chart of the AdaBoost importances, most important feature first.
plt.figure()
plt.title("Feature importances")
plt.bar(positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(positions, X_other.columns[indices], rotation=90)
plt.xlim([-1, n_features])
plt.tight_layout()
plt.savefig('../figures/feature_imp_adaboost.png')
plt.show()

### Random Permutation Feature Importance

In [None]:
# Permutation test for the AdaBoost grid: shuffle one feature at a time in
# the test set and record the score obtained for each of `nr_runs` repetitions.
nr_runs = 10
scores = np.zeros([len(X_test.columns),nr_runs])

test_score = grid_adaboost.score(X_test,y_test)
print('test score = ',test_score)
# Fraction of test rows whose label equals 1.
print('test baseline = ',np.sum(y_test == 1)/len(y_test))
# loop through the features
for i in range(len(X_test.columns)):
    print('shuffling '+str(X_test.columns[i]))
    acc_scores = []
    for j in range(nr_runs):
        # Shuffle a single column of a fresh copy; the real test set is untouched.
        X_test_shuffled = X_test.copy()
        X_test_shuffled[X_test.columns[i]] = np.random.permutation(X_test[X_test.columns[i]].values)
        # Score with the AdaBoost grid so the shuffled scores match the
        # baseline `test_score` above (this previously scored the random forest).
        acc_scores.append(grid_adaboost.score(X_test_shuffled,y_test))
    print('   shuffled test score:',np.around(np.mean(acc_scores),3),'+/-',np.around(np.std(acc_scores),3))
    scores[i] = acc_scores

In [None]:
# Box plot of the shuffled-feature scores, one box per feature, ordered by
# decreasing mean score; the vertical line marks the unshuffled test score.
sorted_indcs = np.argsort(np.mean(scores,axis=1))[::-1]
plt.rcParams.update({'font.size': 14})
plt.figure(figsize=(8,6))
plt.boxplot(scores[sorted_indcs].T,labels=(X_test.columns)[sorted_indcs],vert=False)
plt.axvline(test_score,label='test score')
plt.title("Permutation Importances (test set)")
plt.xlabel('score with perturbed feature')
plt.legend()
plt.tight_layout()
# Single savefig call: the nested call passed the inner call's None return
# value to the outer one as the file name.
plt.savefig('../figures/feature_perm_imp_adaboost.png')
plt.show()

---

## Plotting the Confusion Matrices

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the random forest on the test set.
# I usually replaced the grid below with the appropriate grid name
# (grid_xgboost or grid_adaboost).
y_true = y_test
y_pred = grid_randomforest.predict(X_test)
cm_counts = confusion_matrix(y_true, y_pred)
print(cm_counts)

In [None]:
from sklearn.utils.multiclass import unique_labels

def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=True,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of `y_pred` against `y_true` as a heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed unchanged to `confusion_matrix`.
    classes : sequence
        Tick labels for the rows and columns, in the order of the matrix.
    normalize : bool, default True
        If True every row is divided by its total, so each cell holds the
        fraction of the true class; otherwise raw counts are shown.
    title : str, optional
        Figure title; a default depending on `normalize` is used when empty.
    cmap : colormap, default plt.cm.Blues
        Colormap handed to `imshow`.

    Returns
    -------
    The matplotlib axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    classes = np.array(classes)
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A class without any true sample has a zero row total. Leave that
        # row at 0 rather than dividing by zero, which would fill the row
        # with NaN and also turn the colour threshold below into NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so they stay readable.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Draw the confusion matrix (row-normalised by default) and save the figure.
cm_ax = plot_confusion_matrix(y_true, y_pred, classes=['not billboard', 'billboard'])
cm_ax.figure.tight_layout()
cm_ax.figure.savefig('../figures/randomforest_cm_normed.png')
plt.show()