# Gradient Boosting Model
In diesem Notebook beschreiben wir das Gradient Boosting Verfahren.

## Load Packages

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
X_train = pd.read_csv('Xtrain_tvshows_featured.csv')
X_test = pd.read_csv('Xtest_tvshows_featured.csv')
y_train = pd.read_csv('ytrain_tvshows.csv')
y_test = pd.read_csv('ytest_tvshows.csv')
print("Shape of X Train: {}".format(X_train.shape))
print("Shape of X Test: {}".format(X_test.shape))
print("Shape of y Train: {}".format(y_train.shape))
print("Shape of y Test: {}".format(y_test.shape))

Shape of X Train: (4294, 7)
Shape of X Test: (1074, 7)
Shape of y Train: (4294, 1)
Shape of y Test: (1074, 1)


In [7]:
X_train.head()

Unnamed: 0,Year,IMDb,Rotten_Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,0.051282,0.6,0.311111,0,0,1,0
1,0.282051,0.694118,0.7,0,1,1,0
2,0.042735,0.717647,0.677778,1,0,0,0
3,0.034188,0.670588,0.0,0,0,1,0
4,0.0,0.576471,0.466667,0,1,0,0


## Model Building

In [8]:
model = GradientBoostingClassifier()

Die Grundeinstellungsparameter sind:
- learning rate: 0.1
- n estimators: 100
- subsample: 1
- min samples split: 2
- min samples leaf: 1
- max depth: 3
- min impurity decrease: 0
- random state: None
- max features: None
- max learf nodes: None


## Cross Validation

In [9]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy: 0.814 (0.018)


### Plot accuracy

In [10]:
fig = px.scatter(x = range(1,len(n_scores)+1), y = n_scores)
fig.show()

## Fit the Model

In [11]:
# fit the model on the whole dataset
model = GradientBoostingClassifier()
model.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



## Prediction

In [12]:
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

## Evaluation

### Confusion Matrix

In [13]:
print("Confusion matrix of the training set: {}".format(metrics.confusion_matrix(y_train,y_pred_train)))
print("Confusion matrix of the test set: {}".format(metrics.confusion_matrix(y_test,y_pred_test)))

Confusion matrix of the training set: [[2303  270]
 [ 451 1270]]
Confusion matrix of the test set: [[573  95]
 [109 297]]


### Accuracy Score

In [14]:
print("Accuracy Score for the training set: {}".format(metrics.accuracy_score(y_train, y_pred_train)))
print("Accuracy Score for the test set: {}".format(metrics.accuracy_score(y_test, y_pred_test)))

Accuracy Score for the training set: 0.8320912901723335
Accuracy Score for the test set: 0.8100558659217877


### Recall and Precision

In [15]:
print("Precision score for the training set: {}".format(metrics.precision_score(y_train,y_pred_train)))
print("Precision score for the test set: {}".format(metrics.precision_score(y_test, y_pred_test)))

Precision score for the training set: 0.8246753246753247
Precision score for the test set: 0.7576530612244898


In [16]:
print("Recall score for the training set: {}".format(metrics.recall_score(y_train,y_pred_train)))
print("Recall score for the test set: {}".format(metrics.recall_score(y_test,y_pred_test)))

Recall score for the training set: 0.7379430563625798
Recall score for the test set: 0.7315270935960592


### F1-score

In [17]:
print("F1 score from training set: {}".format(metrics.f1_score(y_train, y_pred_train)))
print("F1 score from test set: {}".format(metrics.f1_score(y_test,y_pred_test)))

F1 score from training set: 0.7789021772462434
F1 score from test set: 0.7443609022556391


### AUC Score

In [18]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_pred_train)
print("AUC score for the training set: {}".format(metrics.auc(fpr, tpr)))

AUC score for the training set: 0.8165035919201162


In [19]:
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred_test)
print("AUC score for the test set: {}".format(metrics.auc(fpr,tpr)))

AUC score for the test set: 0.7946557623668918


## Variable Importance

In [20]:
feat_imp = pd.Series(model.feature_importances_).sort_values(ascending=False)
sorted_idx = np.argsort(feat_imp)
feat_imp_df = pd.DataFrame({'vars': X_train.columns[sorted_idx], 'feat_imp': model.feature_importances_})
feat_imp_df.head()

Unnamed: 0,vars,feat_imp
0,Disney+,0.060599
1,Prime Video,0.046406
2,Hulu,0.860368
3,Netflix,0.002642
4,Rotten_Tomatoes,0.015196


In [21]:
fig = px.bar(feat_imp_df.iloc[:10,], x= 'feat_imp', y='vars')
fig.update_yaxes(title_text='Variables')
fig.update_xaxes(title_text='Feature Importance')
fig.update_layout(yaxis = {'categoryorder':'total ascending'})
fig.show()