In [44]:
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier

In [45]:
df = pd.read_csv('./data/cleaned_data.csv')
df.head()

Unnamed: 0,national_inv,lead_time,in_transit_qty,forecast_3_month,potential_issue,pieces_past_due,perf_6_month_avg,perf_12_month_avg,local_bo_qty,deck_risk,oe_constraint,ppap_risk,stop_auto_buy,rev_stop,went_on_backorder
0,-0.28,-0.107527,0.0,-0.146179,0.0,0.0,-114.678161,-125.306763,0.0,0.0,0.0,0.0,0.0,0.0,0
1,-0.2,1.0,0.0,-0.146179,0.0,0.0,0.252874,0.267874,0.0,0.0,0.0,0.0,0.0,0.0,0
2,-0.2,-0.107527,0.0,-0.146179,0.0,0.0,-114.678161,-125.306763,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,-0.146179,0.0,0.0,-0.770115,-0.812176,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.04,-0.107527,0.0,-0.146179,0.0,0.0,-114.678161,-125.306763,0.0,0.0,0.0,0.0,0.0,0.0,0


In [46]:
pca = PCA(n_components=0.95)
X = pca.fit_transform(df.drop(columns=['went_on_backorder'], axis=1))
X

array([[142.25396209],
       [-27.70818365],
       [142.25420357],
       ...,
       [-27.47369857],
       [-27.63104216],
       [132.47141621]])

In [47]:
pca.components_

array([[ 3.01855277e-03,  4.28408886e-03, -2.63592833e-03,
         2.76174030e-03, -0.00000000e+00, -0.00000000e+00,
        -6.76765495e-01, -7.34111407e-01,  5.50124802e-02,
        -0.00000000e+00, -0.00000000e+00, -0.00000000e+00,
        -7.02770117e-04,  0.00000000e+00]])

In [48]:
y = df['went_on_backorder']
y

0       0
1       0
2       0
3       0
4       0
       ..
1899    1
1900    1
1901    1
1902    1
1903    1
Name: went_on_backorder, Length: 1904, dtype: int64

In [49]:
# Model Training
models = {
            "Random Forest": RandomForestClassifier(),
            "Decision Tree": DecisionTreeClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(),
            "K-Neighbors Classifier": KNeighborsClassifier(),
            # "XGBClassifier": XGBClassifier(),
            "AdaBoost Classifier": AdaBoostClassifier(),
            }

In [50]:
result = {}

In [51]:
for i in list(models):
    model = models[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    result[i] = accuracy_score(y_pred, y_test)

In [52]:
result

{'Random Forest': 0.9660574412532638,
 'Decision Tree': 0.9530026109660574,
 'Gradient Boosting': 0.9660574412532638,
 'K-Neighbors Classifier': 0.9556135770234987,
 'AdaBoost Classifier': 0.9608355091383812}