In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv("notebook/cleaned_data.csv")

In [3]:
# Split the data into features and target variable
X = df.drop('went_on_backorder', axis=1)
y = df['went_on_backorder']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features= X.select_dtypes('object').columns
numerical_features = X.select_dtypes(['float64', 'int64']).columns



In [4]:
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', MinMaxScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ('ohe', OneHotEncoder())
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, numerical_features),
    ('cat_pipeline', cat_pipeline, categorical_features)
])


In [5]:
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [6]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_train_scaled = encoder.fit_transform(y_train)
y_test_scaled = encoder.transform(y_test)

In [7]:
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train_scaled)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test_scaled, y_test_scaled)



In [8]:
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_resampled)
X_test_pca = pca.transform(X_test_resampled)

In [9]:
lr = LogisticRegression()

lr.fit(X_train_pca, y_train_resampled)
y_pred = lr.predict(X_test_pca)

print("logistic regression model")
print("Classification Report:")
print(classification_report(y_test_resampled, y_pred))

logistic regression model
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.69      0.73    201985
           1       0.72      0.80      0.76    201985

    accuracy                           0.75    403970
   macro avg       0.75      0.75      0.74    403970
weighted avg       0.75      0.75      0.74    403970



In [10]:
dt = DecisionTreeClassifier()

dt.fit(X_train_pca, y_train_resampled)
y_pred = dt.predict(X_test_pca)

print("decision tree model")
print("Classification Report:")
print(classification_report(y_test_resampled, y_pred))

decision tree model
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.91      0.80    201985
           1       0.88      0.64      0.74    201985

    accuracy                           0.78    403970
   macro avg       0.80      0.78      0.77    403970
weighted avg       0.80      0.78      0.77    403970

