In [128]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from scipy.stats import yeojohnson
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier
import shap
from scipy.stats import randint

In [129]:
parts_data = pd.read_csv('/Users/skylerwilson/Desktop/Lighthouse_Labs/Projects/final_project/data/Project_Data/final_parts_data.csv')

In [130]:
# Clean sales data function
def clean_sales_data(df, column_names):
    for col in column_names:  
        df[col] = np.abs(df[col])
    return df

# Clean quantity data function
def clean_quantity_data(df, quantity_col):
    return df[df[quantity_col] > 0]

# Clean turnover data function
def clean_negative_data(df, turnover_col):
    return df[df[turnover_col] >= 0]


def z_score(column, threshold=2):
    z_scores = (column - column.mean()) / column.std()
    return np.abs(z_scores) < threshold

sales_data = ['Sales Last Month', 'Sales Last 3 Months', 'Sales Last 6 Months', 'Sales Last 9 Months',
              'Sales Last 12 Months', 'Sales Last 2 Years', 'Sales Last 3 Years',
              'Sales Last 4 Years', 'Sales Last 5 Years', 'Sales Last 10 Years',
              'Months No Sale', 'Reorder Point', 'Sales - Jan', 'Sales - Feb',
              'Sales - Mar', 'Sales - Apr', 'Sales - May', 'Sales - Jun',
              'Sales - Jul', 'Sales - Aug', 'Sales - Sep', 'Sales - Oct',
              'Sales - Nov', 'Sales - Dec', 'Sales - 1st Qtr', 'Sales - 2nd Qtr',
              'Sales - 3rd Qtr', 'Sales - 4th Qtr', 'Sales - This Year','Sales - Last Year']
quantity_col = 'Quantity'
turnover_col = 'Turnover'

num_cols = parts_data.select_dtypes(include='number').columns

# Apply preprocessing steps
parts_data = clean_sales_data(parts_data, sales_data)
parts_data = clean_quantity_data(parts_data, quantity_col)
parts_data = clean_negative_data(parts_data, turnover_col)


# Apply z-score transformation to numerical columns
parts_data[num_cols] = parts_data[parts_data[num_cols].apply(z_score)][num_cols]

#deals with columns that dont have data yet so they end up as NaN when they shouldnt
parts_data[num_cols] = np.where(parts_data[num_cols].isna(), 0, parts_data[num_cols])

In [131]:
# Custom transformer classes
class YeoJohnsonTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        transformed_data = X.copy()
        for col in self.columns:
            transformed_data[col], _ = yeojohnson(X[col] + 0.01)  # Adding 0.01 to avoid zero values
        return transformed_data

# Load and preprocess the data
y = parts_data['Obsolete']
X = parts_data.select_dtypes(include='number').drop(columns=['Obsolete'])
constant_columns = X.columns[X.nunique() == 1]
X.drop(columns=constant_columns, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Define columns to transform
cols_to_transform = X.columns

# Column transformer for Yeo-Johnson transformation
yeo_johnson_cols = ColumnTransformer(
    transformers=[('yeo_johnson', YeoJohnsonTransformer(columns=cols_to_transform), cols_to_transform)],
    remainder='passthrough'
)

# Create FeatureUnion combining TruncatedSVD and SelectKBest
features = FeatureUnion([
    ('svd', TruncatedSVD(n_components=5)),
    ('select_k_best', SelectKBest(score_func=f_regression, k=7))
])

final_pipeline = Pipeline([
    ('transformer', yeo_johnson_cols), 
    ('features', features),
    ('scaler', RobustScaler()),
    ('classifier', XGBClassifier(
        colsample_bytree=0.6,
        gamma=1.0,
        learning_rate=0.3,
        max_depth=6,
        min_child_weight=3,
        n_estimators=242,
        reg_alpha=0.3,
        reg_lambda=0.1,
        subsample=0.7
    )) 
])

final_pipeline.fit(X_train, y_train)

X_train_transformed = final_pipeline.named_steps['transformer'].transform(X_train)
X_train_transformed = final_pipeline.named_steps['features'].transform(X_train_transformed)
X_train_transformed = final_pipeline.named_steps['scaler'].transform(X_train_transformed)

# Transform the testing data using the pipeline
X_test_transformed = final_pipeline.named_steps['transformer'].transform(X_test)
X_test_transformed = final_pipeline.named_steps['features'].transform(X_test_transformed)
X_test_transformed = final_pipeline.named_steps['scaler'].transform(X_test_transformed)

y_pred = final_pipeline.named_steps['classifier'].predict(X_test_transformed)
y_prob = final_pipeline.named_steps['classifier'].predict_proba(X_test_transformed)

In [132]:
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
report = classification_report(y_test, y_pred)
print('Classification Report:')
print(report)

Accuracy Score: 0.9966378482228626
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1550
         1.0       0.99      1.00      0.99       532

    accuracy                           1.00      2082
   macro avg       0.99      1.00      1.00      2082
weighted avg       1.00      1.00      1.00      2082



In [133]:
prediction_data = pd.DataFrame({
    'PartIndex': range(len(X_test)),
    'Obsolete Prediction': y_pred,
    'Obsolete Probability %': y_prob[:, 1]* 100
})

# Merge data using the PartIndex as the key
merged_data = prediction_data.merge(
    parts_data[['Part Number', 'Supplier Name', 'Description','Price', 'Quantity','Margin %', 'Total Cost','Turnover', 'EOQ', 'Customer LTV']],
    left_on='PartIndex',
    right_index=True,
    how='left'
)

# Drop rows with missing data
obsolete_parts_with_probabilities = merged_data.dropna(subset=['Supplier Name', 'Description', 'Obsolete Probability %'])

In [134]:
obsolete_parts_with_probabilities.drop(columns=['PartIndex'])

Unnamed: 0,Obsolete Prediction,Obsolete Probability %,Part Number,Supplier Name,Description,Price,Quantity,Margin %,Total Cost,Turnover,EOQ,Customer LTV
21,0,0.120475,004-153,moto international,bearing ntn 6203llu/2a 40x17x12,6.99,1.0,35.050072,4.54,0.000000,1.345848e-01,0.000000
27,0,0.122761,004HF113,thibault canada,hi-flo o-filt hon 15412-hm5-a1,4.99,2.0,35.070140,6.48,0.000000,6.103096e-04,0.000000
63,0,0.064274,0069922BC,thibault canada,new style universal cruise ctr,18.99,2.0,50.026330,18.98,0.222118,6.707854e-01,0.000000
64,0,0.027864,0069925B,thibault canada,"universal cruise control 1""",18.99,1.0,50.026330,9.49,0.399831,1.534614e+00,3332.813198
132,0,0.010503,01-0140,mc distributing,valve stem seal,5.99,8.0,46.243740,25.76,0.000000,1.456870e-23,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
2004,0,0.018418,04-195,moto international,bulb 12v-45/45w h4 p45t clear,5.99,5.0,56.761269,12.95,0.000000,8.723314e-03,0.000000
2009,0,0.096305,04-2157,moto international,bay15d 12v 32-4cp 1157 white led,14.99,2.0,0.000000,29.88,0.000000,2.299728e-03,3351.242434
2036,0,0.108083,04-4724,moto international,bulb 12v 21/6cp 1176 ba15d,1.99,10.0,51.758794,9.60,0.000000,1.283121e-13,0.000000
2060,0,0.056998,04-614,moto international,"bulb 12v-30w 4.5"" 4416 light",17.99,1.0,36.964981,11.34,0.000000,7.226540e-05,3333.628989


In [139]:
obsolete_parts_with_probabilities.sort_values(by=['Turnover'], ascending=False)

Unnamed: 0,PartIndex,Obsolete Prediction,Obsolete Probability %,Part Number,Supplier Name,Description,Price,Quantity,Margin %,Total Cost,Turnover,EOQ,Customer LTV
1860,1860,0,0.089063,0350-0210,parts canada/power twins,tube me-g18 150 to 180-18 tr-4,64.99,1.0,35.036159,42.220,0.490360,0.687190,3363.620473
64,64,0,0.027864,0069925B,thibault canada,"universal cruise control 1""",18.99,1.0,50.026330,9.490,0.399831,1.534614,3332.813198
1072,1072,0,0.078274,0302-1258,parts canada/power twins,150/80r16 71v av92 cobra chrom,552.99,1.0,51.252283,269.570,0.391945,0.901376,3579.848573
1625,1625,0,0.058703,0316-0359,parts canada/power twins,100/90-19 57v tl/tt anakee adv,350.99,1.0,51.255022,171.090,0.391927,1.758260,3486.500570
1488,1488,0,0.061184,0306-0784,parts canada/power twins,140/80-17 69v battlax bt46 rea,305.99,1.0,53.964182,140.865,0.374220,0.497024,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,1638,0,0.046966,0316-0445,parts canada/power twins,90/90-21 54h karoo street fron,217.99,1.0,35.010780,141.670,0.000000,0.286656,0.000000
27,27,0,0.122761,004HF113,thibault canada,hi-flo o-filt hon 15412-hm5-a1,4.99,2.0,35.070140,6.480,0.000000,0.000610,0.000000
1762,1762,0,0.507226,0317-0487,parts canada/power twins,150/70b18 70q adventurecross a,399.99,1.0,51.363784,194.540,0.000000,0.103579,3507.688404
1770,1770,1,99.973946,0317-0503,parts canada/power twins,130/90-16 73h k761 dual sport,234.99,2.0,59.347206,191.060,0.000000,0.230927,0.000000
