In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from xgboost import XGBClassifier

In [4]:
df = pd.read_csv("data\predictive_maintenance.csv")
df.head()

  df = pd.read_csv("data\predictive_maintenance.csv")


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,1,M14860,M,298.1,308.6,1551,42.8,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0


In [None]:
df.drop(['UDI', 'Product ID'], axis = 1, inplace= True)
df.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


In [7]:
X = df.drop('Target', axis= 1)
y = df['Target']

In [9]:
X.shape, y.shape

((10000, 6), (10000,))

In [19]:
cal_cols = X.select_dtypes(object).columns
num_cols = X.select_dtypes('number').columns

In [20]:
scaler = MinMaxScaler()
ohe = OneHotEncoder()

preprocessor = ColumnTransformer(

[ 
       ("OneHotEncoder", ohe, cal_cols),
       ("MinMaxScaler", scaler, num_cols)
]
)

In [21]:
X = preprocessor.fit_transform(X)

In [24]:
X

array([[0.        , 0.        , 1.        , ..., 0.22293364, 0.53571429,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.13969732, 0.58379121,
        0.01185771],
       [0.        , 1.        , 0.        , ..., 0.19208382, 0.62637363,
        0.01976285],
       ...,
       [0.        , 0.        , 1.        , ..., 0.27764843, 0.40659341,
        0.08695652],
       [1.        , 0.        , 0.        , ..., 0.13969732, 0.61401099,
        0.09881423],
       [0.        , 0.        , 1.        , ..., 0.19324796, 0.5       ,
        0.11857708]], shape=(10000, 8))

In [25]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size= 0.3, random_state= 1, stratify= y)

In [29]:
lreg = LogisticRegression()
dtc  = DecisionTreeClassifier()
rfc  = RandomForestClassifier()

Base performance test

In [34]:
for i in [lreg, dtc, rfc]:
    i.fit(x_train, y_train)
    pred = i.predict(x_test)

    
    print("***********************")
    print(i)
    print(classification_report(y_test, pred))
    print(f1_score(y_test, pred))

***********************
LogisticRegression()
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2898
           1       1.00      0.02      0.04       102

    accuracy                           0.97      3000
   macro avg       0.98      0.51      0.51      3000
weighted avg       0.97      0.97      0.95      3000

0.038461538461538464
***********************
DecisionTreeClassifier()
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2898
           1       0.61      0.64      0.62       102

    accuracy                           0.97      3000
   macro avg       0.80      0.81      0.81      3000
weighted avg       0.97      0.97      0.97      3000

0.625
***********************
RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2898
           1       0.82      0.44      0.57       102

    accuracy

In [60]:
param_dist = {
    'n_estimators': [250, 300, 350],
    'max_depth': [15, 20, 25],
    'min_samples_split': [5, 7, 10],
    'class_weight': [ {0:1,1:3},{0:1,1:4}]
}

random_search = RandomizedSearchCV(
    estimator=rfc, 
    param_distributions=param_dist, 
    n_iter=20,  # Number of random combinations
    cv=5,  # 5-fold cross-validation
    scoring='f1', 
    random_state=42,
    n_jobs=-1,
    return_train_score= True
)


# Fit to data
random_search.fit(x_train, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'n_estimators': 350, 'min_samples_split': 10, 'max_depth': 25, 'class_weight': {0: 1, 1: 3}}


In [61]:
pd.set_option('display.max_colwidth', None)
resutls_rfc = pd.DataFrame(random_search.cv_results_)
resutls_rfc[['params','mean_train_score','mean_test_score','rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score
12,"{'n_estimators': 350, 'min_samples_split': 10, 'max_depth': 25, 'class_weight': {0: 1, 1: 3}}",0.928043,0.684508,1
13,"{'n_estimators': 250, 'min_samples_split': 10, 'max_depth': 15, 'class_weight': {0: 1, 1: 3}}",0.92594,0.67973,2
3,"{'n_estimators': 250, 'min_samples_split': 7, 'max_depth': 20, 'class_weight': {0: 1, 1: 3}}",0.953122,0.675719,3
15,"{'n_estimators': 300, 'min_samples_split': 7, 'max_depth': 15, 'class_weight': {0: 1, 1: 3}}",0.954606,0.675375,4
6,"{'n_estimators': 350, 'min_samples_split': 10, 'max_depth': 20, 'class_weight': {0: 1, 1: 3}}",0.926365,0.674659,5
1,"{'n_estimators': 300, 'min_samples_split': 7, 'max_depth': 25, 'class_weight': {0: 1, 1: 4}}",0.962627,0.674096,6
11,"{'n_estimators': 350, 'min_samples_split': 10, 'max_depth': 15, 'class_weight': {0: 1, 1: 3}}",0.930304,0.672675,7
9,"{'n_estimators': 350, 'min_samples_split': 7, 'max_depth': 15, 'class_weight': {0: 1, 1: 4}}",0.96103,0.671469,8
17,"{'n_estimators': 250, 'min_samples_split': 10, 'max_depth': 25, 'class_weight': {0: 1, 1: 3}}",0.929884,0.669151,9
7,"{'n_estimators': 300, 'min_samples_split': 10, 'max_depth': 25, 'class_weight': {0: 1, 1: 4}}",0.937814,0.668985,10


In [71]:
param_dist = {
    'splitter': ['best', 'random'],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'max_features': [ 'log2', None],
}


# Perform Randomized Search
random_search = RandomizedSearchCV(
    estimator=dtc,
    param_distributions=param_dist,
    n_iter=20,  # Number of random combinations
    cv=5,  # 5-fold cross-validation
    scoring='f1',
    random_state=42,
    n_jobs=-1,
    return_train_score= True
)

random_search.fit(x_train, y_train)


In [72]:
pd.set_option('display.max_colwidth', None)
resutls_dtc = pd.DataFrame(random_search.cv_results_)
resutls_dtc[['params','mean_train_score','mean_test_score','rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score
10,"{'splitter': 'best', 'min_samples_split': 5, 'max_features': None, 'max_depth': 10}",0.918964,0.694234,1
12,"{'splitter': 'best', 'min_samples_split': 2, 'max_features': None, 'max_depth': 10}",0.938348,0.680403,2
2,"{'splitter': 'best', 'min_samples_split': 5, 'max_features': 'log2', 'max_depth': 30}",0.928053,0.631657,3
4,"{'splitter': 'best', 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 30}",1.0,0.631219,4
3,"{'splitter': 'random', 'min_samples_split': 2, 'max_features': None, 'max_depth': 50}",1.0,0.607701,5
17,"{'splitter': 'random', 'min_samples_split': 5, 'max_features': None, 'max_depth': 50}",0.875647,0.6068,6
8,"{'splitter': 'best', 'min_samples_split': 10, 'max_features': 'log2', 'max_depth': 10}",0.794492,0.595334,7
6,"{'splitter': 'best', 'min_samples_split': 2, 'max_features': 'log2', 'max_depth': 20}",1.0,0.591788,8
1,"{'splitter': 'best', 'min_samples_split': 10, 'max_features': 'log2', 'max_depth': 50}",0.858917,0.582193,9
14,"{'splitter': 'random', 'min_samples_split': 5, 'max_features': None, 'max_depth': 30}",0.862558,0.573882,10


Conclusion -> Sampling of data required due to heavy imbalance

In [73]:
from imblearn.over_sampling import SMOTE

sampler = SMOTE()

x_sampled, y_sampled = sampler.fit_resample(X,y)



In [74]:
y_sampled.value_counts()

Target
0    9661
1    9661
Name: count, dtype: int64

In [None]:
x_train_sampled, x_test_sampled, y_train_sampled, y_test_sampled = train_test_split(x_sampled, y_sampled, test_size = 0.3, random_state= 1, )

In [76]:
for i in [lreg, dtc, rfc]:
    i.fit(x_train_sampled, y_train_sampled)
    pred = i.predict(x_test_sampled)

    
    print("***********************")
    print(i)
    print(classification_report(y_test_sampled, pred))
    print(f1_score(y_test_sampled, pred))

***********************
LogisticRegression()
              precision    recall  f1-score   support

           0       0.83      0.82      0.82      2421
           1       0.82      0.83      0.82      2410

    accuracy                           0.82      4831
   macro avg       0.82      0.82      0.82      4831
weighted avg       0.82      0.82      0.82      4831

0.8236024844720496
***********************
DecisionTreeClassifier()
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2421
           1       0.96      0.97      0.96      2410

    accuracy                           0.96      4831
   macro avg       0.96      0.96      0.96      4831
weighted avg       0.96      0.96      0.96      4831

0.9621822690638562
***********************
RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2421
           1       0.97      0.98      0.98      2410

 

In [101]:
param_dist = {
    'criterion' : [ 'entropy', 'log_loss'],
    'n_estimators': [100, 200, 250, 500],
    'max_depth': [15, 17, 20],
    'min_samples_split': [2, 3],
}

random_search = GridSearchCV(
    estimator=rfc, 
    param_grid =param_dist,   # Number of random combinations
    cv=5,  # 5-fold cross-validation
    scoring='f1', 
    n_jobs=-1,
    return_train_score= True
)


# Fit to data
random_search.fit(x_sampled, y_sampled)

# Best parameters
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 17, 'min_samples_split': 2, 'n_estimators': 100}


In [103]:
pd.set_option('display.max_colwidth', None)
resutls_rfc = pd.DataFrame(random_search.cv_results_)
resutls_rfc[['params','mean_train_score','mean_test_score','rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score
8,"{'criterion': 'entropy', 'max_depth': 17, 'min_samples_split': 2, 'n_estimators': 100}",0.99978,0.938847,1
40,"{'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}",0.999987,0.93849,2
21,"{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 200}",1.0,0.937587,3
18,"{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 250}",1.0,0.936654,4
44,"{'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 100}",0.999922,0.936151,5
23,"{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}",1.0,0.935822,6
41,"{'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}",1.0,0.935561,7
46,"{'criterion': 'log_loss', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 250}",1.0,0.93537,8
12,"{'criterion': 'entropy', 'max_depth': 17, 'min_samples_split': 3, 'n_estimators': 100}",0.999664,0.935295,9
17,"{'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 200}",1.0,0.93529,10


In [99]:
xgb = XGBClassifier(verbosity = 0)


param_xgb = {'eta':[0.3,0.1,0.2,0.01],
             'max_depth':[3],
             'n_estimators':[100,250,500],
             'min_child_weight':[17,18],
             'reg_lamda':[0,0.5,1,2,3],
             'alpha':[0,0.5,1,2,3]
}


search_xgb = GridSearchCV(estimator = xgb,
                          param_grid = param_xgb,
                          scoring = 'f1',
                         return_train_score=True,
                          cv = 5)


search_xgb.fit(x_sampled, y_sampled)

In [100]:
pd.set_option('display.max_colwidth', None)
resutls_xgb = pd.DataFrame(search_xgb.cv_results_)
resutls_xgb[['params','mean_train_score','mean_test_score','rank_test_score']].sort_values(by = 'rank_test_score')

Unnamed: 0,params,mean_train_score,mean_test_score,rank_test_score
494,"{'alpha': 3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 500, 'reg_lamda': 3}",0.986136,0.926286,1
492,"{'alpha': 3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 500, 'reg_lamda': 1}",0.986136,0.926286,1
490,"{'alpha': 3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 500, 'reg_lamda': 0}",0.986136,0.926286,1
491,"{'alpha': 3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 500, 'reg_lamda': 0.5}",0.986136,0.926286,1
493,"{'alpha': 3, 'eta': 0.3, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 500, 'reg_lamda': 2}",0.986136,0.926286,1
...,...,...,...,...
544,"{'alpha': 3, 'eta': 0.2, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 100, 'reg_lamda': 3}",0.957055,0.898442,596
540,"{'alpha': 3, 'eta': 0.2, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 100, 'reg_lamda': 0}",0.957055,0.898442,596
541,"{'alpha': 3, 'eta': 0.2, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 100, 'reg_lamda': 0.5}",0.957055,0.898442,596
543,"{'alpha': 3, 'eta': 0.2, 'max_depth': 3, 'min_child_weight': 17, 'n_estimators': 100, 'reg_lamda': 2}",0.957055,0.898442,596
