## Data preparation for classification models

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder , label_binarize
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, mean_squared_error, r2_score



In [2]:
df = pd.read_csv('Steel_industry_cleaned_data.csv')
df

Unnamed: 0.1,Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,0,2018-01-01 00:15:00,3.17,2.95,0.00,0.0,73.21,900,Weekday,Monday,Light_Load
1,1,2018-01-01 00:30:00,4.00,4.46,0.00,0.0,66.77,1800,Weekday,Monday,Light_Load
2,2,2018-01-01 00:45:00,3.24,3.28,0.00,0.0,70.28,2700,Weekday,Monday,Light_Load
3,3,2018-01-01 01:00:00,3.31,3.56,0.00,0.0,68.09,3600,Weekday,Monday,Light_Load
4,4,2018-01-01 01:15:00,3.82,4.50,0.00,0.0,64.72,4500,Weekday,Monday,Light_Load
...,...,...,...,...,...,...,...,...,...,...,...
35035,35035,2018-12-31 23:00:00,3.85,4.86,0.00,0.0,62.10,82800,Weekday,Monday,Light_Load
35036,35036,2018-12-31 23:15:00,3.74,3.74,0.00,0.0,70.71,83700,Weekday,Monday,Light_Load
35037,35037,2018-12-31 23:30:00,3.78,3.17,0.07,0.0,76.62,84600,Weekday,Monday,Light_Load
35038,35038,2018-12-31 23:45:00,3.78,3.06,0.11,0.0,77.72,85500,Weekday,Monday,Light_Load


In [3]:
## will drop weekstatus as it seems there is a big difference between Saturday and Sunday also

df = df.drop(columns = ['Unnamed: 0' ,'WeekStatus' ])
df

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,Day_of_week,Load_Type
0,2018-01-01 00:15:00,3.17,2.95,0.00,0.0,73.21,900,Monday,Light_Load
1,2018-01-01 00:30:00,4.00,4.46,0.00,0.0,66.77,1800,Monday,Light_Load
2,2018-01-01 00:45:00,3.24,3.28,0.00,0.0,70.28,2700,Monday,Light_Load
3,2018-01-01 01:00:00,3.31,3.56,0.00,0.0,68.09,3600,Monday,Light_Load
4,2018-01-01 01:15:00,3.82,4.50,0.00,0.0,64.72,4500,Monday,Light_Load
...,...,...,...,...,...,...,...,...,...
35035,2018-12-31 23:00:00,3.85,4.86,0.00,0.0,62.10,82800,Monday,Light_Load
35036,2018-12-31 23:15:00,3.74,3.74,0.00,0.0,70.71,83700,Monday,Light_Load
35037,2018-12-31 23:30:00,3.78,3.17,0.07,0.0,76.62,84600,Monday,Light_Load
35038,2018-12-31 23:45:00,3.78,3.06,0.11,0.0,77.72,85500,Monday,Light_Load


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35040 entries, 0 to 35039
Data columns (total 9 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  35040 non-null  object 
 1   Usage_kWh                             35040 non-null  float64
 2   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64
 3   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64
 4   CO2(tCO2)                             35040 non-null  float64
 5   Lagging_Current_Power_Factor          35040 non-null  float64
 6   NSM                                   35040 non-null  int64  
 7   Day_of_week                           35040 non-null  object 
 8   Load_Type                             35040 non-null  object 
dtypes: float64(5), int64(1), object(3)
memory usage: 2.4+ MB


In [5]:
X = df.drop(columns = ['date' ,'Load_Type'])
y = df['Load_Type']

X

Unnamed: 0,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,NSM,Day_of_week
0,3.17,2.95,0.00,0.0,73.21,900,Monday
1,4.00,4.46,0.00,0.0,66.77,1800,Monday
2,3.24,3.28,0.00,0.0,70.28,2700,Monday
3,3.31,3.56,0.00,0.0,68.09,3600,Monday
4,3.82,4.50,0.00,0.0,64.72,4500,Monday
...,...,...,...,...,...,...,...
35035,3.85,4.86,0.00,0.0,62.10,82800,Monday
35036,3.74,3.74,0.00,0.0,70.71,83700,Monday
35037,3.78,3.17,0.07,0.0,76.62,84600,Monday
35038,3.78,3.06,0.11,0.0,77.72,85500,Monday


In [6]:
y.value_counts()

Load_Type
Light_Load      18072
Medium_Load      9696
Maximum_Load     7272
Name: count, dtype: int64

In [7]:
## encode categorical data , 'Load_Type' will be the column to be classified

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [6])], remainder='passthrough')

X = np.array(ct.fit_transform(X))



In [9]:
pd.Series(y).value_counts()

0    18072
2     9696
1     7272
Name: count, dtype: int64

In [8]:
## split the dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train

array([[0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 4.887e+01,
        8.190e+04],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 4.000e-02, 9.090e+01,
        7.200e+04],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 7.505e+01,
        8.460e+04],
       ...,
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 2.000e-02, 8.602e+01,
        6.030e+04],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 7.071e+01,
        0.000e+00],
       [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 5.050e+01,
        2.070e+04]])

## Classification model training with grid search to find best model and params

### Most accurate way but needs a lot of computing power

In [None]:
models = [
    ("Logistic Regression", LogisticRegression(max_iter=1000), {
        'C': [0.01, 0.1, 1, 10],   # C is regularization strength
        'solver': ['liblinear', 'saga'],
        'penalty': ['l2', 'l1']
    }),
    ("KNN", KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7, 10],
        'metric': ['euclidean', 'manhattan']
    }),
    ("Decision Tree", DecisionTreeClassifier(), {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],  #The minimum number of samples required to split an internal node.
        'min_samples_leaf': [1, 2, 4]  #The minimum number of samples required to be at a leaf node
    }),
    ("Naive Bayes", GaussianNB(), {}),
    ("Random Forest", RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]  #If True, each tree is trained on a random subset of the data with replacement
    }),
    ("Random Forest with Extra Trees", ExtraTreesClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]    
    }),
    ("AdaBoost", AdaBoostClassifier(), {
        'n_estimators': [50, 100, 150],  
        'learning_rate': [0.01, 0.1, 1]  #Controls the contribution of each base model to the final prediction.
    }),
    ("XGBoost", XGBClassifier(), {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10]
    }),
    ("SVM", SVC(), {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']  #Kernel coefficient
    })
]

results = []

for name, model, param_grid in models:
    grid_search = GridSearchCV(estimator= model , param_grid= param_grid , cv = 10 , n_jobs=-1, verbose=1)
    grid_search.fit(X_train , y_train)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test , y_pred)
    report = classification_report(y_test, y_pred, output_dict= True)

    results.append({
        'Model': name,
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': report['macro avg']['precision'],
        'Recall': report['macro avg']['recall'],
        'F1 Score': report['macro avg']['f1-score']
    })

results_df = pd.DataFrame(results)

results

### Compare models with not so many parameters, than fine tune the best one with Grid Search

In [9]:
models_initial = [
    ("SVM", SVC(kernel='rbf')),
    ("Logistic Regression", LogisticRegression(max_iter=1000)),
    ("KNN", KNeighborsClassifier(n_neighbors=5)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=10)),
    ("Naive Bayes", GaussianNB()),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("Random Forest with Extra Trees", ExtraTreesClassifier(n_estimators=100)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100)),
    ("XGBoost", XGBClassifier(n_estimators=100))
    
]

results = []


for name, model in models_initial:
    
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': report['macro avg']['precision'],
        'Recall': report['macro avg']['recall'],
        'F1 Score': report['macro avg']['f1-score']
    })


df = pd.DataFrame(results)


df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,SVM,0.735445,0.697313,0.716853,0.700984
1,Logistic Regression,0.742865,0.689791,0.690657,0.689953
2,KNN,0.876712,0.844757,0.845652,0.845161
3,Decision Tree,0.893408,0.863415,0.86822,0.865424
4,Naive Bayes,0.706478,0.663639,0.688193,0.664682
5,Random Forest,0.909532,0.883058,0.884844,0.883931
6,Random Forest with Extra Trees,0.896404,0.867433,0.869453,0.868388
7,AdaBoost,0.856735,0.824884,0.830376,0.81652
8,XGBoost,0.906821,0.878107,0.882941,0.880164


In [12]:
df = df.sort_values(by = ['Accuracy'] , ascending = False)

fig = px.bar(df, x='Model', y='Accuracy', 
             labels={'Model': 'Classifier', 'Accuracy': 'Accuracy'},
             title='Classifier Accuracies')

fig.show()

## Use grid search for Random Forrest and XGBoost as they were very close to each other

In [None]:
models = [
    ("Random Forest", RandomForestClassifier(), {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'bootstrap': [True, False]
    }),
    
    ("XGBoost", XGBClassifier(), {
        'learning_rate': [0.01, 0.1, 0.3],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 6, 10]
    })
]

results = []

for name, model, param_grid in models:
    grid_search = GridSearchCV(estimator= model , param_grid= param_grid , cv = 10 , n_jobs=-1, verbose=1)
    grid_search.fit(X_train , y_train)

    best_model = grid_search.best_estimator_

    y_pred = best_model.predict(X_test)
    
    accuracy = accuracy_score(y_test , y_pred)
    report = classification_report(y_test, y_pred, output_dict= True)

    results.append({
        'Model': name,
        'Best Params': grid_search.best_params_,
        'Accuracy': accuracy,
        'Precision': report['macro avg']['precision'],
        'Recall': report['macro avg']['recall'],
        'F1 Score': report['macro avg']['f1-score']
    })

    print(f'{model} is trained.')

results_df = pd.DataFrame(results)

results_df


Fitting 10 folds for each of 72 candidates, totalling 720 fits
RandomForestClassifier() is trained.
Fitting 10 folds for each of 27 candidates, totalling 270 fits
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...) is trained.


[{'Model': 'Random Forest',
  'Best Params': {'bootstrap': True,
   'max_depth': 20,
   'min_samples_leaf': 1,
   'min_samples_split': 5,
   'n_estimators': 200},
  'Accuracy': 0.9093892694063926,
  'Precision': 0.8824396682885413,
  'Recall': 0.8849838083270228,
  'F1 Score': 0.8836045472087005},
 {'Model': 'XGBoost',
  'Best Params': {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 100},
  'Accuracy': 0.9099600456621004,
  'Precision': 0.8829516795771811,
  'Recall': 0.8875482411971721,
  'F1 Score': 0.8848903448468105}]

In [21]:
results_df = results_df.sort_values(by = ['Accuracy'] , ascending = False)

fig = px.bar(results_df, x='Model', y='Accuracy', 
             labels={'Model': 'Classifier', 'Accuracy': 'Accuracy'},
             title='Classifier Accuracies')

fig.show()

In [22]:
results_df

Unnamed: 0,Model,Best Params,Accuracy,Precision,Recall,F1 Score
1,XGBoost,"{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",0.90996,0.882952,0.887548,0.88489
0,Random Forest,"{'bootstrap': True, 'max_depth': 20, 'min_samp...",0.909389,0.88244,0.884984,0.883605


## Create best model and test it

In [None]:
classifier = best_model

In [None]:

y_pred = best_model.predict(X_test)

y_test_binarized = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test_binarized.shape[1]

y_pred_reshaped = y_pred.reshape(-1, 1)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

roc_auc = roc_auc_score(y_test_binarized, y_pred_reshaped)
print("ROC AUC Score:", roc_auc)




Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      3615
           1       0.81      0.86      0.84      1454
           2       0.86      0.83      0.84      1939

    accuracy                           0.91      7008
   macro avg       0.88      0.89      0.88      7008
weighted avg       0.91      0.91      0.91      7008

Confusion Matrix:
 [[3521   17   77]
 [  12 1254  188]
 [  63  274 1602]]
ROC AUC Score: 0.027609238612243313
Mean Squared Error: 0.1499714611872146
R2 Score: 0.7960480990797447
