# Featurisation & Model Tuning

In [1]:
# Libraries for reading and manipulating data
import numpy as np
import pandas as pd

# Libraries for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for model management
import pickle
from sklearn.pipeline import Pipeline

# Libraries for data-preprocessing, model building and evaluation
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (KFold, 
                                     LeaveOneOut, 
                                     StratifiedKFold, 
                                     cross_val_score, 
                                     train_test_split, 
                                     GridSearchCV, 
                                     RandomizedSearchCV)

# Classifier models
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier 

# Library to over-sample a given data
from imblearn.over_sampling import SMOTE

import warnings
# Silences every warning raised for the rest of the notebook, whatever its category
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output
%matplotlib inline
# Show floats in pandas output with three decimal places
pd.set_option('display.float_format', lambda x: '%.3f' % x)

In [2]:
def display_null_count(df):
    """
    Print the null-entry count and null-entry percentage of every column in `df`.

    One banner-delimited section is printed per column; nothing is returned.
    """
    banner = '-'*50
    row_count = df.shape[0]
    for attr in df.columns:
        null_count = df[attr].isnull().sum()
        print(banner)
        print(f'{attr}')
        print(banner)
        print('Null entry count:', null_count)
        print('Null entry proportion:', round(100 * null_count / row_count, 2), '%\n')

def get_classification_report(actuals, predictions, labels):
    """
    Print the classification report and draw the confusion matrix as a heatmap.

    actuals: true class labels.
    predictions: predicted class labels, aligned with `actuals`.
    labels: class labels to report on; also used as the row and column labels of the matrix.
    """
    print('Classification report')
    print('`'*50)
    report = metrics.classification_report(actuals, predictions, labels=labels)
    print(report)

    # Label the raw count matrix with the class labels on both axes
    cm_df = pd.DataFrame(
        metrics.confusion_matrix(actuals, predictions, labels=labels),
        index=labels,
        columns=labels,
    )
    plt.figure(figsize=(3, 2))
    sns.heatmap(cm_df, annot=True, fmt='g')
    plt.title('Confusion matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual');

# Understanding the Data

In [4]:
# Reading the signal data from './semi.csv' and previewing the first rows
signal_df = pd.read_csv('./semi.csv')
signal_df.head()

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.733,1411.127,1.36,100.0,97.613,0.124,1.5,...,,0.5,0.012,0.004,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.422,1463.661,0.829,100.0,102.343,0.125,1.497,...,208.204,0.502,0.022,0.005,4.445,0.01,0.02,0.006,208.204,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.411,1698.017,1.51,100.0,95.488,0.124,1.444,...,82.86,0.496,0.016,0.004,3.175,0.058,0.048,0.015,82.86,1
3,2008-07-19 14:43:00,2988.72,2479.9,2199.033,909.793,1.32,100.0,104.237,0.122,1.488,...,73.843,0.499,0.01,0.003,2.054,0.02,0.015,0.004,73.843,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.367,1326.52,1.533,100.0,100.397,0.123,1.503,...,,0.48,0.477,0.104,99.303,0.02,0.015,0.004,73.843,-1


In [5]:
# Dataset info: row count, column count, dtypes and memory usage
signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 592 entries, Time to Pass/Fail
dtypes: float64(590), int64(1), object(1)
memory usage: 7.1+ MB


In [6]:
# Summary statistics (count, mean, std and the five-number summary), one row per numeric attribute
signal_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,1561.000,3014.453,73.622,2743.240,2966.260,3011.490,3056.650,3356.350
1,1560.000,2495.850,80.408,2158.750,2452.248,2499.405,2538.823,2846.440
2,1553.000,2200.547,29.513,2060.660,2181.044,2201.067,2218.055,2315.267
3,1553.000,1396.377,441.692,0.000,1081.876,1285.214,1591.224,3715.042
4,1553.000,4.197,56.356,0.681,1.018,1.317,1.526,1114.537
...,...,...,...,...,...,...,...,...
586,1566.000,0.021,0.012,-0.017,0.013,0.021,0.028,0.103
587,1566.000,0.016,0.009,0.003,0.011,0.015,0.020,0.080
588,1566.000,0.005,0.003,0.001,0.003,0.005,0.006,0.029
589,1566.000,99.670,93.892,0.000,44.369,71.900,114.750,737.305


***Insights***
* The dataset contains a large feature set describing production entities with 590 numerical attributes, 1 date-time attribute and 1 target attribute.
* All the numerical features are denoted by index values lacking the nature and the unit of their measurements.
* The dataset is imbalanced: at least 75% of the records represent a passed yield, while only a minority represent failed yields.

# Data Cleansing

In [7]:
# Null entry count in the 'Time' and 'Pass/Fail' attributes (the two columns that are not float64)
display_null_count(signal_df[['Time', 'Pass/Fail']])

--------------------------------------------------
Time
--------------------------------------------------
Null entry count: 0
Null entry proportion: 0.0 %

--------------------------------------------------
Pass/Fail
--------------------------------------------------
Null entry count: 0
Null entry proportion: 0.0 %



In [8]:
# Removing features with more than 20% null entries & imputing the remaining null entries with attribute mean values
# Share of null entries per float attribute
null_proportions = signal_df.select_dtypes(include=['float64']).isnull().mean()

# Attributes that are more than 20% null are dropped outright
sparse_attrs = null_proportions[null_proportions > 0.2].index
signal_df.drop(columns=sparse_attrs, inplace=True)

# The remaining attributes with any nulls are mean-imputed. The result is assigned back to the
# frame explicitly: calling fillna(inplace=True) on a selected column (chained assignment)
# does not update the parent frame under pandas copy-on-write.
impute_attrs = null_proportions[(null_proportions > 0) & (null_proportions <= 0.2)].index
signal_df[impute_attrs] = signal_df[impute_attrs].fillna(signal_df[impute_attrs].mean())

signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 560 entries, Time to Pass/Fail
dtypes: float64(558), int64(1), object(1)
memory usage: 6.7+ MB


In [9]:
# Features with zero standard deviation (i.e. a single constant value in every record)
float_stds = signal_df.select_dtypes(include=['float64']).std()
std_0_cols = list(float_stds[float_stds == 0].index)
signal_df[std_0_cols].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
5,1567.000,100.000,0.000,100.000,100.000,100.000,100.000,100.000
13,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
42,1567.000,70.000,0.000,70.000,70.000,70.000,70.000,70.000
49,1567.000,1.000,0.000,1.000,1.000,1.000,1.000,1.000
52,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...
534,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
535,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
536,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
537,1567.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [10]:
# Dropping the 116 attributes collected in std_0_cols, which have no variation in their data
signal_df.drop(columns=std_0_cols, inplace=True)
signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 444 entries, Time to Pass/Fail
dtypes: float64(442), int64(1), object(1)
memory usage: 5.3+ MB


In [11]:
# Setting the data-type of the 'Time' attribute as datetime
# (pd.to_datetime parses the column, which was loaded with the generic object dtype)
signal_df['Time'] = pd.to_datetime(signal_df['Time'])
signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Columns: 444 entries, Time to Pass/Fail
dtypes: datetime64[ns](1), float64(442), int64(1)
memory usage: 5.3 MB


In [12]:
# Attributes whose absolute correlation with the target variable 'Pass/Fail' is below 0.05
# NOTE(review): this screening uses every record, including those that later form the test split
target_corr = signal_df.corr()['Pass/Fail'].abs()
weak_corr_features = target_corr.loc[target_corr.lt(0.05)]
weak_corr_features

Time   0.020
0      0.025
1      0.003
2      0.001
3      0.025
        ... 
585    0.005
586    0.004
587    0.035
588    0.031
589    0.003
Name: Pass/Fail, Length: 360, dtype: float64

Dropping all the attributes that have a very weak correlation (absolute correlation coefficient < 0.05) with the target attribute 'Pass/Fail', as their influence on the target variable is insignificant.

In [13]:
# Remove every attribute flagged as weakly correlated with the target
signal_df.drop(columns=weak_corr_features.index, inplace=True)
signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Data columns (total 84 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   14         1567 non-null   float64
 1   21         1567 non-null   float64
 2   22         1567 non-null   float64
 3   26         1567 non-null   float64
 4   28         1567 non-null   float64
 5   32         1567 non-null   float64
 6   33         1567 non-null   float64
 7   38         1567 non-null   float64
 8   40         1567 non-null   float64
 9   56         1567 non-null   float64
 10  58         1567 non-null   float64
 11  59         1567 non-null   float64
 12  63         1567 non-null   float64
 13  64         1567 non-null   float64
 14  65         1567 non-null   float64
 15  68         1567 non-null   float64
 16  70         1567 non-null   float64
 17  76         1567 non-null   float64
 18  79         1567 non-null   float64
 19  90         1567 non-null   float64
 20  95      

In [14]:
# Attributes with high correlation(>0.95)
# For every pair (i, j) with i < j whose absolute correlation exceeds 0.95,
# the later attribute (j) is marked for removal; the set removes repeats.
signal_df_corr = signal_df.select_dtypes(include=['float64']).corr()
n_attrs = len(signal_df_corr.columns)
corr_attrs = list({
    signal_df_corr.columns[j]
    for i in range(n_attrs)
    for j in range(i + 1, n_attrs)
    if abs(signal_df_corr.iloc[i, j]) > 0.95
})
corr_attrs

['554',
 '319',
 '127',
 '452',
 '435',
 '298',
 '295',
 '557',
 '387',
 '575',
 '469',
 '436',
 '249',
 '437',
 '299',
 '294',
 '165',
 '455',
 '477',
 '300']

In [15]:
# Dropping the attributes collected in corr_attrs (one side of each highly correlated pair)
signal_df.drop(columns=corr_attrs, inplace=True)
signal_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1567 entries, 0 to 1566
Data columns (total 64 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   14         1567 non-null   float64
 1   21         1567 non-null   float64
 2   22         1567 non-null   float64
 3   26         1567 non-null   float64
 4   28         1567 non-null   float64
 5   32         1567 non-null   float64
 6   33         1567 non-null   float64
 7   38         1567 non-null   float64
 8   40         1567 non-null   float64
 9   56         1567 non-null   float64
 10  58         1567 non-null   float64
 11  59         1567 non-null   float64
 12  63         1567 non-null   float64
 13  64         1567 non-null   float64
 14  65         1567 non-null   float64
 15  68         1567 non-null   float64
 16  70         1567 non-null   float64
 17  76         1567 non-null   float64
 18  79         1567 non-null   float64
 19  90         1567 non-null   float64
 20  95      

In [16]:
# Check for duplicated records: number of rows flagged by duplicated() as repeats of an earlier row
signal_df.duplicated().sum()

np.int64(0)

* The number of features has been reduced from 591 to 63 using appropriate feature engineering techniques.
* All the features that contained more than 20% null data have been dropped, since attributes with so many missing entries are unlikely to convey much information.
* Missing records in the remaining attributes are replaced with the attribute data mean.
* Attributes that have constant data throughout the records have also been removed as they depict no variation and do not contribute towards predicting the target value.
* Features that have very low correlation with the target variable have been dropped since they have very insignificant impact in determining the target value.
* One of the attributes from each of the highly correlated pairs of features have also been dropped since the variation contributed by both the features in such a pair would be the same. 
* The reduction in the number of features would significantly reduce the computational complexity of the model.
* PCA could help us further reduce the number of attributes for a much lighter and efficient model.

# Data Analysis & Visualisation

In [17]:
# Histogram of every attribute, two plots per row
attrs = signal_df.columns
n_plot_cols = 2
# Ceiling division, so an odd number of attributes still gets a (partly empty) last row
n_plot_rows = -(-len(attrs) // n_plot_cols)
figure, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                            figsize=(10, 5 * n_plot_rows), squeeze=False)
flat_axes = axes.ravel()
for ax, attr in zip(flat_axes, attrs):
    sns.histplot(signal_df[attr], ax=ax)
    ax.set_title(f'Attribute: {attr}')
# Hide the unused trailing axes, if any
for ax in flat_axes[len(attrs):]:
    ax.set_visible(False)

In [18]:
# Boxplot of every attribute, two plots per row
attrs = signal_df.columns
n_plot_cols = 2
# Ceiling division, so an odd number of attributes still gets a (partly empty) last row
n_plot_rows = -(-len(attrs) // n_plot_cols)
figure, axes = plt.subplots(nrows=n_plot_rows, ncols=n_plot_cols,
                            figsize=(10, 5 * n_plot_rows), squeeze=False)
flat_axes = axes.ravel()
for ax, attr in zip(flat_axes, attrs):
    sns.boxplot(signal_df[attr], ax=ax)
    ax.set_title(f'Attribute: {attr}')
# Hide the unused trailing axes, if any
for ax in flat_axes[len(attrs):]:
    ax.set_visible(False)

***Insights***
* Most of the attributes have skewed data, with some attributes like '63', '114', '159', '210' etc., exhibiting a high degree of right skewness and attributes like '22', '40', '56', '68', etc., exhibiting left skewness.
* Some of the normally distributed attributes include '14', '76', '90', '103' etc.
* Attribute '114' data has very small variation with majority of the records having value as 0.
* Most of the attributes have outliers, notably '26', '59', '210', '348' etc.
* Target variable is imbalanced with majority of the records having value entered as -1 (Pass).

In [19]:
# Scatter plot of attribute '180' against attribute '316'
sns.scatterplot(data=signal_df, x='180', y='316');

In [20]:
# Scatter plot of attribute '122' against attribute '130', coloured by the target class
sns.scatterplot(data=signal_df, x='122', y='130', hue='Pass/Fail');

In [21]:
# Heatmap of the pairwise correlations between all remaining attributes (target included)
corr_matrix = signal_df.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, vmin=-1, vmax=1);

***Insights***
* Attributes '180' and '316' have strong positive linear relationship.
* A negative relationship can be observed between the attributes '122' and '130'.
* Attributes '59', '103', '210', '348' etc., seem to exhibit a relatively strong correlation with the target variable.

# Data Pre-processing

In [22]:
# Separating the target variable from the feature variables
Y = signal_df['Pass/Fail']
X = signal_df.drop(columns='Pass/Fail')

X.head()

Unnamed: 0,14,21,22,26,28,32,33,38,40,56,...,434,460,468,471,510,511,542,543,551,573
0,7.956,-5419.0,2916.5,1.773,64.233,83.397,9.513,86.956,61.29,0.932,...,10.05,29.939,311.638,9.775,64.671,0.0,0.11,0.008,0.78,0.316
1,10.155,-5441.5,2604.25,2.014,68.422,84.905,9.8,87.524,78.25,0.932,...,8.634,40.447,463.288,10.221,141.436,0.0,0.11,0.008,1.33,0.265
2,9.516,-5447.75,2701.75,2.03,67.133,84.757,8.659,84.733,14.37,0.914,...,14.25,32.359,21.364,8.398,240.777,244.275,0.11,0.008,0.85,0.188
3,9.605,-5468.25,2648.25,2.004,62.933,84.91,8.679,86.687,76.9,0.914,...,5.176,27.682,24.283,9.149,113.559,0.0,0.11,0.008,39.33,0.174
4,10.566,-5476.25,2635.25,1.991,62.833,86.327,8.768,86.147,76.39,0.93,...,11.406,30.892,44.898,7.436,148.066,0.0,0.11,0.008,1.98,0.222


In [23]:
# Piechart of the target class distribution
value_counts = Y.value_counts()
# Legend names are looked up from the class codes (-1 = Pass, 1 = Fail) so they stay correct
# whatever order value_counts() returns the classes in
class_names = {-1: 'Pass', 1: 'Fail'}
pie = plt.pie(value_counts, labels=value_counts.keys(), labeldistance=None, autopct='%.2f%%')
plt.legend(pie[0], [class_names[code] for code in value_counts.keys()],
           loc='upper right', bbox_to_anchor=(1.5, 1))
plt.title('"Pass/Fail" data distribution');

The target feature data is highly imbalanced with only 6.64% of the records representing failed products and hence oversampling is performed to balance the target classes.

In [24]:
# Over-sampling the minority class with SMOTE
# NOTE(review): the resampling runs before the train-test split, so synthetic records in the
# training split can be interpolated from records that land in the test split; confirm that
# this is intended, as it can make the test scores optimistic.
smote = SMOTE(random_state=1, k_neighbors=5)
# to_numpy() replaces the deprecated Series.ravel(); the labels are still passed as a 1-D array
X_bal, Y_bal = smote.fit_resample(X, Y.to_numpy())
Y_bal = pd.Series(Y_bal)

# Piechart of the balanced target class distribution
value_counts = Y_bal.value_counts()
# Legend names are looked up from the class codes (-1 = Pass, 1 = Fail): the classes are tied
# after balancing, so the order returned by value_counts() must not be relied on
class_names = {-1: 'Pass', 1: 'Fail'}
pie = plt.pie(value_counts, labels=value_counts.keys(), labeldistance=None, autopct='%.2f%%')
plt.legend(pie[0], [class_names[code] for code in value_counts.keys()],
           loc='upper right', bbox_to_anchor=(1.5, 1))
plt.title('"Pass/Fail" data distribution');

In [25]:
# Train-test split (80/20) of the balanced data, followed by a summary of both partitions
x_train, x_test, y_train, y_test = train_test_split(X_bal, Y_bal, test_size=0.20, random_state=1)

total_rows = X_bal.shape[0]
separator = '-'*50
print(f'Proportion of train data: {round(100 * x_train.shape[0] / total_rows, 2)}%')
print(f'Proportion of test data: {round(100 * x_test.shape[0] / total_rows, 2)}%')
for split_name, split_target in (('train', y_train), ('test', y_test)):
    print(separator)
    print(f'\nClass wise distribution of {split_name} data:')
    print(split_target.value_counts(normalize=True) * 100)

Proportion of train data: 79.97%
Proportion of test data: 20.03%
--------------------------------------------------

Class wise distribution of train data:
 1   50.299
-1   49.701
Name: proportion, dtype: float64
--------------------------------------------------

Class wise distribution of test data:
-1   51.195
 1   48.805
Name: proportion, dtype: float64


In [26]:
# Standardization: the scaler is fitted on the training features only and then applied to both splits
std_scalar = StandardScaler()
std_scalar.fit(x_train)
feature_names = x_train.columns
x_train_std = pd.DataFrame(std_scalar.transform(x_train), columns=feature_names)
x_test_std = pd.DataFrame(std_scalar.transform(x_test), columns=feature_names)

x_train_std.head()

Unnamed: 0,14,21,22,26,28,32,33,38,40,56,...,434,460,468,471,510,511,542,543,551,573
0,-0.093,0.175,0.021,0.339,0.693,0.653,-0.21,-0.281,-0.756,0.009,...,-0.199,-0.75,0.243,0.256,1.34,1.078,-0.126,1.215,-0.152,-0.462
1,-1.082,-1.236,1.126,-0.115,-0.414,-0.362,-0.38,0.233,0.502,0.614,...,-0.182,1.403,-0.468,-0.052,-0.15,-0.935,-0.577,-0.411,-0.649,-1.157
2,1.334,-0.045,-0.251,0.195,2.102,-0.817,-0.515,-0.526,0.65,0.854,...,-0.215,-0.813,-0.058,-0.504,-0.604,-0.935,-0.99,-1.349,-0.156,0.144
3,0.211,-0.127,0.108,0.09,0.197,0.307,-0.213,-0.339,0.455,1.306,...,-0.118,0.417,-0.073,0.418,-0.361,1.378,-0.576,-0.394,0.129,-0.056
4,-1.105,0.026,-0.23,0.265,0.069,-0.374,-0.192,-0.586,0.597,-0.093,...,-0.186,0.858,-0.912,1.166,-0.294,-0.935,0.585,0.163,0.006,0.821


Statistical characteristics: Original data

In [27]:
# Summary statistics of the cleaned data set before over-sampling (features and target)
signal_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
14,1567.000,9.005,2.794,2.249,7.097,8.974,10.859,19.547
21,1567.000,-5618.394,626.422,-7150.250,-5932.625,-5523.250,-5356.625,0.000
22,1567.000,2699.378,295.310,0.000,2578.125,2664.000,2840.625,3656.250
26,1567.000,1.938,0.189,0.000,1.907,1.986,2.003,2.053
28,1567.000,69.500,3.459,59.400,67.383,69.156,72.256,77.900
...,...,...,...,...,...,...,...,...
542,1567.000,0.111,0.003,0.105,0.110,0.110,0.113,0.118
543,1567.000,0.008,0.002,0.005,0.008,0.008,0.009,0.024
551,1567.000,1.231,1.243,0.120,0.910,1.231,1.330,39.330
573,1567.000,0.346,0.248,0.067,0.242,0.293,0.367,2.197


Statistical characteristics: Train data

In [28]:
# Summary statistics of the training features (after over-sampling, before standardization)
x_train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
14,2340.000,8.653,2.584,2.249,6.789,8.633,10.393,19.547
21,2340.000,-5514.459,784.048,-7150.250,-5733.438,-5482.125,-5334.087,0.000
22,2340.000,2660.227,365.924,0.000,2579.692,2651.350,2782.772,3656.250
26,2340.000,1.917,0.257,0.000,1.949,1.987,2.001,2.051
28,2340.000,68.910,3.195,59.400,67.031,68.676,70.878,77.900
...,...,...,...,...,...,...,...,...
511,2340.000,313.453,335.419,0.000,0.000,200.004,635.912,1000.000
542,2340.000,0.111,0.002,0.105,0.110,0.110,0.113,0.118
543,2340.000,0.008,0.001,0.005,0.008,0.008,0.009,0.024
551,2340.000,1.368,1.523,0.120,0.960,1.231,1.360,25.470


Statistical characteristics: Test data

In [29]:
# Summary statistics of the test features held in x_test
x_test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
14,586.000,8.679,2.511,2.970,6.887,8.725,10.362,18.863
21,586.000,-5464.571,800.878,-6874.242,-5690.625,-5473.750,-5305.906,-1460.897
22,586.000,2647.385,390.706,613.531,2575.202,2635.178,2754.506,3613.750
26,586.000,1.909,0.273,0.439,1.955,1.987,2.003,2.053
28,586.000,68.868,3.344,59.967,66.986,68.679,70.956,77.122
...,...,...,...,...,...,...,...,...
511,586.000,296.799,335.311,0.000,0.000,120.324,640.000,988.235
542,586.000,0.111,0.002,0.106,0.110,0.110,0.113,0.118
543,586.000,0.008,0.001,0.006,0.008,0.008,0.009,0.014
551,586.000,1.450,2.413,0.320,0.955,1.231,1.332,39.330


The statistical characteristics of the original, train and test data appear to be approximately the same, with only small variations.

# Model Training, Testing & Tuning

##### Logistic Regression

In [35]:
# Logistic regression with default hyper-parameters, fitted on the standardized training features
model = LogisticRegression()
model.fit(x_train_std, y_train)

Train CR: Logistic Regression (Base)

In [34]:
# NOTE(review): this rebinds x_test to the standardized frame, so the unscaled test features are
# no longer reachable through x_test. Later cells that pass x_test to an estimator that scales
# its own input (the final pipeline) will receive already-standardized data.
x_test = x_test_std

In [36]:
# Training-set report for the base logistic regression (predictions on the data it was fitted on)
get_classification_report(actuals=y_train, predictions=model.predict(x_train_std), labels=Y.unique())

Classification report
``````````````````````````````````````````````````
              precision    recall  f1-score   support

          -1       0.85      0.81      0.83      1163
           1       0.82      0.86      0.84      1177

    accuracy                           0.83      2340
   macro avg       0.83      0.83      0.83      2340
weighted avg       0.83      0.83      0.83      2340



In [37]:
# Balanced accuracy of the base logistic regression on the test split.
# The model was fitted on standardized features, so it is scored on x_test_std explicitly
# rather than relying on x_test having been rebound to the standardized frame.
# `metrics` is already imported in the first cell, so no extra import is needed here.
metrics.balanced_accuracy_score(y_test, model.predict(x_test_std))

np.float64(0.8060839160839162)

Leave-one-out Cross-validation

In [None]:
# Leave-one-out cross-validation: one fold per training record, each scored on a single held-out row
loocv = LeaveOneOut()
loocv_score = cross_val_score(LogisticRegression(), x_train_std, y_train, cv=loocv)

# There is one score per record (each is 0 or 1), so only the fold count is printed, not the full array
print('Number of leave-one-out folds:', len(loocv_score))
print("Overall Accuracy (Standard-deviation): %.3f%% (%.3f%%)" % (loocv_score.mean()*100, loocv_score.std()*100))

K-fold Cross-validation

In [None]:
# 10-fold cross-validation with shuffled folds
kcv = KFold(n_splits=10, shuffle=True, random_state=1)
kcv_score = cross_val_score(estimator=LogisticRegression(), X=x_train_std, y=y_train, cv=kcv)
kcv_mean_pct, kcv_std_pct = kcv_score.mean()*100, kcv_score.std()*100

print('Accuracy of 10 folds:\n', kcv_score)
print("Overall Accuracy (Standard-deviation): %.3f%% (%.3f%%)" % (kcv_mean_pct, kcv_std_pct))

Stratified K-fold Cross-validation

In [None]:
# Stratified 10-fold cross-validation with shuffled folds
skcv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
skcv_score = cross_val_score(estimator=LogisticRegression(), X=x_train_std, y=y_train, cv=skcv)
skcv_mean_pct, skcv_std_pct = skcv_score.mean()*100, skcv_score.std()*100

print('Accuracy of 10 folds:\n', skcv_score)
print("Overall Accuracy (Standard-deviation): %.3f%% (%.3f%%)" % (skcv_mean_pct, skcv_std_pct))

In [None]:
# Logistic Regression hyper-parameters
# Only combinations that LogisticRegression accepts are listed, one dict per compatible group.
# A single cross-product of all penalties, solvers and dual flags is mostly invalid, and those
# candidates fail during fitting instead of being evaluated.
# 'elasticnet' is left out: it needs the saga solver plus an explicit l1_ratio, which was never supplied.
lr_params = [
    # Solvers that support the l2 penalty or no penalty (primal formulation only)
    {"penalty": ['l2', None],
     "solver": ['newton-cg', 'lbfgs', 'sag', 'saga'],
     "dual": [False]},
    # The l1 penalty is supported by liblinear and saga only
    {"penalty": ['l1'],
     "solver": ['liblinear', 'saga'],
     "dual": [False]},
    # The dual formulation exists only for l2 with liblinear
    {"penalty": ['l2'],
     "solver": ['liblinear'],
     "dual": [True, False]},
]

Grid-search

In [None]:
# Grid Search: evaluates every candidate in lr_params with GridSearchCV's default cross-validation
grid_search = GridSearchCV(estimator=LogisticRegression(), param_grid=lr_params)
grid_search.fit(x_train_std, y_train)
print('Best parameters:', grid_search.best_params_)

In [None]:
# Best estimator
# GridSearchCV refits the best candidate on the full data passed to grid_search.fit
# (refit=True is the default), so best_estimator_ is already trained and needs no second fit.
lr_gsbf = grid_search.best_estimator_
lr_gsbf

Train CR: Logistic Regression (Grid-Search Best-fit)

In [None]:
# Training-set report for the grid-search best-fit logistic regression
get_classification_report(actuals=y_train, predictions=lr_gsbf.predict(x_train_std), labels=Y.unique())

Randomized-search

In [None]:
# Randomized search: samples candidates from lr_params instead of trying all of them
random_search = RandomizedSearchCV(estimator=LogisticRegression(),
                                   param_distributions=lr_params,
                                   random_state=1)
random_search.fit(x_train_std, y_train)
print('Best parameters:', random_search.best_params_)

In [None]:
# Best estimator
# RandomizedSearchCV refits the best candidate on the full data passed to random_search.fit
# (refit=True is the default), so best_estimator_ is already trained and needs no second fit.
lr_rsbf = random_search.best_estimator_
lr_rsbf

Train CR: Logistic Regression (Randomized-Search Best-fit)

In [None]:
# Training-set report for the randomized-search best-fit logistic regression
get_classification_report(actuals=y_train, predictions=lr_rsbf.predict(x_train_std), labels=Y.unique())

In [None]:
# Best parameters from the randomized search, kept for re-use when fitting on the principal components
lr_best_params = random_search.best_params_

Implementing PCA

In [None]:
# PCA with 40 components, fitted on the standardized training features
pca40 = PCA(n_components=40)
pca40.fit(x_train_std)

# Cumulative variance explained with PCA components
component_counts = list(range(1, 41))
cumulative_variance = np.cumsum(pca40.explained_variance_ratio_)
plt.figure(figsize=(10, 10))
plt.step(component_counts, cumulative_variance, where='mid')
plt.ylabel('Cumulative variance explained')
plt.xlabel('Number of PCA components')
plt.xticks(component_counts);
# Reference lines: the 90% variance level (red) and the 32-component mark (blue)
plt.axhline(0.9, color='r');
plt.axvline(32, color='b');

32 of the derived principal components explain more than 90% of the variance in the data.

In [None]:
# PCA with 32 components explaining more than 90% of the variance
# (fitted on the standardized training features only)
pca32 = PCA(n_components=32)
pca32.fit(x_train_std)

In [None]:
# PCA data: project the standardized train and test features onto the 32 fitted components
x_train_pca = pca32.transform(x_train_std)
x_test_pca = pca32.transform(x_test_std)
x_train_pca

In [None]:
# Logistic regression with the randomized-search best parameters, fitted on the principal components
lr_pca = LogisticRegression(**lr_best_params)
lr_pca.fit(x_train_pca, y_train)

Train CR: Logistic Regression (Best-fit with 32 PCs)

In [None]:
# Training-set report for the logistic regression fitted on the principal components
get_classification_report(actuals=y_train, predictions=lr_pca.predict(x_train_pca), labels=Y.unique())

***Insights***
* The best-fit logistic regression model, trained on the standardized original 63 attributes, provided an accuracy of 83% with its precision and recall scores also being 83%.
* It accurately predicted 936 out of 1163 passed products and 1010 out of 1177 failed products.
* Through PCA, the number of features were reduced by half to only 32 principal components that explained more than 90% of the variation in the data.
* The best-fit model, trained on the principal components, provided an accuracy of 78% with its precision and recall scores also being 78%.
* By reducing the number of attributes by half, the model only lost about 5% in its accuracy. It accurately predicted 887 out of 1163 passed products and 934 out of 1177 failed products.

Test CR: Logistic Regression (Best-fit with 32 PCs)

In [None]:
# Test-set report for the logistic regression fitted on the principal components
get_classification_report(actuals=y_test, predictions=lr_pca.predict(x_test_pca), labels=Y.unique())

***Insights***
* The model also performed considerably well with the test data with similar scores as with the train data indicating that the model is not overfit.
* It has provided an accuracy of 76% with average precision and recall scores being 76%.
* It has accurately predicted 221 out of 300 passed products and 223 out of 286 failed products.

In [None]:
def find_best_estimator(model, param_distributions, x, y):
    """
    Tune `model` with a randomized hyper-parameter search and return the winner.

    model: unfitted estimator to tune.
    param_distributions: hyper-parameter names mapped to the candidate values to sample from.
    x, y: features and labels the search is fitted on.

    Returns a tuple (best_estimator, best_params).
    """
    search_options = dict(param_distributions=param_distributions, random_state=1, n_jobs=4, verbose=10)
    randomized_search = RandomizedSearchCV(model, **search_options).fit(x, y)

    print('Randomized-Search', '-'*50)
    print('Best score:', randomized_search.best_score_)
    print('Best parameters:', randomized_search.best_params_)

    return randomized_search.best_estimator_, randomized_search.best_params_

##### Support Vector Classifier

In [None]:
# Baseline SVC with default hyper-parameters, reported on the training data
svc = SVC().fit(x_train_pca, y_train)
svc_train_predictions = svc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=svc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the SVC
svc_param_distributions = {
    'C': [10, 100],
    'gamma': [0.1, 0.05],
    'kernel': ['rbf', 'linear'],
}
svc_rsbf, svc_best_params = find_best_estimator(model=SVC(),
                                                param_distributions=svc_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

##### K-Neighbors Classifier

In [None]:
# Baseline KNeighborsClassifier with default hyper-parameters, reported on the training data
knc = KNeighborsClassifier().fit(x_train_pca, y_train)
knc_train_predictions = knc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=knc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the KNeighborsClassifier
knc_param_distributions = {
    'n_neighbors': list(range(1, 50, 2)),
    'leaf_size': list(range(1, 50, 5)),
    'weights': ['distance'],
}
knc_rsbf, knc_best_params = find_best_estimator(model=KNeighborsClassifier(),
                                                param_distributions=knc_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

##### GaussianNB

In [None]:
# Baseline GaussianNB with default hyper-parameters, reported on the training data
gnb = GaussianNB().fit(x_train_pca, y_train)
gnb_train_predictions = gnb.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=gnb_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of GaussianNB: 20 log-spaced var_smoothing values from 1 down to 1e-9
gnb_param_distributions = {'var_smoothing': np.logspace(0,-9, num=20)}
gnb_rsbf, gnb_best_params = find_best_estimator(model=GaussianNB(),
                                                param_distributions=gnb_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

##### Decision Tree Classifier

In [None]:
# Baseline DecisionTreeClassifier (shallow tree: depth 3, at least 5 samples per leaf), reported on the training data
dtc = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=1).fit(x_train_pca, y_train)
dtc_train_predictions = dtc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=dtc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the DecisionTreeClassifier
# random_state is fixed, as in the other tree-based cells, so that the search is reproducible
# (in particular for the candidates that use splitter='random')
dtc_rsbf, dtc_best_params = find_best_estimator(model=DecisionTreeClassifier(random_state=1),
                                                param_distributions={'criterion': ['gini', 'entropy', 'log_loss'],
                                                                     'splitter': ['best', 'random'],
                                                                     'max_depth': list(range(3, 21)),
                                                                     'min_samples_leaf': list(range(3, 21))},
                                                x=x_train_pca,
                                                y=y_train)

##### Bagging Classifier

In [None]:
# Baseline BaggingClassifier with default hyper-parameters, reported on the training data
bc = BaggingClassifier(random_state=1).fit(x_train_pca, y_train)
bc_train_predictions = bc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=bc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the BaggingClassifier
bc_param_distributions = {
    'n_estimators': list(range(5, 50, 5)),
    'max_features': list(range(4, 33, 4)),
    'bootstrap_features': [True, False],
}
bc_rsbf, bc_best_params = find_best_estimator(model=BaggingClassifier(random_state=1),
                                              param_distributions=bc_param_distributions,
                                              x=x_train_pca,
                                              y=y_train)

##### Ada-Boost Classifier

In [None]:
# Baseline AdaBoostClassifier with default hyper-parameters, reported on the training data
abc = AdaBoostClassifier(random_state=1).fit(x_train_pca, y_train)
abc_train_predictions = abc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=abc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the AdaBoostClassifier
abc_param_distributions = {'n_estimators': list(range(5, 50, 5))}
abc_rsbf, abc_best_params = find_best_estimator(model=AdaBoostClassifier(random_state=1),
                                                param_distributions=abc_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

##### Gradient Boosting Classifier

In [None]:
# Baseline GradientBoostingClassifier with default hyper-parameters, reported on the training data
gbc = GradientBoostingClassifier(random_state=1).fit(x_train_pca, y_train)
gbc_train_predictions = gbc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=gbc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the GradientBoostingClassifier
gbc_param_distributions = {
    'loss': ['log_loss', 'exponential'],
    'n_estimators': list(range(5, 50, 5)),
    'max_depth': list(range(3, 21)),
}
gbc_rsbf, gbc_best_params = find_best_estimator(model=GradientBoostingClassifier(random_state=1),
                                                param_distributions=gbc_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

##### Random Forest Classifier

In [None]:
# Baseline RandomForestClassifier (shallow trees: depth 3, at least 5 samples per leaf), reported on the training data
rfc = RandomForestClassifier(max_depth=3, min_samples_leaf=5, random_state=1).fit(x_train_pca, y_train)
rfc_train_predictions = rfc.predict(x_train_pca)
get_classification_report(actuals=y_train, predictions=rfc_train_predictions, labels=Y.unique())

In [None]:
# Hyper-parameter tuning of the RandomForestClassifier
rfc_param_distributions = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'n_estimators': list(range(10, 100, 10)),
    'max_depth': list(range(3, 21)),
}
rfc_rsbf, rfc_best_params = find_best_estimator(model=RandomForestClassifier(random_state=1),
                                                param_distributions=rfc_param_distributions,
                                                x=x_train_pca,
                                                y=y_train)

In [None]:
def get_model_scores(x, y, cv=False, models=None, labels=None):
    """
    Compute accuracy and weighted precision/recall/F1 for a set of fitted models.

    x: feature matrix the fitted models are scored on.
    y: true labels aligned with `x`.
    cv: when truthy, each model is also cross-validated on (x, y) with a shuffled
        10-fold split, and the mean and standard deviation of the fold scores are added.
    models: fitted estimators to score. Defaults to the notebook's models
        (lr_pca, svc_rsbf, knc_rsbf, gnb_rsbf, dtc_rsbf, bc_rsbf, abc_rsbf, gbc_rsbf, rfc_rsbf).
    labels: class labels to include in the report. Defaults to `Y.unique()`.

    Returns a DataFrame with one row per model.
    """
    # Defaults are resolved at call time, so the function can be defined before the models exist
    if models is None:
        models = [lr_pca, svc_rsbf, knc_rsbf, gnb_rsbf, dtc_rsbf, bc_rsbf, abc_rsbf, gbc_rsbf, rfc_rsbf]
    if labels is None:
        labels = Y.unique()

    kfcv = KFold(n_splits=10, shuffle=True, random_state=1) if cv else None
    model_scores = list()
    for model in models:
        predictions = model.predict(x)
        cr = metrics.classification_report(y, predictions, labels=labels, output_dict=True)
        scores = {
            'model': model.__class__.__name__,
            # accuracy_score is used instead of cr['accuracy'] because the report only
            # contains an 'accuracy' entry when `labels` covers every class present
            'accuracy': metrics.accuracy_score(y, predictions),
            'precision': cr['weighted avg']['precision'],
            'recall': cr['weighted avg']['recall'],
            'f1_score': cr['weighted avg']['f1-score'],
        }

        if kfcv is not None:
            # cross_val_score fits clones of the model on the folds of (x, y)
            kfcv_score = cross_val_score(model, x, y, cv=kfcv)
            scores.update({
                'cv_mean': kfcv_score.mean(),
                'cv_std': kfcv_score.std()
            })

        model_scores.append(scores)

    return pd.DataFrame(model_scores)

In [None]:
# Scores of all models on the training principal components, with 10-fold cross-validation added
print('Training performance')
get_model_scores(x=x_train_pca, y=y_train, cv=True)

In [None]:
# Scores of all models on the test principal components (no cross-validation)
print('Testing performance')
get_model_scores(x=x_test_pca, y=y_test)

In [None]:
# Pipeline: standardization -> 32-component PCA -> SVC with the tuned hyper-parameters.
# It is fitted on x_train, i.e. the training features before standardization, because the
# pipeline's own scaler performs that step.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=32)),
    ('svc', SVC(**svc_best_params))
])
pipeline.fit(x_train, y_train)

Test CR: SVC (Best-fit with 32 PCs)

In [None]:
# The pipeline standardizes its own input, so it must be given the raw (unscaled) test features.
# `x_test` was rebound to the standardized frame earlier in the notebook, so the raw rows are
# taken from `X_bal` instead: the index labels that `y_test` kept from the train-test split are
# the row positions of those records in `X_bal`/`Y_bal`.
x_test_raw = X_bal.iloc[y_test.index.to_numpy()]
get_classification_report(actuals=y_test, predictions=pipeline.predict(x_test_raw), labels=Y.unique())

The best performing model is found to be the Support Vector Classifier model with the hyper-parameter values: kernel=rbf; gamma=0.1; C=10;
* The model has the highest cross-validated mean accuracy of 98.9% with the lowest standard deviation of 0.5%.
* It also has performed exceptionally well on the testing data with an accuracy score of 98.6%.

In [None]:
# Save model: serialize the fitted pipeline to disk with pickle
model_path = "2024-01-21_model_svc_signaldata.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)

##### Conclusion
* The semiconductor signal dataset contains 590 unlabelled numeric features along with a datetime feature and a categorical target attribute describing if a product failed the quality check or not.
* Since the features are unlabelled and denoted by index values, it was not possible to determine their type or unit of measurements.
* Based on statistical information and feature reduction techniques, the number of predictor variables were reduced to 63 which were further used in PCA to derive 32 principal components that described over 90% of the variation in the data.
* The resultant components were used to train multiple machine learning models to find the best performing algorithm that could predict if a product would fail the quality check.
* By reducing the number of features and extracting the most influential information from the data through PCA, we were able to build an efficient model which is computationally less complex.
* Hyper-parameter tuning further helped to enhance the model performance and compensate for the discarded information.