# Task for Today  

***

## Semiconductor Test Result Prediction  
  
Given *data about semiconductors*, let's try to predict whether a given semiconductor will **pass or fail** a QA test.  
  
We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
# Load the raw dataset from the CSV file into a DataFrame.
data = pd.read_csv('../input/uci-semcom/uci-secom.csv')

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Turn the raw dataset into scaled train/test splits.

    Steps, in order:

    1. Separate the ``Pass/Fail`` target and relabel it
       (-1 -> "PASS", 1 -> "FAIL").
    2. Drop the ``Time`` column.
    3. Drop feature columns in which 25% or more of the values are missing.
    4. Drop feature columns that hold at most one distinct (non-missing)
       value, since they carry no information for the model.
    5. Split 70/30 into train and test sets (shuffled, ``random_state=1``).
    6. Fill the remaining missing values with the *training-set* column means.
    7. Standardize the features with a scaler fitted on the training set only.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with a ``Time`` column, a ``Pass/Fail`` target column coded
        as -1/1, and numeric feature columns.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled feature matrices that keep the original row index and column
        names.
    y_train, y_test : pandas.Series
        Target labels ("PASS" / "FAIL") aligned with the feature matrices.
    """
    target_column = 'Pass/Fail'

    # Give text labels to the examples and separate features from the target.
    # `replace` and `drop` both return new objects, so `df` stays untouched.
    y = df[target_column].replace({-1: "PASS", 1: "FAIL"})
    X = df.drop(['Time', target_column], axis=1)

    # Drop columns where 25% or more of the values are missing
    sparse_columns = X.columns[X.isna().mean() >= 0.25]
    X = X.drop(sparse_columns, axis=1)

    # Drop columns with at most one distinct value. Detecting them from the
    # data (instead of a hard-coded list of names) keeps the function working
    # on frames whose constant columns differ. `nunique` ignores NaN, so this
    # is done before imputation to avoid a rounded mean masquerading as a
    # second distinct value.
    constant_columns = X.columns[X.nunique() <= 1]
    X = X.drop(constant_columns, axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Fill remaining missing values using statistics from the training set
    # only, so no information from the test set leaks into training (the same
    # reason the scaler below is fitted on X_train alone).
    train_means = X_train.mean()
    X_train = X_train.fillna(train_means)
    X_test = X_test.fillna(train_means)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature matrices and their label series.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Display the scaled training features.
X_train

In [None]:
# Display the training labels.
y_train

# Examining Class Imbalance

In [None]:
# Count how many training examples fall into each class.
y_train.value_counts()

In [None]:
# Count the training examples per class once and plot straight from those
# counts. Taking both the slice names and the slice sizes from the counts
# avoids relying on the name pandas gives the value_counts() result (which
# differs between pandas versions) and on an assumed PASS/FAIL ordering.
class_counts = y_train.value_counts()

fig = px.pie(
    names=class_counts.index,
    values=class_counts.values,
    title="Class Distribution",
    width=500
)

fig.show()

# Training a Model (Imbalanced Classes)

In [None]:
def evaluate_model(model, X_test, y_test):
    """Report how a fitted classifier performs on held-out data.

    Prints the accuracy, shows the confusion matrix as a heatmap, and then
    prints the classification report.

    Parameters
    ----------
    model
        Fitted estimator exposing ``score`` and ``predict``.
    X_test
        Feature matrix to evaluate on.
    y_test
        True labels for ``X_test``; the report and matrix are restricted to
        the "PASS" and "FAIL" labels, in that order.
    """
    class_labels = ['PASS', 'FAIL']
    # Heatmap cells are one unit wide, so their centres sit at 0.5 and 1.5.
    tick_positions = [0.5, 1.5]

    accuracy = model.score(X_test, y_test)
    print("Accuracy: {:.2f}%".format(accuracy * 100))

    predictions = model.predict(X_test)
    matrix = confusion_matrix(y_test, predictions, labels=class_labels)
    report = classification_report(y_test, predictions, labels=class_labels)

    plt.figure(figsize=(6, 6))
    sns.heatmap(matrix, annot=True, fmt='g', vmin=0, cmap='Blues', cbar=False)
    plt.xticks(ticks=tick_positions, labels=class_labels)
    plt.yticks(ticks=tick_positions, labels=class_labels)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.show()

    print("Classification Report:\n----------------------\n", report)

In [None]:
# Baseline: fit on the training split as-is (classes left imbalanced), then
# evaluate on the held-out test split.
model = LogisticRegression().fit(X_train, y_train)
evaluate_model(model, X_test, y_test)

# Random Oversampling

In [None]:
# Resample the training split only; the test split is left untouched so that
# evaluation still runs on the original data.
oversampler = RandomOverSampler(random_state=1)
X_train_os, y_train_os = oversampler.fit_resample(X_train, y_train)

In [None]:
# Inspect the class counts of the resampled labels (not the original
# y_train) to confirm what the oversampler produced.
y_train_os.value_counts()

In [None]:
# Plot the class distribution of the randomly oversampled labels. Slice names
# and sizes both come from the counts, so the chart does not depend on the
# name pandas gives the value_counts() result or on an assumed label order.
class_counts_os = y_train_os.value_counts()

fig = px.pie(
    names=class_counts_os.index,
    values=class_counts_os.values,
    title="Class Distribution",
    width=500
)

fig.show()

In [None]:
# Fit on the randomly oversampled training data, then evaluate on the
# original (non-resampled) test split.
model = LogisticRegression().fit(X_train_os, y_train_os)
evaluate_model(model, X_test, y_test)

# Oversampling With SMOTE

In [None]:
# Resample the training split with SMOTE; as before, the test split is not
# resampled.
oversampler = SMOTE(random_state=1)
X_train_smote, y_train_smote = oversampler.fit_resample(X_train, y_train)

In [None]:
# Count the examples per class in the SMOTE-resampled labels.
y_train_smote.value_counts()

In [None]:
# Fit on the SMOTE-resampled training data, then evaluate on the original
# (non-resampled) test split.
model = LogisticRegression().fit(X_train_smote, y_train_smote)
evaluate_model(model, X_test, y_test)

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/XzmLJasJhS4