In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Attach Google Drive to the Colab runtime; the dataset is read from a path under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Location of the cylinder-banding CSV inside the mounted Drive folder.
file_path = '/content/drive/MyDrive/Cylinder_banding_dataset.csv'

# Read the raw CSV into `df`, the working frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=file_path)

In [4]:
# Structural overview of the raw frame: size, a five-row preview and column dtypes.
print("Dataset shape:", df.shape)
print("\nFirst 5 rows of the dataset:")
print(df.head())
df.info()

# Check for missing values.
# The raw file marks missing entries with the literal string '?' (the next
# cleaning step replaces '?' with NaN), so a plain isnull() at this point would
# under-report them. Count '?' placeholders together with genuine NaNs.
missing_mask = df.isnull() | df.eq('?')
print(missing_mask.sum())

Dataset shape: (541, 37)

First 5 rows of the dataset:
  Timestamp Cylinder_Number Customer Job_Number Ink_Color Blade_mfg  \
0  19910108            X126  TVGUIDE      25503       KEY    BENTON   
1  19910109            X266  TVGUIDE      25503       KEY    BENTON   
2  19910104              B7   MODMAT      47201       KEY    BENTON   
3  19910104            T133   MASSEY      39039       KEY    BENTON   
4  19910111             J34    KMART      37351       KEY    BENTON   

  Cylinder_Division Paper_Type  Ink_Type Press  ...   Wax Hardener Voltage  \
0          GALLATIN   UNCOATED  UNCOATED    NO  ...  50.5     36.4       0   
1          GALLATIN   UNCOATED  UNCOATED    NO  ...  54.9     38.5       0   
2          GALLATIN   UNCOATED    COATED    NO  ...  53.8     39.8       0   
3          GALLATIN   UNCOATED  UNCOATED    NO  ...  55.6     38.8       0   
4          GALLATIN   UNCOATED    COATED    NO  ...  57.5     42.5       5   

  Wax.1  Hardner.1 Blade_Pressure.1 Current_Densi

In [5]:
# Turn the '?' placeholder into NaN so pandas treats those cells as missing.
# Only cells that are exactly '?' are affected.
df = df.replace(to_replace='?', value=np.nan)

In [6]:
# Columns handled as categorical: mode-imputed and label-encoded further down.
categorical_columns = [
    'Cylinder_Number',
    'Customer',
    'Job_Number',
    'Ink_Color',
    'Blade_mfg',
    'Cylinder_Division',
    'Paper_Type',
    'Ink_Type',
    'Press',
    'Unit_Number',
    'Cylinder_Size',
    'Press_Type',
    'Press.1',
    'Cylinder_Size.1',
    'Paper_Mill_Location',
]

# Columns expected to hold numeric measurements.
numerical_columns = [
    'Unit_Number.1',
    'Humidity',
    'Varnish_pct',
    'Caliper',
    'Ink_Temperature',
    'Roller_Durometer',
    'Hardner',
    'Blade_Pressure',
    'ESA_Voltage',
    'ESA_Amperage',
    'Wax',
    'Hardener',
    'Voltage',
    'Wax.1',
    'Hardner.1',
    'Blade_Pressure.1',
    'Current_Density',
    'Anode_space_ratio',
    'Chrome_content',
]

In [7]:
# Every column that is neither categorical nor the target is parsed as a number;
# values that cannot be parsed become NaN (errors='coerce').
non_numeric_names = set(categorical_columns) | {'Band_Type'}
numeric_like_names = [name for name in df.columns if name not in non_numeric_names]

for name in numeric_like_names:
    df[name] = pd.to_numeric(df[name], errors='coerce')

In [8]:
# Fill missing values column by column:
#   - categorical columns get their most frequent value (first mode),
#   - all other columns except the target are cast to float and get the mean.
# 'Band_Type' is left untouched here.
# NOTE(review): the fill values are computed on the full frame, i.e. before any
# train/test split, so held-out rows influence them.
for name in df.columns:
    column = df[name]
    if name in categorical_columns:
        df[name] = column.fillna(column.mode()[0])
        continue
    if name == 'Band_Type':
        continue
    as_float = column.astype(float)
    df[name] = as_float.fillna(as_float.mean())

In [9]:
# Remove every row that still has a missing value (the imputation loop skips
# 'Band_Type', so rows without a target label are dropped here), then preview.
df = df.dropna(axis=0, how='any')
df.head(5)

Unnamed: 0,Timestamp,Cylinder_Number,Customer,Job_Number,Ink_Color,Blade_mfg,Cylinder_Division,Paper_Type,Ink_Type,Press,...,Wax,Hardener,Voltage,Wax.1,Hardner.1,Blade_Pressure.1,Current_Density,Anode_space_ratio,Chrome_content,Band_Type
0,19910108.0,X126,TVGUIDE,25503,KEY,BENTON,GALLATIN,UNCOATED,UNCOATED,NO,...,50.5,36.4,0.0,2.5,1.0,34.0,40.0,105.0,100.0,band
1,19910109.0,X266,TVGUIDE,25503,KEY,BENTON,GALLATIN,UNCOATED,UNCOATED,NO,...,54.9,38.5,0.0,2.5,0.7,34.0,40.0,105.0,100.0,noband
2,19910104.0,B7,MODMAT,47201,KEY,BENTON,GALLATIN,UNCOATED,COATED,NO,...,53.8,39.8,0.0,2.8,0.9,40.0,40.0,103.87,100.0,noband
3,19910104.0,T133,MASSEY,39039,KEY,BENTON,GALLATIN,UNCOATED,UNCOATED,NO,...,55.6,38.8,0.0,2.5,1.3,40.0,40.0,108.06,100.0,noband
4,19910111.0,J34,KMART,37351,KEY,BENTON,GALLATIN,UNCOATED,COATED,NO,...,57.5,42.5,5.0,2.3,0.6,35.0,40.0,106.67,100.0,noband


In [10]:
# Re-inspect column dtypes and non-null counts after imputation and row removal.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539 entries, 0 to 540
Data columns (total 37 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Timestamp            539 non-null    float64
 1   Cylinder_Number      539 non-null    object 
 2   Customer             539 non-null    object 
 3   Job_Number           539 non-null    object 
 4   Ink_Color            539 non-null    object 
 5   Blade_mfg            539 non-null    object 
 6   Cylinder_Division    539 non-null    object 
 7   Paper_Type           539 non-null    object 
 8   Ink_Type             539 non-null    object 
 9   Press                539 non-null    object 
 10  Unit_Number          539 non-null    object 
 11  Cylinder_Size        539 non-null    object 
 12  Press_Type           539 non-null    object 
 13  Press.1              539 non-null    object 
 14  Unit_Number.1        539 non-null    float64
 15  Cylinder_Size.1      539 non-null    object 


In [11]:
# Integer-encode every categorical feature (values are cast to str first so
# mixed-type columns sort consistently).
#
# A fresh LabelEncoder is fitted per column and stored in `label_encoders`,
# keyed by column name, so each column's code -> original-label mapping stays
# available (e.g. label_encoders['Customer'].inverse_transform(...)).
# Re-fitting one shared encoder would keep only the last column's mapping.
# The encoded values themselves are the same as with a shared encoder.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

In [12]:
# Preview the frame now that the categorical columns hold integer codes.
df.head(n=5)

Unnamed: 0,Timestamp,Cylinder_Number,Customer,Job_Number,Ink_Color,Blade_mfg,Cylinder_Division,Paper_Type,Ink_Type,Press,...,Wax,Hardener,Voltage,Wax.1,Hardner.1,Blade_Pressure.1,Current_Density,Anode_space_ratio,Chrome_content,Band_Type
0,19910108.0,265,60,24,0,0,0,1,2,0,...,50.5,36.4,0.0,2.5,1.0,34.0,40.0,105.0,100.0,band
1,19910109.0,305,60,24,0,0,0,1,2,0,...,54.9,38.5,0.0,2.5,0.7,34.0,40.0,105.0,100.0,noband
2,19910104.0,10,48,244,0,0,0,1,0,0,...,53.8,39.8,0.0,2.8,0.9,40.0,40.0,103.87,100.0,noband
3,19910104.0,221,46,237,0,0,0,1,2,0,...,55.6,38.8,0.0,2.5,1.3,40.0,40.0,108.06,100.0,noband
4,19910111.0,153,43,194,0,0,0,1,0,0,...,57.5,42.5,5.0,2.3,0.6,35.0,40.0,106.67,100.0,noband


In [13]:
# Encode the target: 'band' -> 1, 'noband' -> 0 (case-insensitive, surrounding
# whitespace ignored).
band_labels = df['Band_Type'].str.strip().str.lower()
band_codes = band_labels.map({'band': 1, 'noband': 0})

# Series.map yields NaN for any label missing from the dict. Fail loudly here
# instead of passing NaN targets on to the classifiers.
unmapped_labels = band_labels[band_codes.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Unexpected Band_Type labels: {list(unmapped_labels)}")

df['Band_Type'] = band_codes

# Classification

In [14]:
# Target vector: the binary band / noband label.
y = df['Band_Type']

# Feature matrix: every remaining column.
X = df.drop(columns=['Band_Type'])

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. train_test_split is already imported in the notebook's import
# cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# Standardise the features. The scaler learns its mean/variance from the
# training rows only and the same transform is then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
# Decision tree baseline: fit on the training split, predict the held-out rows.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
dt_report = classification_report(y_test, y_pred_dt)
print("Decision Tree Classification Report:\n", dt_report)

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.67      0.75        61
           1       0.66      0.83      0.74        47

    accuracy                           0.74       108
   macro avg       0.75      0.75      0.74       108
weighted avg       0.76      0.74      0.74       108



In [18]:
# Random forest with 100 trees: fit on the training split, predict the held-out rows.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Classification Report:\n", rf_report)

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.97      0.89        61
           1       0.94      0.72      0.82        47

    accuracy                           0.86       108
   macro avg       0.88      0.85      0.85       108
weighted avg       0.87      0.86      0.86       108



In [19]:
# RBF-kernel support vector classifier. probability=True additionally enables
# predict_proba on the fitted model.
svm_model = SVC(kernel='rbf', probability=True, random_state=42).fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
svm_report = classification_report(y_test, y_pred_svm)
print("SVM Classification Report:\n", svm_report)

SVM Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81        61
           1       0.77      0.70      0.73        47

    accuracy                           0.78       108
   macro avg       0.78      0.77      0.77       108
weighted avg       0.78      0.78      0.78       108



In [20]:
# Gradient-boosted trees, evaluated with log-loss during training.
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and prints 'Parameters: { "use_label_encoder" } are not used.' on every fit.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))

Parameters: { "use_label_encoder" } are not used.



XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.92      0.89        61
           1       0.88      0.81      0.84        47

    accuracy                           0.87       108
   macro avg       0.87      0.86      0.87       108
weighted avg       0.87      0.87      0.87       108



# Regression

In [21]:
# Look at the fully encoded frame again before building the regression inputs.
encoded_preview = df.head(5)
print(encoded_preview)
df.info()

    Timestamp  Cylinder_Number  Customer  Job_Number  Ink_Color  Blade_mfg  \
0  19910108.0              265        60          24          0          0   
1  19910109.0              305        60          24          0          0   
2  19910104.0               10        48         244          0          0   
3  19910104.0              221        46         237          0          0   
4  19910111.0              153        43         194          0          0   

   Cylinder_Division  Paper_Type  Ink_Type  Press  ...   Wax  Hardener  \
0                  0           1         2      0  ...  50.5      36.4   
1                  0           1         2      0  ...  54.9      38.5   
2                  0           1         0      0  ...  53.8      39.8   
3                  0           1         2      0  ...  55.6      38.8   
4                  0           1         0      0  ...  57.5      42.5   

   Voltage  Wax.1  Hardner.1  Blade_Pressure.1  Current_Density  \
0      0.0    2.5  

In [22]:
# Correlation of every numeric column with Humidity, strongest positive first.
# Label-encoded categorical columns are included, and their integer codes are
# arbitrary, so read those coefficients with care.
corr = df.corr(numeric_only=True)
humidity_corr = corr.loc[:, 'Humidity'].sort_values(ascending=False)
print(humidity_corr)

Humidity               1.000000
ESA_Voltage            0.361854
Blade_Pressure.1       0.344740
Ink_Type               0.340303
Paper_Type             0.269706
Paper_Mill_Location    0.263073
Wax.1                  0.161902
Blade_mfg              0.100362
Blade_Pressure         0.096495
Job_Number             0.095871
Unit_Number            0.089090
Caliper                0.071879
Band_Type              0.043056
Customer               0.031412
Varnish_pct            0.024660
Roller_Durometer       0.007069
Ink_Color              0.005931
Cylinder_Division      0.003757
Ink_Temperature       -0.004169
Press                 -0.004863
Chrome_content        -0.014311
Anode_space_ratio     -0.033402
Hardner.1             -0.046371
Unit_Number.1         -0.047479
Plating_Tank          -0.049739
Cylinder_Size         -0.053896
Current_Density       -0.059656
Hardner               -0.066636
Timestamp             -0.078833
Voltage               -0.102666
Press_Type            -0.119917
Cylinder

In [23]:
# Regression task: predict Humidity from the remaining features.
# Band_Type (the classification target) is dropped from the feature set as well.
X_reg = df.drop(['Humidity', 'Band_Type'], axis=1)
y_reg = df['Humidity']

# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting
# StandardScaler on the full matrix would leak test-set statistics into
# training. This matches the classification section, which also fits its
# scaler on the training split only.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

# Scale features for regression: fit on the training rows, apply to both splits.
scaler_reg = StandardScaler()
X_train_reg = scaler_reg.fit_transform(X_train_reg)
X_test_reg = scaler_reg.transform(X_test_reg)

# Full feature matrix under the train-fitted scaling, kept under its original
# name for any code that still refers to it.
X_reg_scaled = scaler_reg.transform(X_reg)

In [24]:
# Multiple linear regression on the scaled features.
mlr = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_reg = mlr.predict(X_test_reg)

# Held-out error (MSE) and explained variance (R^2).
mse, r2 = (
    mean_squared_error(y_test_reg, y_pred_reg),
    r2_score(y_test_reg, y_pred_reg),
)

print(f"MLR Mean Squared Error: {mse:.2f}")
print(f"MLR R^2 Score: {r2:.2f}")

MLR Mean Squared Error: 61.33
MLR R^2 Score: 0.22
