In [13]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

In [14]:
# Load the bundled wine dataset and wrap it in pandas objects:
# X is the feature table with named columns, y holds the class labels.
wine = load_wine()
X = pd.DataFrame(data=wine["data"], columns=wine["feature_names"])
y = pd.Series(data=wine["target"])

In [15]:
# Blank out the "ash" value (column position 2) in the first three rows so
# that the imputation step below has missing data to work on.
X.iloc[:3, X.columns.get_loc("ash")] = np.nan

In [16]:
# Preview the first rows; the injected NaNs show up in the "ash" column.
print("Original data with missing values:", X.head(), sep="\n")

Original data with missing values:
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71   NaN               15.6      127.0           2.80   
1    13.20        1.78   NaN               11.2      100.0           2.65   
2    13.16        2.36   NaN               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_w

In [17]:
# Replace every missing value with the median of its column.
# fit_transform returns a bare NumPy array, so the column names AND the row
# index are re-attached explicitly; without the index the result would be
# silently renumbered 0..n-1 and could fall out of alignment with X and y.
# NOTE(review): the imputer is fit on the full dataset, before the
# train/test split performed later in this notebook, so test rows
# contribute to the medians. Fit on the training split only if these
# features are used for model evaluation.
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X),
    columns=X.columns,
    index=X.index,
)

In [18]:
# Preview the imputed frame; the first three "ash" values are now filled in.
print(
    "\nData after handling missing values (median imputation):",
    X_imputed.head(),
    sep="\n",
)


Data after handling missing values (median imputation):
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.36               15.6      127.0           2.80   
1    13.20        1.78  2.36               11.2      100.0           2.65   
2    13.16        2.36  2.36               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od

In [19]:
# Turn the integer targets into string class names, then encode those
# strings back into integer codes with LabelEncoder.
y_categorical = y.map(
    {
        0: 'class_0',
        1: 'class_1',
        2: 'class_2',
    }
)
label_encoder = LabelEncoder()
# Learn the label vocabulary first, then apply it to the same series.
y_encoded = label_encoder.fit(y_categorical).transform(y_categorical)

In [20]:
# Show the first five encoded labels.
print("\nEncoded target values:", y_encoded[:5], sep="\n")


Encoded target values:
[0 0 0 0 0]


In [21]:
# Standardize every feature with StandardScaler.
# Column names and the row index are taken from X_imputed, the frame that is
# actually being scaled, so the result stays aligned with its direct input
# instead of relying on X happening to have the same layout.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed later in this notebook, so test rows influence the scaling
# statistics. Fit on the training split only if these features are used for
# model evaluation.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_imputed),
    columns=X_imputed.columns,
    index=X_imputed.index,
)

In [22]:
# Preview the standardized features.
print("\nScaled feature values (first 5 rows):", X_scaled.head(), sep="\n")


Scaled feature values (first 5 rows):
    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  1.518613   -0.562250 -0.020650          -1.169593   1.913905   
1  0.246290   -0.499413 -0.020650          -2.490847   0.018145   
2  0.196879    0.021231 -0.020650          -0.268738   0.088358   
3  1.691550   -0.346811  0.493953          -0.809251   0.930918   
4  0.295700    0.227694  1.853978           0.451946   1.281985   

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884   
1       0.568648    0.733629             -0.820719        -0.544721   
2       0.808997    1.215533             -0.498407         2.135968   
3       2.491446    1.466525             -0.981875         1.032155   
4       0.808997    0.663351              0.226796         0.401404   

   color_intensity       hue  od280/od315_of_diluted_wines   proline  
0         0.251717  0.362177                      1.847920  

In [23]:
# Hold out 25% of the rows for testing. The split is stratified on the
# encoded labels so that train and test keep the same class proportions as
# the full dataset; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.25,
    random_state=42,
    stratify=y_encoded,
)

# Sanity-check the resulting partition sizes.
print("\nShape of train and test sets:")
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


Shape of train and test sets:
X_train: (133, 13), X_test: (45, 13)
y_train: (133,), y_test: (45,)
