In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler


# another way of ignoring warnings
warnings.filterwarnings('ignore')

In [2]:
### Importing data
# Read the raw case data, then separate the first column (target `y`)
# from all remaining columns (predictors `X`).
df = pd.read_csv("case1Data.csv")
y, X = df.iloc[:, 0], df.iloc[:, 1:]

# Preview the first ten rows of the raw frame.
df.head(10)

Unnamed: 0,y,x_01,x_02,x_03,x_04,x_05,x_06,x_07,x_08,x_09,...,x_91,x_92,x_93,x_94,x_95,C_01,C_02,C_03,C_04,C_05
0,375.823073,6.359019,-13.36712,-2.48375,-6.641891,11.733539,,-17.085361,22.194764,16.827888,...,-10.200888,3.980048,-4.433274,-1.473723,,74.0,72.0,72.0,73.0,73.0
1,266.81173,3.873664,-8.470389,-3.055012,,11.420983,1.82233,-13.6941,22.738654,20.307503,...,-9.740207,,-2.629314,4.816987,-12.240248,74.0,72.0,72.0,73.0,73.0
2,267.271759,5.275824,-12.070531,-1.366168,-4.8191,10.721527,-5.125992,-17.476865,,15.963889,...,-14.50197,10.054005,,,-11.107921,73.0,72.0,75.0,74.0,74.0
3,219.951294,4.43011,-4.467975,-0.730736,-10.047104,11.498539,-2.87026,-14.033012,18.22519,10.409488,...,-11.086963,2.019726,-8.531959,3.520833,,71.0,72.0,73.0,71.0,72.0
4,289.697954,3.116458,-8.518713,-6.79605,,7.646285,-3.118309,-13.102567,22.801217,16.680208,...,-9.117422,6.627601,-2.805531,5.914351,-11.240573,72.0,72.0,72.0,74.0,75.0
5,265.753204,2.478883,-10.347278,-3.574333,-4.320143,,-3.326662,-13.197508,23.424267,12.551075,...,-11.571283,7.487204,-5.098366,3.175914,-9.610356,71.0,72.0,74.0,71.0,75.0
6,133.594186,0.497803,-9.389992,-7.230404,-8.852222,12.308068,-5.28838,-9.998207,23.22943,13.491712,...,-15.720359,,-6.235942,4.124001,-16.268568,74.0,72.0,75.0,71.0,75.0
7,252.251096,7.627311,-7.14666,0.244472,0.182987,,-0.395215,-14.423119,,,...,-10.710254,7.884195,-2.402023,,-4.631043,74.0,72.0,73.0,75.0,73.0
8,163.402815,4.981034,-8.719028,-5.484942,-5.919929,9.916674,-4.726807,-10.606375,24.903043,12.151393,...,-11.139651,7.09643,-2.865486,7.464922,-8.441735,,,,,
9,282.511836,0.519655,-14.477831,-2.121282,,12.080162,-6.393721,-11.599432,,11.984299,...,-14.195198,2.409425,-3.523658,2.821658,-14.591262,75.0,72.0,72.0,74.0,71.0


### Dealing with NaN values
#### Instead of filling NaN values with mean of the predictor, use KNN Imputer (Scikit-Learn).
This imputer uses the k-Nearest Neighbors method: each missing value is replaced with the mean of that feature over the `n_neighbors` nearest neighbors found in the training set. By default, it uses a Euclidean distance metric to find those neighbors.

Another critical point is that the KNN Imputer is a distance-based imputation method, so it requires us to normalize our data first. Otherwise, the different scales of the variables will lead the KNN Imputer to generate biased replacements for the missing values. For simplicity, we will use Scikit-Learn’s MinMaxScaler, which scales each variable to values between 0 and 1.

We are setting the parameter ‘n_neighbors’ as 5. So, the missing values will be replaced by the mean value of 5 nearest neighbors measured by Euclidean distance.

```python
scaler = MinMaxScaler()
df_ = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)

imputer = KNNImputer(n_neighbors=5)
df_ = pd.DataFrame(imputer.fit_transform(df_), columns = df.columns)
df_.head(10)
```

#### Pipeline:
1. Splitting the data
```python
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)
```
2. for continuous variables (X_cont) StandardScaler() followed by KNNImputer(n_neighbors=5) 
```python
# standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# apply KNNImputer
imputer = KNNImputer(n_neighbors=5) 
X_train_imputed = imputer.fit_transform(X_train_scaled)
X_test_imputed = imputer.transform(X_test_scaled)
```
3. for categorical variables (X_cat) SimpleImputer(strategy="most_frequent") followed by 1-hot-encoding
```python
# impute categorical values using SimpleImputer
cat_imputer = SimpleImputer(strategy="most_frequent")
X_train_imputed = cat_imputer.fit_transform(X_train)
X_test_imputed = cat_imputer.transform(X_test)
```

In [3]:
# Preprocessing pipeline:
#   1. split into train/test,
#   2. continuous block: StandardScaler -> KNNImputer,
#   3. categorical block: most-frequent imputation (+ one-hot encoding),
#   4. assemble one modelling frame per model family (regression / trees).
# Every transformer is fitted on the training split only and then applied
# to the test split.

# Number of leading predictor columns treated as continuous; all remaining
# predictor columns are treated as categorical.
N_CONTINUOUS = 95


def _as_frame(values, like):
    """Wrap a transformer's array output in a DataFrame.

    Parameters
    ----------
    values : array-like
        Output of a `fit_transform` / `transform` call.
    like : pandas.DataFrame
        Frame whose column labels and row index are reused for the result.

    Returns
    -------
    pandas.DataFrame
        `values` labelled with the columns and index of `like`.
    """
    return pd.DataFrame(values, columns=like.columns, index=like.index)


# --- Splitting the data ---
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

# --- Positional split into continuous / categorical blocks ---
Xtrain_cont = Xtrain.iloc[:, :N_CONTINUOUS]
Xtrain_cat = Xtrain.iloc[:, N_CONTINUOUS:]
Xtest_cont = Xtest.iloc[:, :N_CONTINUOUS]
Xtest_cat = Xtest.iloc[:, N_CONTINUOUS:]

# Guard the positional split: the data preview shows continuous predictors
# named "x_.." and categorical predictors named "C_..". If the column layout
# ever shifts, fail loudly instead of silently mixing the two blocks.
_misplaced = [c for c in Xtrain_cont.columns if not str(c).startswith("x_")]
_misplaced += [c for c in Xtrain_cat.columns if not str(c).startswith("C_")]
assert not _misplaced, (
    f"Column layout does not match N_CONTINUOUS={N_CONTINUOUS}: {_misplaced}"
)

# --- Scaling Continuous Variables ---
# KNN imputation is distance based, so the features are put on a common scale first.
standard_scaler = StandardScaler()
Xtrain_cont_scaled = standard_scaler.fit_transform(Xtrain_cont)
Xtest_cont_scaled = standard_scaler.transform(Xtest_cont)

# --- KNN Imputation for Continuous Variables ---
knn_imput = KNNImputer(n_neighbors=5)
X_train_cont_imputed = _as_frame(knn_imput.fit_transform(Xtrain_cont_scaled), Xtrain_cont)
X_test_cont_imputed = _as_frame(knn_imput.transform(Xtest_cont_scaled), Xtest_cont)

# --- Imputation for Categorical Variables ---
simple_imput = SimpleImputer(strategy="most_frequent")
Xtrain_cat_imputed = _as_frame(simple_imput.fit_transform(Xtrain_cat), Xtrain_cat)
Xtest_cat_imputed = _as_frame(simple_imput.transform(Xtest_cat), Xtest_cat)

# --- 1 HOT encoding ---
Xtrain_cat_imputed1HOT = pd.get_dummies(
    Xtrain_cat_imputed, columns=Xtrain_cat_imputed.columns, drop_first=False
).astype(int)
Xtest_cat_imputed1HOT = pd.get_dummies(
    Xtest_cat_imputed, columns=Xtest_cat_imputed.columns, drop_first=False
).astype(int)
# Ensure that both train and test have the same columns: levels seen only in
# the test split are dropped, levels missing from it are added as all-zero columns.
Xtest_cat_imputed1HOT = Xtest_cat_imputed1HOT.reindex(
    columns=Xtrain_cat_imputed1HOT.columns, fill_value=0
)

# --- for Regression models: continuous + one-hot encoded categoricals ---
Xtrain_regression = pd.concat([X_train_cont_imputed, Xtrain_cat_imputed1HOT], axis=1)
Xtest_regression = pd.concat([X_test_cont_imputed, Xtest_cat_imputed1HOT], axis=1)

# The target is joined on the row index preserved from the split.
df_train_regression = pd.concat([ytrain, Xtrain_regression], axis=1)
df_test_regression = pd.concat([ytest, Xtest_regression], axis=1)

# --- for Trees: continuous + imputed (not one-hot encoded) categoricals ---
Xtrain_tree = pd.concat([X_train_cont_imputed, Xtrain_cat_imputed], axis=1)
Xtest_tree = pd.concat([X_test_cont_imputed, Xtest_cat_imputed], axis=1)

df_train_tree = pd.concat([ytrain, Xtrain_tree], axis=1)
df_test_tree = pd.concat([ytest, Xtest_tree], axis=1)

# Backward-compatible names: these variables end up holding the tree feature
# matrices. Prefer the explicit *_regression / *_tree names in new code.
Xtrain_final, Xtest_final = Xtrain_tree, Xtest_tree

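Save the prepared train and test sets to separate CSV files: `Xytrain_regression.csv` / `Xytest_regression.csv` (one-hot encoded, for regression models) and `Xytrain_tree.csv` / `Xytest_tree.csv` (for tree models).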

In [5]:
# Write each prepared dataset to its own CSV file in the working directory,
# regression frames first, then tree frames (default `to_csv` options).
for csv_name, prepared in {
    "Xytrain_regression.csv": df_train_regression,
    "Xytest_regression.csv": df_test_regression,
    "Xytrain_tree.csv": df_train_tree,
    "Xytest_tree.csv": df_test_tree,
}.items():
    prepared.to_csv(csv_name)