# Data Preprocessing

<img src="https://cdn-blog.scalablepath.com/uploads/2021/11/900-x-615-1.png" width=550>

# 1. Data preparation on the test data set:

### 1.1. Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

### 1.2. Creating a test data set

In [2]:
data = pd.DataFrame(
    {
        "Region": ["India", "Brazil", "USA", "Brazil", "USA", "India", "Brazil", "India", "USA", "India"],
        "Age": [49, 32, 35, 43, 45, 40, np.nan, 53, 55, 42],
        "Income": [86400, 57600, 64800, 73200, np.nan, 69600, 62400, 94800, 99600, 80400],
        "Online Shopper": ["No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes"]
    }
)

In [3]:
data

Unnamed: 0,Region,Age,Income,Online Shopper
0,India,49.0,86400.0,No
1,Brazil,32.0,57600.0,Yes
2,USA,35.0,64800.0,No
3,Brazil,43.0,73200.0,No
4,USA,45.0,,Yes
5,India,40.0,69600.0,Yes
6,Brazil,,62400.0,No
7,India,53.0,94800.0,Yes
8,USA,55.0,99600.0,No
9,India,42.0,80400.0,Yes


### 1.3. separating dependent and independent variables

In [4]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

### 1.4. Handling Missing Values

In [5]:
si = SimpleImputer(missing_values=np.nan, strategy='mean')
si.fit(X[:, 1:])
X[:, 1:] = si.transform(X[:, 1:])

### 1.5. Encoding Categorical Values

In [6]:
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])

In [7]:
ohe = OneHotEncoder()
ct = ColumnTransformer(
    [('country', OneHotEncoder(), [0])], remainder='passthrough'
)

In [8]:
X = ct.fit_transform(X)

### 1.6. Scaling and standardizaation

In [9]:
ss = StandardScaler()
X = ss.fit_transform(X)

In [10]:
X

array([[-0.65465367,  1.22474487, -0.65465367,  0.75887436,  0.74947325],
       [ 1.52752523, -0.81649658, -0.65465367, -1.71150388, -1.43817841],
       [-0.65465367, -0.81649658,  1.52752523, -1.27555478, -0.89126549],
       [ 1.52752523, -0.81649658, -0.65465367, -0.11302384, -0.25320042],
       [-0.65465367, -0.81649658,  1.52752523,  0.17760889,  0.        ],
       [-0.65465367,  1.22474487, -0.65465367, -0.54897294, -0.52665688],
       [ 1.52752523, -0.81649658, -0.65465367,  0.        , -1.0735698 ],
       [-0.65465367,  1.22474487, -0.65465367,  1.34013983,  1.38753832],
       [-0.65465367, -0.81649658,  1.52752523,  1.63077256,  1.75214693],
       [-0.65465367,  1.22474487, -0.65465367, -0.25834021,  0.29371249]])

# 2. Data preparation on the test data set:

### 2.1. Import Necessary Libraries

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

### 2.1. Read Data


> The datasets consist of several medical predictor (independent) variables and one target (dependent) variable, Outcome. Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
- **Pregnancies**: Number of times pregnant
- **Glucose**: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
- **BloodPressure**: Diastolic blood pressure (mm Hg)
- **SkinThickness**: Triceps skin fold thickness (mm)
- **Insulin**: 2-Hour serum insulin (mu U/ml)
- **BMI**: Body mass index (weight in kg/(height in m)^2)
- **DiabetesPedigreeFunction**: Diabetes pedigree function
- **Age**: Age (years)
- **Outcome**: Class variable (0 or 1) 268 of 768 are 1, the others are 0

In [12]:
pima_df = pd.read_csv('diabetes.csv')

In [13]:
pima_df.sample(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
382,1,109,60,8,182,25.4,0.947,21,0
164,0,131,88,0,0,31.6,0.743,32,1
737,8,65,72,23,0,32.0,0.6,42,0


### 2.2. Check dtypes

In [14]:
pima_df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

### 2.3. Detect and count missing values

In [15]:
# isna() and DataFrame. isnull()
# There is absolutely no difference - 
# the source code reveals that their implementations are exactly the same.
# Both are used to check for missing values ( NaN ).

pima_df.isna().sum()
pima_df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### 2.4. Replace zero with specific values

In [16]:
pima_df.describe().style.background_gradient(cmap='YlOrRd')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


#### 2.4.1. Mean

In [17]:
pima_df[pima_df['SkinThickness'] == 0].head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
2,8,183,64,0,0,23.3,0.672,32,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0


In [18]:
pima_df[pima_df['SkinThickness'] == 0].shape

(227, 9)

In [19]:
skin_thickness_mean = pima_df[pima_df['SkinThickness'] != 0]['SkinThickness'].mean()
skin_thickness_mean

29.153419593345657

In [20]:
pima_df.replace({'SkinThickness': 0}, skin_thickness_mean, inplace=True)

In [21]:
pima_df[pima_df['Insulin'] == 0].head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,0,33.6,0.627,50,1
1,1,85,66,29.0,0,26.6,0.351,31,0
2,8,183,64,29.15342,0,23.3,0.672,32,1


In [22]:
pima_df[pima_df['Insulin'] == 0].shape

(374, 9)

In [24]:
insulin_mean = pima_df[pima_df['Insulin'] != 0]['Insulin'].mean()
insulin_mean

155.5482233502538

In [27]:
pima_df.replace({'Insulin': 0}, insulin_mean, inplace=True)

In [28]:
pima_df[['SkinThickness', 'Insulin']].describe().style.background_gradient(cmap='YlOrRd')

Unnamed: 0,SkinThickness,Insulin
count,768.0,768.0
mean,29.15342,155.548223
std,8.790942,85.021108
min,7.0,14.0
25%,25.0,121.5
50%,29.15342,155.548223
75%,32.0,155.548223
max,99.0,846.0


### 2.5. KNNImputer

In [50]:
pima_df = pd.read_csv('diabetes.csv')

In [51]:
pima_df.sample(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
310,6,80,66,30,0,26.2,0.313,41,0
109,0,95,85,25,36,37.4,0.247,24,1
181,0,119,64,18,92,34.9,0.725,23,0


In [52]:
pima_df.replace({'SkinThickness': 0, 'Insulin': 0}, np.nan, inplace=True)

In [53]:
pima_df.sample(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
459,9,134,74,33.0,60.0,25.9,0.46,81,0
179,5,130,82,,,39.1,0.956,37,1
494,3,80,0,,,0.0,0.174,22,0


In [54]:
imputer = KNNImputer(n_neighbors=3)
pima_arr = imputer.fit_transform(pima_df)

> #### result not pandas Data Frame, result is numpy ndarray:

In [55]:
pima_arr

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

#### To see the complete array, we can run the following command:
```
import sys
np.set_printoptions(threshhold=sys.max)

```

In [56]:
# import sys
# np.set_printoptions(threshold=sys.maxsize)

In [57]:
# pima_arr

### 2.6. Scaling and standardizaation

In [58]:
scaler = MinMaxScaler(feature_range=(0,1))
pima_arr = scaler.fit_transform(pima_arr)

In [59]:
pima_arr

array([[0.35294118, 0.74371859, 0.59016393, ..., 0.23441503, 0.48333333,
        1.        ],
       [0.05882353, 0.42713568, 0.54098361, ..., 0.11656704, 0.16666667,
        0.        ],
       [0.47058824, 0.91959799, 0.52459016, ..., 0.25362938, 0.18333333,
        1.        ],
       ...,
       [0.29411765, 0.6080402 , 0.59016393, ..., 0.07130658, 0.15      ,
        0.        ],
       [0.05882353, 0.63316583, 0.49180328, ..., 0.11571307, 0.43333333,
        1.        ],
       [0.05882353, 0.46733668, 0.57377049, ..., 0.10119556, 0.03333333,
        0.        ]])

In [60]:
# s_scaler = StandardScaler()
# pima_arr = s_scaler.fit_transform(pima_arr)