# Preporcessing data
Dealing with categorical features and convert to numerical

In [1]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot') # style used

## Using dummy variables

In [2]:
df = pd.read_csv('data/csv/auto.csv')
df.head()

Unnamed: 0,mpg,displ,hp,weight,accel,origin,size
0,18.0,250.0,88,3139,14.5,US,15.0
1,9.0,304.0,193,4732,18.5,US,20.0
2,36.1,91.0,60,1800,16.4,Asia,10.0
3,18.5,250.0,98,3525,19.0,US,15.0
4,34.3,97.0,78,2188,15.8,Europe,10.0


In [3]:
# Dummy variables from origin column and drop first category
df_origin = pd.get_dummies(df, drop_first=True)
df_origin.head()

Unnamed: 0,mpg,displ,hp,weight,accel,size,origin_Europe,origin_US
0,18.0,250.0,88,3139,14.5,15.0,0,1
1,9.0,304.0,193,4732,18.5,20.0,0,1
2,36.1,91.0,60,1800,16.4,10.0,0,0
3,18.5,250.0,98,3525,19.0,15.0,0,1
4,34.3,97.0,78,2188,15.8,10.0,1,0


## Linear Regression with Dummy Variables

In [4]:
X = df_origin.drop(['origin_Europe', 'origin_US'], axis=1).values
y = df_origin[['origin_Europe', 'origin_US']].values
# y = y.reshape(-1, 1)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

param_grid = {'alpha': np.linspace(0, 0.99, 11)}

# initialize ridge
ridge = Ridge()

# performs CV with grid search
ridge_cv = GridSearchCV(ridge, param_grid, cv=5)

# fit the data
ridge_cv.fit(X_train, y_train)

# best parameters used
print(ridge_cv.best_params_)

# best score
print(ridge_cv.best_score_)

{'alpha': 0.99}
0.27286721434603184


## Handling missing data

In [6]:
# load dataset
filepath = 'data/csv/diabetes.csv'
diabetes_df = pd.read_csv(filepath)
diabetes_df.head()
col = diabetes_df.columns

In [7]:
# replace 0 values with NaN
diabetes_df.insulin.replace(0, np.nan, inplace=True)
diabetes_df.triceps.replace(0, np.nan, inplace=True)
diabetes_df.bmi.replace(0, np.nan, inplace=True)
diabetes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      541 non-null    float64
 4   insulin      394 non-null    float64
 5   bmi          757 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 54.1 KB


In [8]:
X = diabetes_df.drop('diabetes', axis=1) # features
y = diabetes_df['diabetes'] # target

## Imputing with pipeline

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# create logistic regression
logreg = LogisticRegression()

# replace missing value with data mean
# create an imputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# create a pipeline object
steps = [('imputation', imp), ('logistic_regression', logreg)]
pipeline = Pipeline(steps)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# fit data into pipeline
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
pipeline.score(X_test, y_test)

0.7402597402597403

## Centering and Scaling data

In [10]:
imp.fit(diabetes_df)
diabetes_df = imp.transform(diabetes_df)

In [11]:
diabetes_df = pd.DataFrame(diabetes_df, columns=col)

In [12]:
diabetes_df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6.0,148.0,72.0,35.0,155.548223,33.6,0.627,50.0,1.0
1,1.0,85.0,66.0,29.0,155.548223,26.6,0.351,31.0,0.0
2,8.0,183.0,64.0,29.15342,155.548223,23.3,0.672,32.0,1.0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


In [13]:
diabetes_df.describe()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,62.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [14]:
X = diabetes_df.drop('diabetes', axis=1) # features
y = diabetes_df['diabetes'] # target

In [15]:
# import library
from sklearn.preprocessing import scale
X_scaled = scale(X)

In [16]:
print(np.mean(X))
print(np.std(X))

pregnancies      3.845052
glucose        120.894531
diastolic       69.105469
triceps         29.153420
insulin        155.548223
bmi             32.457464
dpf              0.471876
age             33.240885
dtype: float64
pregnancies     3.367384
glucose        31.951796
diastolic      19.343202
triceps         8.785217
insulin        84.965737
bmi             6.870674
dpf             0.331113
age            11.752573
dtype: float64


In [17]:
print(np.mean(X_scaled))
print(np.std(X_scaled))

1.4976446009266434e-16
1.0


## Scaling in a pipeline

In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# create a pipeline object
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

knn_scaled = pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
knn_scaled.score(X_test, y_test)

0.7272727272727273

In [19]:
# without scaling
knn_unscaled = KNeighborsClassifier().fit(X_train, y_train)
knn_unscaled.score(X_test, y_test)

0.6753246753246753

## Hyperparameters CV and Scaling in a pipeline

In [36]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# create a pipeline object
steps = [('scaler', StandardScaler()), ('knn', KNeighborsClassifier())]
pipeline = Pipeline(steps)

# hyperparameters
parameters = {'knn__n_neighbors': np.arange(1,50)}

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 21)

# create cross validating
cv = GridSearchCV(pipeline, parameters)
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

# best parameters used
print(cv.best_params_)

# best score
print(cv.best_score_)

# classification report
print(classification_report(y_test, y_pred))

{'knn__n_neighbors': 33}
0.7686125549780088
              precision    recall  f1-score   support

         0.0       0.69      0.90      0.78        94
         1.0       0.70      0.35      0.47        60

    accuracy                           0.69       154
   macro avg       0.69      0.63      0.62       154
weighted avg       0.69      0.69      0.66       154

