In [63]:
import pandas as pd
import numpy as np

import snowflake.connector
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import RobustScaler, OneHotEncoder

In [51]:
from collections import Counter
from imblearn.over_sampling import SMOTENC

In [1]:
import pandas as pd

In [13]:
# Read the iris sample into a DataFrame; header=0 takes column names from the first CSV row.
iris_csv_path = "../../input_data/iris2.csv"
df = pd.read_csv(iris_csv_path, header=0)

In [14]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Id,SepalLength,SepalWidth,PetalLength,PetalWidth,Color,Species
0,1,5.1,3.5,1.4,0.2,Red,setosa
1,2,4.9,3.0,1.4,0.2,Blue,setosa
2,3,4.7,3.2,1.3,0.2,Blue,setosa
3,4,4.6,3.1,1.5,0.2,Red,setosa
4,5,,,1.4,0.2,Blue,setosa


<br/>
<br/>

SEPARATE NUMERICAL AND CATEGORICAL COLUMNS

In [39]:
# Column roles used by the preprocessing steps.
id_columns = ["Id"]  # identifier only, not a feature
categorical_features = ["Color"]
numeric_features = [
    "SepalLength",
    "SepalWidth",
    "PetalLength",
    "PetalWidth",
]

<br/>
<br/>
<br/>
<br/>
<br/>




* ~~X and y~~
* ~~TRAIN TEST SPLIT~~
* ~~IMPUTE~~
* ~~Handling class IMBALANCE~~
* ~~Magical COLUMN TRANSFORMER~~
    * ~~ENCODE and SCALE (cat and num columns) in a single go~~
* MODEL TRAIN 
* PREDICT

<br/>
<br/>


### X and Y

In [36]:
# The target is the species label; the features are every column except the
# identifier column(s) and the target itself.
target_column = 'Species'
y = df[target_column]
X = df.drop(columns=id_columns + [target_column])

<br/>
<br/>

#### Train-Test Split
We split before imputing, scaling, or encoding so that every statistic those steps learn comes from the training set only; this prevents information from the test set leaking into the training data.

In [37]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. With classes this imbalanced,
# consider passing stratify=y so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Processing order for numeric columns: impute > balance > scale   
Processing order for categorical columns: balance > encode 

<br/>
<br/>


### Impute

In [25]:
# Median imputer that treats NaN as the missing-value marker.
imputer = SimpleImputer(
    missing_values=np.nan,
    strategy='median',
)

In [41]:
# Fit the imputer on the training numerics only, then fill their gaps.
# Working on .values means the result is a bare array without row/column labels.
train_numeric_values = X_train[numeric_features].values
X_imputed = imputer.fit_transform(train_numeric_values)

In [47]:
# Wrap the imputed array back into a DataFrame, restoring the training row
# labels and the numeric column names.
X_train_imputed = pd.DataFrame(
    data=X_imputed,
    columns=numeric_features,
    index=X_train.index,
)

In [50]:
# Rebuild the training frame for balancing: imputed numeric columns first,
# followed by the untouched categorical columns.
# Building a new frame from the named feature lists (instead of adding a
# hard-coded "Color" column in place) keeps this cell safe to re-run and in sync
# with `categorical_features`. Both pieces carry X_train's index, so the rows
# line up one-to-one.
X_train_imputed = pd.concat(
    [X_train_imputed[numeric_features], X_train[categorical_features]],
    axis=1,
)
X_train_imputed.head()

Unnamed: 0,SepalLength,SepalWidth,PetalLength,PetalWidth,Color
36,5.5,3.5,1.3,0.2,Red
31,5.4,3.4,1.5,0.4,Yellow
8,4.4,2.9,1.4,0.2,Green
17,5.1,3.5,1.4,0.3,Green
6,4.6,3.4,1.4,0.3,Green


<br/>
<br/>

### Class Imbalance
* Use SMOTE-NC (Synthetic Minority Over-sampling Technique for Nominal and Continuous) when the features mix categorical and continuous columns, as they do here.  
* Use plain SMOTE when all features are continuous.

In [81]:
# Class counts in the training labels before any resampling.
train_class_counts = Counter(y_train)
print('Original dataset samples per class {}'.format(train_class_counts))

Original dataset samples per class Counter({'setosa': 40, 'versicolor': 7})


In [57]:
# SMOTE-NC needs the positions of the categorical columns. Look them up by name
# rather than hard-coding an index, so the sampler stays correct if the column
# layout of X_train_imputed changes.
categorical_positions = [
    X_train_imputed.columns.get_loc(column) for column in categorical_features
]

# sampling_strategy='minority' resamples only the minority class.
sm = SMOTENC(
    categorical_features=categorical_positions,
    sampling_strategy='minority',
    random_state=42,
)

In [59]:
# Oversample the training data; returns the balanced features and labels.
resampled_pair = sm.fit_resample(X_train_imputed, y_train)
X_bal, y_bal = resampled_pair

In [60]:
# Class counts after oversampling, for comparison with the original counts.
balanced_class_counts = Counter(y_bal)
print('Resampled dataset samples per class {}'.format(balanced_class_counts))

Resampled dataset samples per class Counter({'setosa': 40, 'versicolor': 40})


In [62]:
# Shapes after resampling: there are more rows than before because synthetic
# samples were created.
(X_bal.shape, y_bal.shape)

((80, 5), (80,))

<br/>
<br/>


#### Encoding (categorical features) and Scaling (numeric features)

In [64]:
# Scaler for the numeric columns. RobustScaler was picked because this data has
# a lot of outliers; choose the scaler that suits the data you have.
rob_scaler = RobustScaler()

# One-hot encoder for the categorical columns.
oh_encoder = OneHotEncoder()

In [65]:
from sklearn.compose import ColumnTransformer

In [77]:
# Route each column group to its transformer by position: the categorical column
# sits at index 4 and the numeric columns at indices 0-3.
# With a DataFrame input, the name lists (categorical_features and
# numeric_features) can be used instead of positions.
categorical_step = ('ohe', oh_encoder, [4])
numeric_step = ('sca', rob_scaler, [0, 1, 2, 3])

column_transformer = ColumnTransformer(
    transformers=[categorical_step, numeric_step],
)

In [78]:
# Learn the encoding and scaling parameters from the balanced training data and
# apply them in one pass.
X_final = column_transformer.fit_transform(X=X_bal)

In [80]:
# Expect 8 columns: 4 one-hot columns (one per category) plus the 4 scaled
# numeric columns.
X_final.shape

(80, 8)

<br/>
<br/>

PREPROCESSING COMPLETE !!  👻

<br/>
<br/>


In [84]:
# Sanity check: the balanced input itself still has its original five columns.
X_bal.shape

(80, 5)