# data preprocessing

In [2]:
# using sklearn==0.18.2 and scipy==1.2.3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [2]:
# Load the dataset; 'Data.csv' is resolved relative to the current working directory.
data_set = pd.read_csv('Data.csv')

In [3]:
# Display the loaded DataFrame to inspect the columns and spot missing values.
data_set

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## `iloc` uses the same positional slicing syntax as NumPy

In [4]:
# Feature matrix: every row, every column except the last one, as a NumPy array.
X = data_set.iloc[:, :-1].values

In [5]:
# Display the feature matrix; missing values have not been handled yet.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
# Snapshot X into a separate array object (np.array copies by default).
# Xx is not read again before it is rebound further down in this notebook.
Xx = np.array(X)

In [7]:
# Target: the last column only. Slicing with -1: (rather than -1) keeps the
# result two-dimensional, i.e. one column with a row per sample.
Y = data_set.iloc[:, -1:].values

In [8]:
# Display the target column (still 2-D because of the -1: slice).
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

## managing the NaNs

In [9]:
from sklearn.preprocessing import Imputer

# Replace each NaN with the mean of its column.
#   axis:     0 imputes along columns, 1 along rows
#   strategy: 'mean', 'median' or 'most_frequent'
imputer = Imputer(
    missing_values=np.nan,
    strategy='mean',
    axis=0,
)

In [10]:
# Learn the imputation statistics from every column after column 0
# (column 0 holds the country strings and is excluded).
imputer = imputer.fit(X[:, 1:])

In [11]:
# Overwrite the same columns of X in place with their imputed values.
X[:, 1:] = imputer.transform(X[:, 1:])

In [12]:
# Re-copy X now that the missing values have been filled, and display the copy.
Xx = np.array(X)
Xx

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## categorical data

In [13]:
from sklearn.preprocessing import LabelEncoder  as le

In [14]:
# Encoder instance for the country column of X (le is the LabelEncoder alias).
laben_X = le()

In [15]:
# Peek at the country column before it is encoded.
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

In [16]:
# Swap the country names in column 0 for integer codes, in place.
# With this data the mapping is France -> 0, Germany -> 1, Spain -> 2.
X[:, 0] = laben_X.fit_transform(X[:, 0])

In [17]:
# Display X after label encoding: column 0 now holds integer codes.
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

## Dummy variables - expand each category value into its own 0/1 column

### Needed because ML algorithms may otherwise read the integer codes as an ordering (e.g. Spain > France)

In [18]:
from sklearn.preprocessing import OneHotEncoder  as ohe

In [19]:
# One-hot encode only column 0 (the label-encoded country column).
# NOTE(review): `categorical_features` is available in the scikit-learn version pinned
# at the top of this notebook but was removed in later releases — confirm before upgrading.
OHE_x = ohe(categorical_features=[0])

In [20]:
# Fit and apply the one-hot encoder, then convert the result to a dense ndarray.
# In the resulting matrix the three dummy columns come first, followed by age and salary.
X = OHE_x.fit_transform(X).toarray()

In [21]:
# Display the dense, one-hot-encoded feature matrix.
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

## categorical data  in Y

In [22]:
# Separate LabelEncoder instance, named for use on the target Y.
laben_y = le()

In [23]:
# Display the target before encoding (string labels).
Y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [24]:
# Encode the target with its own encoder (laben_y). Re-fitting laben_X here would
# overwrite the country mapping it learned earlier and leave laben_y unused.
# Y is a single-column 2-D array at this point, so it is flattened to 1-D,
# the shape LabelEncoder expects (this avoids the column_or_1d conversion warning).
Y = laben_y.fit_transform(Y.ravel())

  y = column_or_1d(y, warn=True)


In [25]:
# Display the integer-encoded target.
Y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int32)

# train test split

In [26]:
# sklearn.cross_validation is deprecated as of scikit-learn 0.18 and removed in 0.20;
# train_test_split is provided by sklearn.model_selection (available since 0.18).
from sklearn.model_selection import train_test_split



In [27]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Y_train

# Feature Scaling

### variables need to be on the same scale so that no one variable dominates over other 
#### standardisation => (x-avg(x))/std(x)  ; result has mean 0 and standard deviation 1 (values are not confined to [-1, 1])
#### normalisation (min-max scaling) => (x-min(x))/(max(x)-min(x)) ; range- [0,1]

In [34]:
from sklearn.preprocessing import StandardScaler as stdsc

In [35]:
# Scaler instance for the feature matrices (stdsc is the StandardScaler alias).
sc_X =  stdsc()

In [36]:
# Fit the scaler on the training set only, then apply those same training
# statistics (mean / std) to the test set. Re-fitting on X_test would scale
# the two sets with different parameters and leak test-set information into
# the preprocessing step.
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)