## Handling Missing Values

In [1]:
import pandas as pd
purchase=pd.read_csv('DataSet/purchase_data.csv')
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,,Yes


In [2]:
purchase.fillna(0)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,0.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,0.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,0.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,0.0,Yes


In [3]:
purchase.fillna(method='ffill')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,44.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,35.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,83000.0,Yes


In [4]:
purchase.fillna(method='bfill')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,30.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,58000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,48.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,,Yes


In [5]:
purchase.interpolate()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,37.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,59500.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,41.5,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,83000.0,Yes


In [6]:
purchase.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No


In [7]:
purchase.dropna(how='all')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,,Yes


In [8]:
purchase.dropna(how='any')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No


## Handling Missing Values using Scikit-Learn

In [9]:
from sklearn.preprocessing import Imputer
data=Imputer(missing_values = 'NaN', strategy = "mean", axis = 0)




In [10]:
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,,Yes


In [11]:
data=data.fit(purchase.iloc[:,1:3])

In [12]:
purchase.iloc[:,1:3]=data.transform(purchase.iloc[:,1:3])
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,40.25,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63375.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.25,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,63375.0,Yes


In [13]:
purchase.fillna(purchase.mean())

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,40.25,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63375.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.25,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,63375.0,Yes


In [14]:
purchase.fillna(purchase.median())

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,40.25,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63375.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.25,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,63375.0,Yes


## Categorical Data

In [15]:
purchase 

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,40.25,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63375.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,40.25,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,63375.0,Yes


In [16]:
dummy=pd.get_dummies(purchase.Country)
dummy

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [17]:
merge=pd.concat([purchase,dummy],axis=1)
merge

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,No,1,0,0
1,Spain,40.25,48000.0,Yes,0,0,1
2,Germany,30.0,54000.0,No,0,1,0
3,Spain,38.0,61000.0,No,0,0,1
4,Germany,40.0,63375.0,Yes,0,1,0
5,France,35.0,58000.0,Yes,1,0,0
6,Spain,40.25,52000.0,No,0,0,1
7,France,48.0,79000.0,Yes,1,0,0
8,Germany,50.0,83000.0,No,0,1,0
9,France,37.0,63375.0,Yes,1,0,0


In [18]:
 pd.get_dummies(purchase, columns = ['Country'], drop_first = True)

Unnamed: 0,Age,Salary,Purchased,Country_Germany,Country_Spain
0,44.0,72000.0,No,0,0
1,40.25,48000.0,Yes,0,1
2,30.0,54000.0,No,1,0
3,38.0,61000.0,No,0,1
4,40.0,63375.0,Yes,1,0
5,35.0,58000.0,Yes,0,0
6,40.25,52000.0,No,0,1
7,48.0,79000.0,Yes,0,0
8,50.0,83000.0,No,1,0
9,37.0,63375.0,Yes,0,0


In [19]:
a= merge.drop(['Country','France'], axis = 1)
a

Unnamed: 0,Age,Salary,Purchased,Germany,Spain
0,44.0,72000.0,No,0,0
1,40.25,48000.0,Yes,0,1
2,30.0,54000.0,No,1,0
3,38.0,61000.0,No,0,1
4,40.0,63375.0,Yes,1,0
5,35.0,58000.0,Yes,0,0
6,40.25,52000.0,No,0,1
7,48.0,79000.0,Yes,0,0
8,50.0,83000.0,No,1,0
9,37.0,63375.0,Yes,0,0


## Dummy Variables using LabelEncoder

In [40]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [63]:
d=purchase
d.Country=le.fit_transform(purchase.Country)
d.Purchased=le.fit_transform(purchase.Purchased)
d

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,40.25,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63375.0,1
5,0,35.0,58000.0,1
6,2,40.25,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,63375.0,1


In [64]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features = [0])
x = ohe.fit_transform(purchase).toarray() 
x

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.0000e+00, 0.0000e+00, 0.0000e+00, 4.4000e+01, 7.2000e+04,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, 4.0250e+01, 4.8000e+04,
        1.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00, 3.0000e+01, 5.4000e+04,
        0.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, 3.8000e+01, 6.1000e+04,
        0.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00, 4.0000e+01, 6.3375e+04,
        1.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, 3.5000e+01, 5.8000e+04,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 1.0000e+00, 4.0250e+01, 5.2000e+04,
        0.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, 4.8000e+01, 7.9000e+04,
        1.0000e+00],
       [0.0000e+00, 1.0000e+00, 0.0000e+00, 5.0000e+01, 8.3000e+04,
        0.0000e+00],
       [1.0000e+00, 0.0000e+00, 0.0000e+00, 3.7000e+01, 6.3375e+04,
        1.0000e+00]])

## Splitting the data 

In [44]:
purchase

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,40.25,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63375.0,Yes
5,0,35.0,58000.0,Yes
6,2,40.25,52000.0,No
7,0,48.0,79000.0,Yes
8,1,50.0,83000.0,No
9,0,37.0,63375.0,Yes


In [47]:
 X=purchase.iloc[:,0:3] #input variables
X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,40.25,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63375.0
5,0,35.0,58000.0
6,2,40.25,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,63375.0


In [49]:
Y=purchase.Purchased
Y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [50]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size= 0.2, random_state = 0)

In [51]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(8, 3)
(2, 3)
(8,)
(2,)


## Feature Scaling

In [65]:
from sklearn.preprocessing import scale
A = scale(purchase)
A

  


array([[-1.08347268,  0.66551738,  0.7900303 , -1.        ],
       [ 1.32424438,  0.        , -1.40831488,  1.        ],
       [ 0.12038585, -1.81908084, -0.85872859, -1.        ],
       [ 1.32424438, -0.39931043, -0.21754458, -1.        ],
       [ 0.12038585, -0.04436783,  0.        ,  1.        ],
       [-1.08347268, -0.93172433, -0.49233772,  1.        ],
       [ 1.32424438,  0.        , -1.04192402, -1.        ],
       [-1.08347268,  1.37540259,  1.43121431,  1.        ],
       [ 0.12038585,  1.73034519,  1.79760518, -1.        ],
       [-1.08347268, -0.57678173,  0.        ,  1.        ]])

In [66]:
A=pd.DataFrame(A)
A

Unnamed: 0,0,1,2,3
0,-1.083473,0.665517,0.79003,-1.0
1,1.324244,0.0,-1.408315,1.0
2,0.120386,-1.819081,-0.858729,-1.0
3,1.324244,-0.39931,-0.217545,-1.0
4,0.120386,-0.044368,0.0,1.0
5,-1.083473,-0.931724,-0.492338,1.0
6,1.324244,0.0,-1.041924,-1.0
7,-1.083473,1.375403,1.431214,1.0
8,0.120386,1.730345,1.797605,-1.0
9,-1.083473,-0.576782,0.0,1.0


In [67]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
B = sc.fit_transform(purchase)
B

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([[-1.08347268,  0.66551738,  0.7900303 , -1.        ],
       [ 1.32424438,  0.        , -1.40831488,  1.        ],
       [ 0.12038585, -1.81908084, -0.85872859, -1.        ],
       [ 1.32424438, -0.39931043, -0.21754458, -1.        ],
       [ 0.12038585, -0.04436783,  0.        ,  1.        ],
       [-1.08347268, -0.93172433, -0.49233772,  1.        ],
       [ 1.32424438,  0.        , -1.04192402, -1.        ],
       [-1.08347268,  1.37540259,  1.43121431,  1.        ],
       [ 0.12038585,  1.73034519,  1.79760518, -1.        ],
       [-1.08347268, -0.57678173,  0.        ,  1.        ]])

In [68]:
B=pd.DataFrame(B)
B

Unnamed: 0,0,1,2,3
0,-1.083473,0.665517,0.79003,-1.0
1,1.324244,0.0,-1.408315,1.0
2,0.120386,-1.819081,-0.858729,-1.0
3,1.324244,-0.39931,-0.217545,-1.0
4,0.120386,-0.044368,0.0,1.0
5,-1.083473,-0.931724,-0.492338,1.0
6,1.324244,0.0,-1.041924,-1.0
7,-1.083473,1.375403,1.431214,1.0
8,0.120386,1.730345,1.797605,-1.0
9,-1.083473,-0.576782,0.0,1.0
