# Data Preprocessing 

## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the Dataset

In [2]:
dataset = pd.read_csv("Data.csv")

In [3]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Taking care of Missing data

In [4]:
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].mean())
dataset['Salary'] = dataset['Salary'].fillna(dataset['Salary'].mean())

In [5]:
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Encoding Categorical data

In [6]:
dataset = pd.get_dummies(dataset,columns = ["Country"])
dataset = dataset.replace({'Purchased' : {'Yes' : 1 , 'No' : 0}})

In [7]:
dataset

Unnamed: 0,Age,Salary,Purchased,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.777778,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


In [8]:
x = dataset.drop('Purchased',axis = 1)

In [9]:
x

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,0,1,0
5,35.0,58000.0,1,0,0
6,38.777778,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [10]:
y = dataset['Purchased']

In [11]:
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int64

## Splitting the dataset into the Training set and Test set

In [12]:
from sklearn.model_selection import train_test_split
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2,random_state = 1)

In [13]:
x_train

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
6,38.777778,52000.0,0,0,1
4,40.0,63777.777778,0,1,0
0,44.0,72000.0,1,0,0
3,38.0,61000.0,0,0,1
1,27.0,48000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
5,35.0,58000.0,1,0,0


In [14]:
y_train

6    0
4    1
0    0
3    0
1    1
7    1
8    0
5    1
Name: Purchased, dtype: int64

In [15]:
x_test

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
2,30.0,54000.0,0,1,0
9,37.0,67000.0,1,0,0


In [16]:
y_test

2    0
9    1
Name: Purchased, dtype: int64

## Feature Scaling

In [17]:
x_train = (x_train-np.mean(x_train))/np.std(x_train)
x_test = (x_test-np.mean(x_test))/np.std(x_test)

In [18]:
x_train

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
6,-0.191592,-1.078126,-0.774597,-0.57735,1.290994
4,-0.014117,-0.070132,-0.774597,1.732051,-0.774597
0,0.566709,0.633562,1.290994,-0.57735,-0.774597
3,-0.30453,-0.307866,-0.774597,-0.57735,1.290994
1,-1.901801,-1.420464,-0.774597,-0.57735,1.290994
7,1.147534,1.232653,1.290994,-0.57735,-0.774597
8,1.437947,1.574991,-0.774597,1.732051,-0.774597
5,-0.74015,-0.564619,1.290994,-0.57735,-0.774597


In [19]:
x_test

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
2,-1.0,-1.0,-1.0,1.0,
9,1.0,1.0,1.0,-1.0,
