# Data Pre-processing

# Step 1: Import the libraries

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Step 2: Import dataset

In [18]:
dataset = pd.read_csv('Day3.csv')
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
#Independent variable
X = dataset.iloc[:,:-1].values

#Dependent variable
y = dataset.iloc[:,3].values

In [20]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [21]:
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Step 3: Handling the missing data

In [24]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])


In [25]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# Step 4: Encoding categorical data

In [26]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])

In [27]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [28]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
columntransformer = ColumnTransformer([('encoder', OneHotEncoder(),[0])], remainder='passthrough')
X = np.array(columntransformer.fit_transform(X),dtype=np.str)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  X = np.array(columntransformer.fit_transform(X),dtype=np.str)


In [29]:
X

array([['1.0', '0.0', '0.0', '44.0', '72000.0'],
       ['0.0', '0.0', '1.0', '27.0', '48000.0'],
       ['0.0', '1.0', '0.0', '30.0', '54000.0'],
       ['0.0', '0.0', '1.0', '38.0', '61000.0'],
       ['0.0', '1.0', '0.0', '40.0', '63777.77777777778'],
       ['1.0', '0.0', '0.0', '35.0', '58000.0'],
       ['0.0', '0.0', '1.0', '38.77777777777778', '52000.0'],
       ['1.0', '0.0', '0.0', '48.0', '79000.0'],
       ['0.0', '1.0', '0.0', '50.0', '83000.0'],
       ['1.0', '0.0', '0.0', '37.0', '67000.0']], dtype='<U17')

# Step 5: Splitting the dataset into training sets and test sets

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=1/4, random_state=0)

In [31]:
X_train

array([['1.0', '0.0', '0.0', '37.0', '67000.0'],
       ['0.0', '0.0', '1.0', '27.0', '48000.0'],
       ['0.0', '0.0', '1.0', '38.77777777777778', '52000.0'],
       ['1.0', '0.0', '0.0', '48.0', '79000.0'],
       ['0.0', '0.0', '1.0', '38.0', '61000.0'],
       ['1.0', '0.0', '0.0', '44.0', '72000.0'],
       ['1.0', '0.0', '0.0', '35.0', '58000.0']], dtype='<U17')

In [32]:
X_test

array([['0.0', '1.0', '0.0', '30.0', '54000.0'],
       ['0.0', '1.0', '0.0', '50.0', '83000.0'],
       ['0.0', '1.0', '0.0', '40.0', '63777.77777777778']], dtype='<U17')

In [36]:
y_train

array(['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes'], dtype=object)

In [37]:
y_test

array(['No', 'No', 'Yes'], dtype=object)

# Step 6: Feature Scaling

In [33]:
from sklearn.preprocessing import StandardScaler
scale  = StandardScaler()
X_train = scale.fit_transform(X_train)
X_test = scale.fit_transform(X_test)

In [34]:
X_train

array([[ 0.8660254 ,  0.        , -0.8660254 , -0.2029809 ,  0.44897083],
       [-1.15470054,  0.        ,  1.15470054, -1.82168936, -1.41706417],
       [-1.15470054,  0.        ,  1.15470054,  0.08478949, -1.0242147 ],
       [ 0.8660254 ,  0.        , -0.8660254 ,  1.5775984 ,  1.62751925],
       [-1.15470054,  0.        ,  1.15470054, -0.04111006, -0.14030338],
       [ 0.8660254 ,  0.        , -0.8660254 ,  0.93011502,  0.94003267],
       [ 0.8660254 ,  0.        , -0.8660254 , -0.52672259, -0.43494049]])

In [35]:
X_test

array([[ 0.        ,  0.        ,  0.        , -1.22474487, -1.07298811],
       [ 0.        ,  0.        ,  0.        ,  1.22474487,  1.33431759],
       [ 0.        ,  0.        ,  0.        ,  0.        , -0.26132948]])

In [38]:
y_train

array(['Yes', 'Yes', 'No', 'Yes', 'No', 'No', 'Yes'], dtype=object)

In [39]:
y_test

array(['No', 'No', 'Yes'], dtype=object)