# Data Preprocessing

## Importing Data

In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Colab-only cell: prompts for a local file and saves it into the runtime's
# working directory (the output below shows Data.csv being saved).
# The call's return value, a dict mapping each filename to its raw bytes, is
# echoed because it is the last expression in the cell.
from google.colab import files
files.upload()

Saving Data.csv to Data.csv


{'Data.csv': b'Country,Age,Salary,Purchased\r\nFrance,44,72000,No\r\nSpain,27,48000,Yes\r\nGermany,30,54000,No\r\nSpain,38,61000,No\r\nGermany,40,,Yes\r\nFrance,35,58000,Yes\r\nSpain,,52000,No\r\nFrance,48,79000,Yes\r\nGermany,50,83000,No\r\nFrance,37,67000,Yes'}

In [25]:
# Load the uploaded CSV into a DataFrame. The empty Age/Salary fields in the
# file show up as NaN in the printed frame.
data = pd.read_csv('Data.csv')
print(data)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [26]:

# X: every column except the last (Country, Age, Salary) as a NumPy array.
# y: the last column (Purchased) as a NumPy array.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values


## Handling Missing Data

- We will replace each missing value with the mean of its column (the `mean` strategy of `SimpleImputer`).

In [17]:
from sklearn.impute import SimpleImputer

# Fill every NaN in columns 1-2 (Age, Salary) with the mean of that column.
# NOTE(review): the means are learned from all rows, including the rows that
# later end up in the test split.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

## Encode categorical variable
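We will use the `OneHotEncoder` so that no category is given more importance than another: each country becomes its own 0/1 column instead of an integer code that would imply an ordering.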

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (Country) and pass the remaining columns through
# unchanged; the encoded columns come first in the result.
# sparse_threshold=0 forces a dense result. With the default threshold, a
# column with many categories could make fit_transform return a SciPy sparse
# matrix, which np.array() would wrap as a 0-d object array instead of a
# 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

## Encode dependent variable

In [19]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct labels in y, then replace each label with its integer
# code (classes are sorted, so 'No' -> 0 and 'Yes' -> 1).
le = LabelEncoder().fit(y)
y = le.transform(y)

## Split data into Train and Test data

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

## Feature scaling
- I used standardization, which rescales each variable to have a mean of 0 and a standard deviation of 1 (so most values end up roughly between -3 and 3), because it works with all data

- I did not apply feature scaling to the dummy variables because their values are already in a small range (0-1), and scaling them would make them hard to interpret.

In [27]:
from sklearn.preprocessing import StandardScaler

# Learn the mean and standard deviation of the numeric columns (index 3
# onward) from the training rows only; the first three columns are the
# one-hot dummies and are left untouched.
sc = StandardScaler().fit(X_train[:, 3:])

# Apply those training-set statistics to both splits, so the test rows are
# scaled with exactly the same parameters as the training rows.
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [28]:
# Show the training features: three one-hot country columns, then the
# standardized Age and Salary columns.
print(X_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [29]:
# Show the test features, scaled with the statistics learned from the
# training rows.
print(X_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
