In [16]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [18]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

**Step 2: Importing dataset**

The dataset is loaded from Google Drive.

In [19]:
# Mount Google Drive inside the Colab runtime so the CSV can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Location of the raw CSV on the mounted drive.
DATA_PATH = '/content/drive/MyDrive/Datasets/Data.csv'

# Load the raw data and display it.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Once the dataset is imported, we create a matrix of independent features (`x`) and a vector holding the dependent target (`y`), with one row per observation.

In [21]:
# Separate the independent features from the dependent target.
#   x: every column except the last one (Country, Age, Salary)
#   y: the last column (Purchased)
# The target is selected by position -1 instead of a hard-coded 3 so that it
# always stays in step with the `:-1` feature slice, even if columns are
# added to or removed from the dataset.

x = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

**Step 3: Handling the missing data**

First, we check whether the data contains any missing values using `df.isnull().sum()`, which counts the missing entries in each column.

In [22]:
# Count the missing entries in each column.
missing_per_column = df.isnull().sum()
missing_per_column

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [23]:
# The dataset is small, so missing entries are filled with the column mean
# instead of deleting the affected rows.
# sklearn's SimpleImputer is applied to the numerical columns only
# (columns 1 and 2 of x, i.e. Age and Salary).

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and write the filled values back into x in one step.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])

1. In the above, `SimpleImputer` is the class used to fill in missing values according to a chosen strategy; `imputer` is the object created from it.
2. The object is created with two parameters. The first, `missing_values`, is the placeholder that marks a missing value (here `np.nan`). The second, `strategy`, selects how the gap is filled; we use the mean of the column.
3. The object is then fitted on the numerical features (Age and Salary) and used to transform them, replacing each missing value with the column mean.

**Step 4: Creating a dummy variable**

There are two different methods of encoding a categorical variable.
1. Label Encoder and
2. One Hot Encoder

Dummy variables are the concept behind the One Hot Encoder. Before getting into dummy variables, let's apply the Label Encoder first and then look at the problem it introduces.

Label Encoder

In [24]:
# Replace the Country strings in column 0 of x with integer codes.
labelEncoder_x = LabelEncoder()
country_codes = labelEncoder_x.fit_transform(x[:, 0])
x[:, 0] = country_codes
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

The Label Encoder output above reveals a problem. Let's address it.

1. Label Encoder assigns an integer to each unique value of the categorical data in alphabetical order. Here, the categorical feature "Country" is labelled as France = 0, Germany = 1 and Spain = 2.
2. Because these labels are numbers, a model may interpret them as an ordering, such as France < Germany < Spain, even though no such relationship exists between the countries. This is a problem.
3. To avoid this, we use the OneHotEncoder class. It creates one additional feature for every unique value in the categorical feature, and each row has a 1 in the column for its own category and 0 in the others.

One Hot Encoder

In [25]:
# One-hot encode the categorical column (index 0) and pass the remaining
# columns through unchanged.

country_encoder = OneHotEncoder()
ct = ColumnTransformer(
    transformers=[('encoder', country_encoder, [0])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))
x

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

**Step 5: Encoding the categorical target**

The target variable is also categorical, with the values "Yes" and "No". Since it has only two values, we use label encoding, which replaces "Yes" with 1 and "No" with 0. One Hot Encoder would create two features instead, which is not suitable for a target variable.

Label Encoder

In [26]:
# Label-encode the dependent variable Purchased ("No" -> 0, "Yes" -> 1).
label_y = LabelEncoder()
label_y.fit(y)
y = label_y.transform(y)
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

**Step 6: Splitting the datasets into training sets and Test sets**

In [27]:
# Hold out 20% of the rows for testing; random_state is fixed to 0.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

**Step 7: Feature Scaling**

In [28]:
# Fit the scaler on the training rows only, then apply the same
# transformation to both the training and the test features.
# NOTE(review): every column is scaled, including the one-hot dummy
# columns -- confirm that this is intended.
sc_x = StandardScaler().fit(X_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)

In [29]:
# Display the scaled training features.
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [30]:
# Display the test features after applying the scaler fitted on the training set.
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])