In [1]:
# List the working directory; the output below shows that Data.csv is present.
!ls

Data.csv  sample_data


**Step 1: Importing the libraries**

In [25]:
import pandas as pd
import numpy as np

**Step 2: Importing dataset**

In [26]:
# Load the raw dataset from the CSV file in the working directory.
df = pd.read_csv("Data.csv")

In [27]:
# Show the loaded frame as the cell output.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [28]:
# (rows, columns) of the loaded frame.
df.shape

(10, 4)

**Step 3: Handling the missing data**

In [29]:
# Count the missing values in each column.
df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [30]:
# There are two missing values: one in Age and one in Salary.
# There are several ways to treat missing data.
# One could impute them using the mean, median, or mode of the column.
# A potentially better approach is KNN imputation, as implemented by scikit-learn's KNNImputer.
# Here, I'm using median imputation.

In [31]:
# Replace each missing numeric value with the median of its column.
# numeric_only=True restricts the median to the numeric columns (Age, Salary).
# Without it, the string columns (Country, Purchased) make df.median() emit a
# FutureWarning on older pandas and raise a TypeError on pandas >= 2.0.
df = df.fillna(df.median(numeric_only=True))
df

  """Entry point for launching an IPython kernel.


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [32]:
# I'm aware of two types of encoding: label encoding and one-hot encoding.
# Since "Country" is a non-ordinal (nominal) categorical feature, one-hot encoding is the appropriate method.
# You can use scikit-learn's OneHotEncoder or pandas' get_dummies to achieve this.
# I will use get_dummies here.

**Step 5: Creating a dummy variable**

In [33]:
# Separate the target column from the feature columns.
y = df.loc[:, "Purchased"]
X = df.drop(columns="Purchased")
(X.shape, y.shape)

((10, 3), (10,))

In [34]:
# One-hot encode the categorical column(s) of X; as the output below shows,
# Country becomes one indicator column per country while Age and Salary are unchanged.
X = pd.get_dummies(X)
X

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,61000.0,0,1,0
5,35.0,58000.0,1,0,0
6,38.0,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [35]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
(X_train.shape, X_test.shape)

((8, 5), (2, 5))

**Step 7: Feature Scaling**

In [36]:
# Age and Salary are on very different scales (tens vs. tens of thousands),
# so standardize both columns to zero mean and unit variance.
from sklearn.preprocessing import StandardScaler

numeric_cols = ["Age", "Salary"]
scaler = StandardScaler()

# The frames returned by train_test_split are derived from X by row selection,
# and pandas can flag column assignment on them with SettingWithCopyWarning.
# Work on explicit copies so the assignments below target independent frames.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then reuse the same statistics for
# the test rows so nothing about the test set leaks into the scaling.
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [37]:
# Inspect the training features after scaling.
X_train

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
5,-0.732798,-0.583212,1,0,0
0,1.025917,1.049781,1,0,0
7,1.807568,1.866278,1,0,0
2,-1.709862,-1.049781,0,1,0
9,-0.341972,0.466569,1,0,0
4,0.244266,-0.233285,0,1,0
3,-0.14656,-0.233285,0,0,1
6,-0.14656,-1.283066,0,0,1


In [38]:
# Inspect the test features after scaling.
X_test

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
8,2.198394,2.332847,0,1,0
1,-2.2961,-1.749636,0,0,1


In [39]:
# Inspect the training labels.
y_train

5    Yes
0     No
7    Yes
2     No
9    Yes
4    Yes
3     No
6     No
Name: Purchased, dtype: object