In [None]:
# Dataset used in this notebook: Data.csv

**Step 1: Importing the libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


**Step 2: Importing the dataset**

In [2]:
# Read the raw dataset from the CSV file into a DataFrame and display it.
DATA_PATH = 'Data.csv'
data = pd.read_csv(DATA_PATH)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(10, 4)

In [6]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


**Step 3: Handling the missing data**

In [7]:
# Count the missing entries in every column.
data.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [18]:
# Impute the missing ages with the column median.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the `data['Age']` selection: that chained in-place
# form raises a FutureWarning on pandas 2.x and leaves `data` unchanged when
# Copy-on-Write is active.
a = data['Age'].median()
data['Age'] = data['Age'].fillna(a)

In [19]:
# Impute the missing salaries with the column mean.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the `data['Salary']` selection: that chained in-place
# form raises a FutureWarning on pandas 2.x and leaves `data` unchanged when
# Copy-on-Write is active.
b = data['Salary'].mean()
data['Salary'] = data['Salary'].fillna(b)

In [20]:
# Show the frame again to confirm the missing Age and Salary values were filled.
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


**Step 4: Encoding categorical data**

In [21]:
# Tabulate the number of distinct values found in each column.
data_u = (
    data.nunique()
    .rename_axis('Variable')
    .reset_index(name='DistinctCount')
)
data_u

Unnamed: 0,Variable,DistinctCount
0,Country,3
1,Age,9
2,Salary,10
3,Purchased,2


In [22]:
# Encode the target as integers: 1 for 'Yes', 0 for any other value.
# The dtype guard keeps the cell safe to re-run: without it, a second run
# compares the already-encoded 0/1 values with 'Yes' and resets every row to 0.
if not pd.api.types.is_numeric_dtype(data['Purchased']):
    data['Purchased'] = data['Purchased'].eq('Yes').mul(1)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


**Step 5: Creating dummy variables for Country**

In [26]:
# One-hot encode Country into one indicator column per distinct country.
# dtype=int pins the indicators to 0/1 integers; pandas 2.0+ otherwise
# returns booleans (True/False), which would not match the numeric
# feature columns they are later combined with.
data_d = pd.get_dummies(data['Country'], dtype=int)
data_d

Unnamed: 0,France,Germany,Spain
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [31]:
# Place the indicator columns to the right of the original columns,
# aligning rows on the index.
merge = pd.concat(
    objs=[data, data_d],
    axis=1,
)
merge

Unnamed: 0,Country,Age,Salary,Purchased,France,Germany,Spain
0,France,44.0,72000.0,0,1,0,0
1,Spain,27.0,48000.0,1,0,0,1
2,Germany,30.0,54000.0,0,0,1,0
3,Spain,38.0,61000.0,0,0,0,1
4,Germany,40.0,63777.777778,1,0,1,0
5,France,35.0,58000.0,1,1,0,0
6,Spain,38.0,52000.0,0,0,0,1
7,France,48.0,79000.0,1,1,0,0
8,Germany,50.0,83000.0,0,0,1,0
9,France,37.0,67000.0,1,1,0,0


In [32]:
# Country is now represented by its indicator columns, so drop the text column.
final = merge.drop(columns=['Country'])
final

Unnamed: 0,Age,Salary,Purchased,France,Germany,Spain
0,44.0,72000.0,0,1,0,0
1,27.0,48000.0,1,0,0,1
2,30.0,54000.0,0,0,1,0
3,38.0,61000.0,0,0,0,1
4,40.0,63777.777778,1,0,1,0
5,35.0,58000.0,1,1,0,0
6,38.0,52000.0,0,0,0,1
7,48.0,79000.0,1,1,0,0
8,50.0,83000.0,0,0,1,0
9,37.0,67000.0,1,1,0,0


**Step 6: Splitting the dataset into training and test sets**

In [33]:
# Feature matrix (model inputs) and target vector, both as NumPy arrays.
feature_columns = ['Age', 'Salary', 'France', 'Germany', 'Spain']
X = final[feature_columns].to_numpy()
y = final['Purchased'].to_numpy()

In [34]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**Step 7: Feature Scaling**

In [35]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the training rows only,
# then apply those same statistics to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [36]:
# Display the standardized training features.
X_train_scaled

array([[ 0.27978024,  0.12381479, -1.        ,  2.64575131, -0.77459667],
       [-0.23673712,  0.46175632,  1.        , -0.37796447, -0.77459667],
       [-1.95846165, -1.53093341, -1.        , -0.37796447,  1.29099445],
       [-0.06456467, -1.11141978, -1.        , -0.37796447,  1.29099445],
       [ 1.65715986,  1.7202972 ,  1.        , -0.37796447, -0.77459667],
       [-0.06456467, -0.16751412, -1.        , -0.37796447,  1.29099445],
       [ 0.96847005,  0.98614835,  1.        , -0.37796447, -0.77459667],
       [-0.58108203, -0.48214934,  1.        , -0.37796447, -0.77459667]])

In [37]:
# Display the test features after applying the scaler fitted on the training set.
X_test_scaled

array([[-1.44194429, -0.90166297, -1.        ,  2.64575131, -0.77459667],
       [ 2.00150476,  2.13981082, -1.        ,  2.64575131, -0.77459667]])