## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing dataset

In [2]:
df = pd.read_csv('datasets/ShopSellData.csv')

In [3]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [4]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Replacing NAN values

In [5]:
## Fill NAN values of salary column
df['Salary'].fillna(df['Salary'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].mean(), inplace=True)


In [6]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
## Filling NAN values for age column
df['Age'].fillna(df['Age'].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(),inplace=True)


In [8]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


## Encoding categorical variable

In [9]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
df['Purchased'] = label_encoder.fit_transform(df['Purchased'])

df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,0
1,Spain,27.0,48000.0,1
2,Germany,30.0,54000.0,0
3,Spain,38.0,61000.0,0
4,Germany,40.0,63777.777778,1
5,France,35.0,58000.0,1
6,Spain,38.0,52000.0,0
7,France,48.0,79000.0,1
8,Germany,50.0,83000.0,0
9,France,37.0,67000.0,1


## Encoding Country Column

In [10]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit_transform(df.Country.values.reshape(-1,1)).toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

## Label Encoding Country Column 

In [11]:
label_encoder_2 = LabelEncoder()
df.iloc[:,0] = label_encoder_2.fit_transform(df.iloc[:,0])

In [12]:
df

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.0,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


## Splitting the dataset in Features X and Label Y

In [13]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [14]:
X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.0,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


In [15]:
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int64

## Splitting train and test set 

In [16]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.2)

In [17]:
X_train,X_test,y_train,y_test

(  Country   Age        Salary
 4       1  40.0  63777.777778
 9       0  37.0  67000.000000
 1       2  27.0  48000.000000
 6       2  38.0  52000.000000
 7       0  48.0  79000.000000
 3       2  38.0  61000.000000
 0       0  44.0  72000.000000
 5       0  35.0  58000.000000,
   Country   Age   Salary
 2       1  30.0  54000.0
 8       1  50.0  83000.0,
 4    1
 9    1
 1    1
 6    0
 7    1
 3    0
 0    0
 5    1
 Name: Purchased, dtype: int64,
 2    0
 8    0
 Name: Purchased, dtype: int64)

## Standard Scalar 

In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [22]:
X_train

array([[ 0.13483997,  0.27978024,  0.12381479],
       [-0.94387981, -0.23673712,  0.46175632],
       [ 1.21355975, -1.95846165, -1.53093341],
       [ 1.21355975, -0.06456467, -1.11141978],
       [-0.94387981,  1.65715986,  1.7202972 ],
       [ 1.21355975, -0.06456467, -0.16751412],
       [-0.94387981,  0.96847005,  0.98614835],
       [-0.94387981, -0.58108203, -0.48214934]])

In [23]:
X_test

array([[ 0.13483997, -1.44194429, -0.90166297],
       [ 0.13483997,  2.00150476,  2.13981082]])