In [244]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

In [245]:
# Load the raw dataset from the CSV file and preview its first rows.
data = pd.read_csv("Data.csv")
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [246]:
# Features are every column except the last one; the target is the last column.
# X is copied explicitly because a later cell writes into it in place
# (the imputation step): a bare iloc slice may share memory with `data`,
# which makes that write ambiguous and can trigger SettingWithCopyWarning.
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1]

In [247]:
# Display the full feature frame (not just head()) so the NaN entries further
# down are visible; a bare expression gives the rich notebook rendering.
X

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [248]:
# Display the target column; as the last expression of the cell it is
# rendered by the notebook without an explicit print().
y

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


<h4>What is Imputer?</h4>
If your dataset has missing values, you can drop the rows (or even the columns) that contain them.
However, this approach is strongly discouraged: it shrinks the dataset and can skew the analysis away from the ground truth.
Instead, we should either use machine learning algorithms that are not affected by missing values or use imputers to fill in the missing information.

In [249]:
from sklearn.impute import SimpleImputer

# Fill every NaN in the numeric columns (positions 1 and 2) with the mean of
# its column. Note: the imputer is fitted on the full dataset here, i.e.
# before the train/test split that happens later in the notebook.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numeric_part = X.iloc[:, 1:3]
X.iloc[:, 1:3] = imputer.fit_transform(numeric_part)

In [250]:
# Preview X after imputation: the Salary that was NaN in row 4 is now filled
# with the column mean.
X.head()

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63777.777778


<h3>Encoding categorical data</h3>
Encoding is the process of converting categorical or textual data into a numerical format so that it can be used as input for algorithms.
To better understand categorical data and encoding, read this <a href="https://medium.com/aiskunks/categorical-data-encoding-techniques-d6296697a40f">article</a>.

<h4>Encoding Independent variables</h4>
If K is the number of categories in an independent variable, then one-hot encoding transforms each category into a dummy variable for which 0 indicates absence and 1 indicates presence of one of the categories. Thus, one-hot encoding produces a collection of K binary independent variables.


In [251]:
# The first column ('Country') is categorical, so one-hot encode it with pandas.
from sklearn.preprocessing import LabelEncoder  # used by the following cell

one_hot = pd.get_dummies(X['Country'])
# Replace the text column with its indicator columns; X itself is left untouched.
df = pd.concat([X.drop(columns=["Country"]), one_hot], axis=1)
df.head()

Unnamed: 0,Age,Salary,France,Germany,Spain
0,44.0,72000.0,True,False,False
1,27.0,48000.0,False,False,True
2,30.0,54000.0,False,True,False
3,38.0,61000.0,False,False,True
4,40.0,63777.777778,False,True,False


In [252]:
# Turn the boolean indicator columns into 0/1 integers with LabelEncoder.
columns = ["France", "Germany", "Spain"]
encoder = LabelEncoder()
for column in columns:
    # Assign the encoded array directly. Wrapping it in a new DataFrame would
    # give it a fresh 0..n-1 index, and pandas would then align on index labels
    # and write NaN wherever df's index does not match.
    df[column] = encoder.fit_transform(df[column])
df.head()

Unnamed: 0,Age,Salary,France,Germany,Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,63777.777778,0,1,0


In [253]:
# Or do that 
columns = ["Country"]
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
tmp = pd.DataFrame(OH_encoder.fit_transform(X[columns]))
tmp.index = X.index
df = X.drop("Country", axis=1)
df = pd.concat([df, tmp], axis=1)

<h4>Why are the column names not the same in the two DataFrames?</h4>
Think about it.

In [254]:
# Preview the frame built with OneHotEncoder in the previous cell.
df.head()

Unnamed: 0,Age,Salary,0,1,2
0,44.0,72000.0,1.0,0.0,0.0
1,27.0,48000.0,0.0,0.0,1.0
2,30.0,54000.0,0.0,1.0,0.0
3,38.0,61000.0,0.0,0.0,1.0
4,40.0,63777.777778,0.0,1.0,0.0


<h4>Encoding dependent variable</h4>
You saw earlier how <code>LabelEncoder</code> works. Encoding the dependent variable uses the same encoder, so it is just as easy.

In [255]:
# Encode the target labels as integers with the LabelEncoder created earlier.
# Keep y's original index on the new frame so it stays row-aligned with the
# feature frame even if the data does not use a default 0..n-1 index.
y = pd.DataFrame(encoder.fit_transform(y), index=y.index)
# The encoded values land in a column labelled 0; expose them as "Purchased".
y["Purchased"] = y[0]

In [256]:
# Remove the helper column labelled 0, leaving only "Purchased".
y = y.drop(columns=[0])
y.head()

Unnamed: 0,Purchased
0,0
1,1
2,0
3,0
4,1


<h3>Splitting the data into train, test, and cross-validation sets</h3>

In [257]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=1,
)

In [258]:
# Peek at the training features produced by the split.
X_train.head()

Unnamed: 0,Age,Salary,0,1,2
6,38.777778,52000.0,0.0,0.0,1.0
4,40.0,63777.777778,0.0,1.0,0.0
0,44.0,72000.0,1.0,0.0,0.0
3,38.0,61000.0,0.0,0.0,1.0
1,27.0,48000.0,0.0,0.0,1.0


In [259]:
# Display the held-out test features in full.
X_test

Unnamed: 0,Age,Salary,0,1,2
2,30.0,54000.0,0.0,1.0,0.0
9,37.0,67000.0,1.0,0.0,0.0


<h3>Feature Scaling</h3>
Feature scaling is a vital preprocessing step in machine learning that involves transforming numerical features to a common scale. See this <a href="https://medium.com/@shivanipickl/what-is-feature-scaling-and-why-does-machine-learning-need-it-104eedebb1c9#:~:text=Feature%20scaling%20is%20a%20vital,efficient%20model%20training%20and%20performance.">article</a> for more detail.

In [260]:
from sklearn.preprocessing import StandardScaler

# Standardise only the two leading columns (Age, Salary); the one-hot columns
# are left as they are. The scaler is fitted on the training rows only, and
# the test rows are transformed with those same fitted statistics.
ss = StandardScaler()
train_numeric = X_train.iloc[:, :2]
X_train.iloc[:, :2] = ss.fit_transform(train_numeric)
test_numeric = X_test.iloc[:, :2]
X_test.iloc[:, :2] = ss.transform(test_numeric)

In [261]:
# Training features after scaling: Age and Salary are standardised, the
# one-hot columns are unchanged.
X_train.head()

Unnamed: 0,Age,Salary,0,1,2
6,-0.191592,-1.078126,0.0,0.0,1.0
4,-0.014117,-0.070132,0.0,1.0,0.0
0,0.566709,0.633562,1.0,0.0,0.0
3,-0.30453,-0.307866,0.0,0.0,1.0
1,-1.901801,-1.420464,0.0,0.0,1.0


In [262]:
# Test features after applying the scaler that was fitted on the training set.
X_test

Unnamed: 0,Age,Salary,0,1,2
2,-1.466182,-0.906957,0.0,1.0,0.0
9,-0.449737,0.20564,1.0,0.0,0.0
