In [None]:
 from google.colab import drive
# Mount Google Drive at /content/drive/ so the CSV read later from
# '/content/drive/My Drive/...' is reachable from this runtime.
# NOTE(review): the import line above starts with a stray space; it may be
# an export artifact, but a plain-Python export would reject it -- confirm
# and remove.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the raw dataset from the mounted Drive folder; the bare `df` on the
# last line renders the frame as the cell output.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/My Drive/Dataset/Data.csv'
)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# DATA PREPROCESSING


## Data Cleaning

Data cleaning is the process of preparing data for analysis by removing or modifying data that is incorrect, incomplete, irrelevant, duplicated, or improperly formatted. Such data is usually not helpful for analysis because it may hinder the process or produce inaccurate results. There are several methods for cleaning data, depending on how the data is stored and on the questions being asked. Data cleaning is not simply about erasing information to make space for new data, but rather about finding a way to maximize a data set’s accuracy without necessarily deleting information. It involves more than removing data: it also includes fixing spelling and syntax errors, standardizing data sets, correcting mistakes such as empty fields and missing codes, and identifying duplicate data points. Data cleaning is considered a foundational element of data science, as it plays an important role in the analytical process and in uncovering reliable answers.

### Handling Missing Data

Missing data is a deceptively tricky issue in machine learning. We cannot simply ignore or remove observations with missing values; they must be handled carefully, because the missingness itself can indicate something important. The two most common ways to deal with missing data are:


1. Dropping observations with missing values.\
Dropping missing values is sub-optimal because when you drop observations, you drop information.

*   The fact that the value was missing may be informative in itself.
*   Plus, in the real world, you often need to make predictions on new data even if some of the features are missing!



2.  Imputing the missing values from past observations.\
Imputing missing values is sub-optimal because the value was originally missing but you filled it in, which always leads to a loss of information, no matter how sophisticated your imputation method is.

### A) Removal of Missing data from the dataset 

In [None]:
# Option A: keep only the rows in which no column is missing.
# The result is bound to a new name (`df1`); `df` is not reassigned here.
df1 = df.dropna(axis='index', how='any')
df1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### B) Replacing the Missing Data (NaN) with Values

In [None]:
from sklearn.impute import SimpleImputer

# NOTE: `df2` is a second name for the same object as `df`, not a copy, so
# the imputed values written below are visible through `df` as well.
df2 = df

# Replace every NaN in the columns at positions 1 and 2 (Age and Salary in
# the frame displayed above) with the mean of that column.
# The former `verbose = 0` argument is intentionally omitted: 0 was its
# default value, and recent scikit-learn releases no longer accept the
# parameter at all (passing it raises TypeError).
missingvalues = SimpleImputer(missing_values=np.nan, strategy='mean')
missingvalues = missingvalues.fit(df2.iloc[:, 1:3])
df2.iloc[:, 1:3] = missingvalues.transform(df2.iloc[:, 1:3])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Categorical Data

Categorical data is a collection of information that is divided into groups. For example, if an organisation or agency collects the biodata of its employees, the resulting data is categorical. It is called categorical because it can be grouped according to the variables present in the biodata, such as sex, state of residence, etc.

Categorical data can take on numerical values (such as “1” indicating Yes and “2” indicating No), but those numbers don’t have mathematical meaning. One can neither add them together nor subtract them from each other. 

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each country name to an integer code; the codes overwrite the
# 'Country' column of `df2` in place.
labelencoder_X = LabelEncoder()
labelencoder_X.fit(df2['Country'])
df2['Country'] = labelencoder_X.transform(df2['Country'])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,No
1,2,27.0,48000.0,Yes
2,1,30.0,54000.0,No
3,2,38.0,61000.0,No
4,1,40.0,63777.777778,Yes
5,0,35.0,58000.0,Yes
6,2,38.777778,52000.0,No
7,0,48.0,79000.0,Yes
8,1,50.0,83000.0,No
9,0,37.0,67000.0,Yes


In [None]:
# Labels learned by the Country encoder; each label's position in this
# array is the integer code it was given in the 'Country' column.
labelencoder_X.classes_

array(['France', 'Germany', 'Spain'], dtype=object)

In [None]:
# Encode the target column as integers, overwriting 'Purchased' in `df2`.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(df2['Purchased'])
df2['Purchased'] = labelencoder_y.transform(df2['Purchased'])
df2

Unnamed: 0,Country,Age,Salary,Purchased
0,0,44.0,72000.0,0
1,2,27.0,48000.0,1
2,1,30.0,54000.0,0
3,2,38.0,61000.0,0
4,1,40.0,63777.777778,1
5,0,35.0,58000.0,1
6,2,38.777778,52000.0,0
7,0,48.0,79000.0,1
8,1,50.0,83000.0,0
9,0,37.0,67000.0,1


In [None]:
# Labels learned by the target encoder; each label's position in this
# array is the integer code it was given in the 'Purchased' column.
labelencoder_y.classes_

array(['No', 'Yes'], dtype=object)

In [None]:
# One-hot encode the (already integer-coded) Country column: one indicator
# column per code, named Country_<code>.
dff = pd.get_dummies(data=df2['Country'], prefix='Country')
dff

Unnamed: 0,Country_0,Country_1,Country_2
0,1,0,0
1,0,0,1
2,0,1,0
3,0,0,1
4,0,1,0
5,1,0,0
6,0,0,1
7,1,0,0
8,0,1,0
9,1,0,0


In [None]:
# Place Age, Salary and Purchased next to the indicator columns.
# `dff` is rebound to the widened frame.
dff = pd.concat(
    [dff, df2['Age'], df2['Salary'], df2['Purchased']],
    axis='columns',
)
dff

Unnamed: 0,Country_0,Country_1,Country_2,Age,Salary,Purchased
0,1,0,0,44.0,72000.0,0
1,0,0,1,27.0,48000.0,1
2,0,1,0,30.0,54000.0,0
3,0,0,1,38.0,61000.0,0
4,0,1,0,40.0,63777.777778,1
5,1,0,0,35.0,58000.0,1
6,0,0,1,38.777778,52000.0,0
7,1,0,0,48.0,79000.0,1
8,0,1,0,50.0,83000.0,0
9,1,0,0,37.0,67000.0,1


In [None]:
# Separate the target column (y) from the feature columns (X).
# `df` is the same object as `df2`, so at this point it already holds the
# imputed and label-encoded values produced by the cells above.
# NOTE(review): the one-hot frame `dff` built earlier is not used here --
# confirm that the label-encoded 'Country' column is the intended feature.
X = df.drop(columns=['Purchased'])
y = df['Purchased']

In [None]:
# Display the target vector (the integer-encoded 'Purchased' column).
y

0    0
1    1
2    0
3    0
4    1
5    1
6    0
7    1
8    0
9    1
Name: Purchased, dtype: int64

In [None]:
# Display the feature frame (every column of `df` except 'Purchased').
X

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
4,1,40.0,63777.777778
5,0,35.0,58000.0
6,2,38.777778,52000.0
7,0,48.0,79000.0
8,1,50.0,83000.0
9,0,37.0,67000.0


## Feature Scaling

Feature scaling is a technique for standardizing the independent features in the data to a fixed range. It is performed during data pre-processing to handle features whose magnitudes, values, or units vary widely. If feature scaling is not done, a machine learning algorithm tends to give more weight to features with larger values and less weight to features with smaller values, regardless of the units of those values.



* Standardization: A very effective technique that re-scales a feature so that its distribution has a mean of 0 and a variance of 1.

* MaxAbsScaler: Scale each feature by its maximum absolute value. This estimator scales each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.

* Min-Max Normalization: This technique re-scales each feature so that its values lie between 0 and 1.



#### 1) StandardScaler

In [None]:

from sklearn.preprocessing import StandardScaler

# Standardise every feature column. `X` is rebound from the feature frame
# to the scaled array that the transformer returns.
sc_X = StandardScaler()
sc_X.fit(X)
X = sc_X.transform(X)
X

array([[-1.08347268e+00,  7.58874362e-01,  7.49473254e-01],
       [ 1.32424438e+00, -1.71150388e+00, -1.43817841e+00],
       [ 1.20385853e-01, -1.27555478e+00, -8.91265492e-01],
       [ 1.32424438e+00, -1.13023841e-01, -2.53200424e-01],
       [ 1.20385853e-01,  1.77608893e-01,  6.63219199e-16],
       [-1.08347268e+00, -5.48972942e-01, -5.26656882e-01],
       [ 1.32424438e+00,  0.00000000e+00, -1.07356980e+00],
       [-1.08347268e+00,  1.34013983e+00,  1.38753832e+00],
       [ 1.20385853e-01,  1.63077256e+00,  1.75214693e+00],
       [-1.08347268e+00, -2.58340208e-01,  2.93712492e-01]])

#### 2) MaxAbsScaler

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# `X` already holds the standardised array from the StandardScaler cell, so
# the max-abs scaling shown here is applied on top of that result.
ma_X = MaxAbsScaler()
ma_X.fit(X)
print(ma_X.transform(X))

[[-8.18181818e-01  4.43396226e-01  4.27745665e-01]
 [ 1.00000000e+00 -1.00000000e+00 -8.20809249e-01]
 [ 9.09090909e-02 -7.45283019e-01 -5.08670520e-01]
 [ 1.00000000e+00 -6.60377358e-02 -1.44508671e-01]
 [ 9.09090909e-02  1.03773585e-01  3.78518026e-16]
 [-8.18181818e-01 -3.20754717e-01 -3.00578035e-01]
 [ 1.00000000e+00  0.00000000e+00 -6.12716763e-01]
 [-8.18181818e-01  7.83018868e-01  7.91907514e-01]
 [ 9.09090909e-02  9.52830189e-01  1.00000000e+00]
 [-8.18181818e-01 -1.50943396e-01  1.67630058e-01]]


#### 3) MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale each column of `X` (the standardised array) and print the result;
# `X` itself is not reassigned.
mm_X = MinMaxScaler()
mm_X.fit(X)
print(mm_X.transform(X))

[[0.         0.73913043 0.68571429]
 [1.         0.         0.        ]
 [0.5        0.13043478 0.17142857]
 [1.         0.47826087 0.37142857]
 [0.5        0.56521739 0.45079365]
 [0.         0.34782609 0.28571429]
 [1.         0.51207729 0.11428571]
 [0.         0.91304348 0.88571429]
 [0.5        1.         1.        ]
 [0.         0.43478261 0.54285714]]


## Splitting the dataset



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Display the training portion of the scaled feature array.
X_train

array([[ 1.20385853e-01,  1.77608893e-01,  6.63219199e-16],
       [-1.08347268e+00, -2.58340208e-01,  2.93712492e-01],
       [ 1.32424438e+00, -1.71150388e+00, -1.43817841e+00],
       [ 1.32424438e+00,  0.00000000e+00, -1.07356980e+00],
       [-1.08347268e+00,  1.34013983e+00,  1.38753832e+00],
       [ 1.32424438e+00, -1.13023841e-01, -2.53200424e-01],
       [-1.08347268e+00,  7.58874362e-01,  7.49473254e-01],
       [-1.08347268e+00, -5.48972942e-01, -5.26656882e-01]])

In [None]:
# Display the held-out test portion of the scaled feature array.
X_test

array([[ 0.12038585, -1.27555478, -0.89126549],
       [ 0.12038585,  1.63077256,  1.75214693]])