# <center><font color='Darkblue'>Data Preprocessing Implementation</font></center>

# <font color='red'>1. Importing the Libraries</font>

In [90]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# <font color='red'>2. Importing the Dataset</font>

In [92]:
# Toy customer records stored column-wise (10 entries per column).
# None marks a missing entry: one in "Age" (index 3) and one in "Salary" (index 6).
dataset = {
    "States": [
        "Texas", "Florida", "California", "Texas", "California",
        "Texas", "California", "Texas", "Florida", "Texas",
    ],
    "Age": [27, 30, 38, None, 35, 40, 48, 50, 37, 38],
    "Salary": [
        58000, 48000, 54000, 242422, 242213,
        220000, None, 234400, 200000, 340000,
    ],
    "Purchased": ["No", "Yes", "No", "No", "Yes", "Yes", "No", "Yes", "No", "Yes"],
}

In [94]:
# Build the working DataFrame; each dict key becomes a column.
data = pd.DataFrame(data=dataset)

In [96]:
# Column dtypes and non-null counts: a quick check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   States     10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [98]:
# Preview the first rows of the frame.
data.head()

Unnamed: 0,States,Age,Salary,Purchased
0,Texas,27.0,58000.0,No
1,Florida,30.0,48000.0,Yes
2,California,38.0,54000.0,No
3,Texas,,242422.0,No
4,California,35.0,242213.0,Yes


In [100]:
# Preview the last rows of the frame.
data.tail()

Unnamed: 0,States,Age,Salary,Purchased
5,Texas,40.0,220000.0,Yes
6,California,48.0,,No
7,Texas,50.0,234400.0,Yes
8,Florida,37.0,200000.0,No
9,Texas,38.0,340000.0,Yes


In [102]:
# Features: every column except the last. Target: the last column ("Purchased").
X = data.iloc[:, :-1].to_numpy()
Y = data.iloc[:, -1].to_numpy()

In [104]:
# Inspect the feature array (all columns except the last).
X

array([['Texas', 27.0, 58000.0],
       ['Florida', 30.0, 48000.0],
       ['California', 38.0, 54000.0],
       ['Texas', nan, 242422.0],
       ['California', 35.0, 242213.0],
       ['Texas', 40.0, 220000.0],
       ['California', 48.0, nan],
       ['Texas', 50.0, 234400.0],
       ['Florida', 37.0, 200000.0],
       ['Texas', 38.0, 340000.0]], dtype=object)

In [106]:
# Inspect the target array (the last column, "Purchased").
Y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# <font color='red'>3. Handling the Missing Values</font>

In [108]:
# Handling the missing values: fill NaNs in the numeric columns with column means.
from sklearn.impute import SimpleImputer

# Columns 1 and 2 of X hold Age and Salary; each NaN is replaced by the mean of
# the observed values in its own column.
# NOTE(review): the imputer is fit on all rows, before the train/test split
# further down, so the fill values also reflect rows that end up in the test set.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

In [110]:
# Inspect X after imputation: the NaN entries in columns 1-2 should now be filled.
X

array([['Texas', 27.0, 58000.0],
       ['Florida', 30.0, 48000.0],
       ['California', 38.0, 54000.0],
       ['Texas', 38.111111111111114, 242422.0],
       ['California', 35.0, 242213.0],
       ['Texas', 40.0, 220000.0],
       ['California', 48.0, 182115.0],
       ['Texas', 50.0, 234400.0],
       ['Florida', 37.0, 200000.0],
       ['Texas', 38.0, 340000.0]], dtype=object)

# <font color='red'>4. Encoding Categorical Data</font>

In [113]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 (States). remainder="passthrough" keeps the other
# columns unchanged and places them after the encoded columns.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
)
X = ct.fit_transform(X)

In [115]:
# Inspect X after one-hot encoding: the encoded state columns come first,
# followed by the passthrough columns.
X

array([[0.0, 0.0, 1.0, 27.0, 58000.0],
       [0.0, 1.0, 0.0, 30.0, 48000.0],
       [1.0, 0.0, 0.0, 38.0, 54000.0],
       [0.0, 0.0, 1.0, 38.111111111111114, 242422.0],
       [1.0, 0.0, 0.0, 35.0, 242213.0],
       [0.0, 0.0, 1.0, 40.0, 220000.0],
       [1.0, 0.0, 0.0, 48.0, 182115.0],
       [0.0, 0.0, 1.0, 50.0, 234400.0],
       [0.0, 1.0, 0.0, 37.0, 200000.0],
       [0.0, 0.0, 1.0, 38.0, 340000.0]], dtype=object)

In [117]:
# Avoiding the dummy variable trap: drop the first one-hot column (the
# "California" indicator), which is implied by the remaining state columns.
# Run this cell once only: it reassigns X, so every re-run strips another
# leading column.
X = X[:, 1:]

In [119]:
# Inspect X after dropping the first dummy column.
X


array([[0.0, 1.0, 27.0, 58000.0],
       [1.0, 0.0, 30.0, 48000.0],
       [0.0, 0.0, 38.0, 54000.0],
       [0.0, 1.0, 38.111111111111114, 242422.0],
       [0.0, 0.0, 35.0, 242213.0],
       [0.0, 1.0, 40.0, 220000.0],
       [0.0, 0.0, 48.0, 182115.0],
       [0.0, 1.0, 50.0, 234400.0],
       [1.0, 0.0, 37.0, 200000.0],
       [0.0, 1.0, 38.0, 340000.0]], dtype=object)

# <font color='red'>5. Splitting the Dataset into Train and Test Data</font>

In [121]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)


In [123]:
# Inspect the training features produced by the split.
x_train

array([[0.0, 0.0, 35.0, 242213.0],
       [0.0, 1.0, 38.0, 340000.0],
       [1.0, 0.0, 30.0, 48000.0],
       [0.0, 0.0, 48.0, 182115.0],
       [0.0, 1.0, 50.0, 234400.0],
       [0.0, 1.0, 38.111111111111114, 242422.0],
       [0.0, 1.0, 27.0, 58000.0],
       [0.0, 1.0, 40.0, 220000.0]], dtype=object)

# <font color='red'>6. Feature Scaling</font>

In [125]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training rows only,
# then apply that same transformation to both splits.
# NOTE(review): every column is standardized, including the two 0/1 dummy columns.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [127]:
# Inspect the standardized training features.
x_train

array([[-0.37796447, -1.29099445, -0.4393767 ,  0.50159976],
       [-0.37796447,  0.77459667, -0.03552407,  1.56055334],
       [ 2.64575131, -1.29099445, -1.1124644 , -1.60156888],
       [-0.37796447, -1.29099445,  1.31065133, -0.14921264],
       [-0.37796447,  0.77459667,  1.57988641,  0.41699133],
       [-0.37796447,  0.77459667, -0.02056657,  0.50386306],
       [-0.37796447,  0.77459667, -1.51631702, -1.49327702],
       [-0.37796447,  0.77459667,  0.23371101,  0.26105106]])

In [131]:
# Inspect the test features, scaled with the statistics fitted on x_train.
x_test

array([[-0.37796447, -1.29099445, -0.03552407, -1.53659377],
       [ 2.64575131, -1.29099445, -0.17014161,  0.04446734]])