<a href="https://colab.research.google.com/github/saisiva249/MLLearning/blob/main/data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## Importing the libraries

In [10]:
#for dealing with array we use numpy
import numpy as np
#to plot the graphs we use matplotlib
import matplotlib.pyplot as plt
# to import the dataset, create the matrix with feature and dependent variable vector we use pandas
import pandas as pd

## Importing the dataset

In [11]:
# features are the columns with which we are going to predict dependent variable
# iloc is nothing but locating using indexes
dataset = pd.read_csv("Data.csv")
#feature columns
x= dataset.iloc[:,:-1].values
#dependent variable vector
y= dataset.iloc[:,-1].values

In [12]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [13]:
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [14]:
#one of the preprocessor tool library we use is scikit-learn
# we can handle missing data using the average, median, and most frequent data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")
#this object of imputer, has a method fit, which has to be applied on the data,
#and the data to be processed has to be passed, which should has only numbers not string
#after fitting the data, our dataset should be transformed
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])

In [15]:
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

In [None]:
#categorical data is the column which has string values, which is country, as we cannot directly give
#number to each country, which will make the prediction more dependent on the value assigned to each
#country, so we use a concept of one-hot encoding, and in case if we have more categories in feature
#country, france will de done as [1,0,0] and spain as [0,1,0]
#germany as [0,0,1] means we are creating a 3 different columns for a country column
#if we have the labels in category then we can use labelencoder

### Encoding the Independent Variable

In [17]:
# do column transform with one hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough')
#as after transform we have to fit and transform, in columntransform we have a single method
# that do both fit and transform
x = np.array(ct.fit_transform(x))

In [18]:
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [19]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [20]:
print(y)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

In [21]:
# we have to apply the feature scaling only after the data is splitted
# as the feature scaling will take the mean of values, so if we apply feature scaling
# before the split it will take the test data also into consideration, which will effect
# out model training with training data, so to avoid the data leakage we do only after data is split

#skilern has train_test_split whcih divides the X_train, X_test, Y_train, Y_test
from sklearn.model_selection import train_test_split
#random state is used in development as if random state is not fixed it will randomly selects
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=1)

In [22]:
print(x_train)

[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [23]:
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [24]:
print(y_train)

[0 1 0 0 1 1 0 1]


In [25]:
print(y_test)

[0 1]


## Feature Scaling

In [26]:
# feature scaling allows us to put all our features on the same scale
# we should not apply feature scaling to our onehotencoded columns or dummy columns
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:,3:] = sc.fit_transform(x_train[:,3:])
x_test[:,3:] = sc.fit_transform(x_test[:,3:])


In [27]:
print(x_train)

[[0.0 0.0 1.0 -0.19159184384578545 -1.0781259408412425]
 [0.0 1.0 0.0 -0.014117293757057777 -0.07013167641635372]
 [1.0 0.0 0.0 0.566708506533324 0.633562432710455]
 [0.0 0.0 1.0 -0.30453019390224867 -0.30786617274297867]
 [0.0 0.0 1.0 -1.9018011447007988 -1.420463615551582]
 [1.0 0.0 0.0 1.1475343068237058 1.232653363453549]
 [0.0 1.0 0.0 1.4379472069688968 1.5749910381638885]
 [1.0 0.0 0.0 -0.7401495441200351 -0.5646194287757332]]


In [28]:
print(x_test)

[[0.0 1.0 0.0 -1.0 -1.0]
 [1.0 0.0 0.0 1.0 1.0]]
