# Data Pre-processing

In [None]:
#Reading and modifying files 
import pandas as pd #import dataset and preprocessing 
import numpy as np #allows us to deal with arrays 
import matplotlib.pyplot as plt # allows us to plots data 


In [None]:
# Load the raw dataset into a DataFrame.
data = pd.read_csv("/content/Data.csv")
# Keep an independent copy for experiments. Plain assignment (data_try = data)
# would only create a second name for the same object, so any in-place change
# made through data_try would silently alter data as well.
data_try = data.copy()
data.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [None]:
# Drop every column that contains at least one missing value (axis=1 -> columns).
# The result is derived from `data` as a new frame rather than dropped in place,
# so the original `data`, which the following cells still read, keeps all of
# its columns and this cell can be re-run safely.
data_try = data.dropna(axis=1)
data_try.head(10)

Unnamed: 0,Country,Purchased
0,France,No
1,Spain,Yes
2,Germany,No
3,Spain,No
4,Germany,Yes
5,France,Yes
6,Spain,No
7,France,Yes
8,Germany,No
9,France,Yes


## Selecting columns in the dataset

In [None]:
# Independent variables (features): every row, every column except the last one.
# .values turns the selected columns into a NumPy array.
features_frame = data.iloc[:, :-1]
X = features_frame.values
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [None]:
# Dependent variable (target): the last column of every row, as a 1-D array.
target_column = data.iloc[:, -1]
Y = target_column.values
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of the missing data

In [None]:
# SimpleImputer fills missing entries with a per-column statistic, similar in
# spirit to pandas' fillna. Replacing them with the column mean is one common,
# simple strategy (not the only one).
from sklearn.impute import SimpleImputer

# Treat np.nan as the "missing" marker and replace it with the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# The slice 1:3 selects columns 1 and 2 (index 3 is excluded); column 0 is
# left out of the imputation.
# fit_transform learns the column means and fills the gaps in a single call,
# so a separate fit() beforehand is redundant.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])




In [None]:
# Show the feature matrix to check that the missing values have been filled in.
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data: independent variable



In [None]:
# ColumnTransformer applies a transformer to selected columns and lets us decide
# what happens to the remaining ones.
from sklearn.compose import ColumnTransformer

# OneHotEncoder turns a string/categorical column into 0/1 indicator columns
# (dummy-variable encoding is a closely related alternative).
from sklearn.preprocessing import OneHotEncoder

# Each transformer entry is a tuple: (name, transformer object, column indices).
# Only column 0 is encoded here.
country_encoder = ('encoder', OneHotEncoder(), [0])

# remainder='passthrough' keeps all other columns instead of dropping them.
ct = ColumnTransformer(
    transformers=[country_encoder],
    remainder='passthrough',
)

# Wrap the result in a NumPy array for the later steps.
X = np.array(ct.fit_transform(X))

In [None]:
# Show the feature matrix after the ColumnTransformer step.
print(X)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


## Encoding the dependent variable

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps each distinct label in Y to an integer code.
le = LabelEncoder()
le.fit(Y)            # learn the set of distinct labels
Y = le.transform(Y)  # replace every label by its integer code


In [None]:
# Show the integer-encoded target vector.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


# Label Encoder vs OneHotEncoder



1.   LabelEncoder assigns an integer value to each category.
2.   OneHotEncoder represents each category as a vector of 0s and 1s.



# Splitting data into training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible between runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=5)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Show the training features produced by the split.
print(X_train)

[[0.0 1.0 0.0 30.0 54000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 0.0 1.0 38.0 61000.0]]


In [None]:
# Show the test features produced by the split.
print(X_test)

[[1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [None]:
# Show the training labels produced by the split.
print(Y_train)

[0 1 1 1 0 0 0 0]


In [None]:
# Show the test labels produced by the split.
print(Y_test)

[1 1]


# Feature scaling: apply feature scaling only after splitting the data

# **Standardization**

In [None]:
# Feature scaling matters most for models that are sensitive to feature
# magnitude, e.g. gradient-descent-based ones such as logistic regression and
# neural networks.
from sklearn.preprocessing import StandardScaler
std = StandardScaler()

# Scale only after splitting: the scaler learns its parameters from the
# training set alone and the same fitted scaler is then reused on the test set,
# so no information from the test set leaks into preprocessing.
std.fit(X_train)
X_train = std.transform(X_train)
X_test = std.transform(X_test)


In [None]:
# Show the standardized training features.
print(X_train)

[[-0.57735027  1.29099445 -0.77459667 -1.25979585 -0.83889983]
 [-0.57735027  1.29099445 -0.77459667  0.0701939  -0.02654016]
 [ 1.73205081 -0.77459667 -0.77459667  1.13418571  1.23815614]
 [-0.57735027 -0.77459667  1.29099445 -1.65879278 -1.33739326]
 [ 1.73205081 -0.77459667 -0.77459667  0.60218981  0.65658047]
 [-0.57735027  1.29099445 -0.77459667  1.40018366  1.5704851 ]
 [-0.57735027 -0.77459667  1.29099445 -0.0923604  -1.00506431]
 [-0.57735027 -0.77459667  1.29099445 -0.19580405 -0.25732416]]


In [None]:
# Show the test features after the scaling cell.
# NOTE(review): the saved output below still shows unscaled values (e.g. 37.0,
# 67000.0), which suggests this cell was last run before X_test was transformed.
# Re-run the notebook top to bottom to refresh it.
print(X_test)

[[1.0 0.0 0.0 37.0 67000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


# **Normalization**

The data needs to be either standardized or normalized before further processing.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales each feature column to the [0, 1] range.
scale = MinMaxScaler()

# Store the result under its own name. Assigning it to Y_train would overwrite
# the training labels with a matrix of scaled features.
X_train_normalized = scale.fit_transform(X_train)
# Reuse the scaler fitted on the training set for the test set.
X_test_normalized = scale.transform(X_test)
print(X_train_normalized)

[[0.         1.         0.         0.13043478 0.17142857]
 [0.         1.         0.         0.56521739 0.45079365]
 [1.         0.         0.         0.91304348 0.88571429]
 [0.         0.         1.         0.         0.        ]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         1.         0.         1.         1.        ]
 [0.         0.         1.         0.51207729 0.11428571]
 [0.         0.         1.         0.47826087 0.37142857]]
