# DATA PREPROCESSING

## Steps for Preprocessing:
1.Getting Dataset
2.Importing Libraries
3.Importing Datasets
4.Finding Missing values
5.Encoding Categorical values 
6.Splitting dataset into training and testing dataset 
7.Feature scaling 

In [1]:
#IMPORTING LIBRARIES

import numpy as np # numerical data
import pandas as pd #analyzing the data in dataframe and series

##  Step 3 : Importing dataset

In [2]:

data=pd.read_csv("data.csv")
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,74000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [3]:
#Head is used to view data as our convinent
data.head(2)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes


In [4]:
#shape is used for view the dataset count

data.shape

(10, 4)

In [5]:
#SEPARATE INDEPENDENT VALUES
x=data[['Country','Age','Salary']].values  
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 74000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [6]:
#ANOTHER METHOD TO SEPARATE INDEPENDENT VALUES
#data.iloc[:,:-1].values

In [7]:
#DEPENDENT VALUES
y=data[['Purchased']].values
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

###  Step 4 : Finding Missing values

In [8]:
#Find and count the missing values in our dataset
print(data.isnull()) #True means null present

   Country    Age  Salary  Purchased
0    False  False   False      False
1    False  False   False      False
2    False  False   False      False
3    False  False   False      False
4    False  False    True      False
5    False  False   False      False
6    False   True   False      False
7    False  False   False      False
8    False  False   False      False
9    False  False   False      False


In [9]:
print(data.isnull().sum())


Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [10]:
#drop missing value records- remaining rows are kept as its place
data.dropna(inplace=True)
data

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,74000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [11]:
#simple imputer-filled missing values in numerical
from sklearn.impute import SimpleImputer

In [12]:
data2=SimpleImputer(missing_values=np.nan,strategy='mean')

In [13]:
# Fit- we split our data into training and testing adjusts weights according to data values for better accuracy
data2=data2.fit(x[:,1:3])

In [14]:
#transform calculate the mean value
x[:,1:3]=data2.transform(x[:,1:3])
x

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63222.22222222222],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 74000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

###  Step 5:ENCODING CATEGORICAL VALUES Label encoder- each unique values it gives an index like 1,2,3

In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
label_encode_x=LabelEncoder()
label_encode_y=LabelEncoder()

In [17]:
x[:,0]=label_encode_x.fit_transform(x[:,0])
x

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63222.22222222222],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 74000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [18]:
y=label_encode_y.fit_transform(y)
y

  y = column_or_1d(y, warn=True)


array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

###  Step 6: SPLITTING DATASET INTO TRAINING AND TESTING

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=1)

In [21]:
print(x_train)


[[1 40.0 63222.22222222222]
 [0 44.0 72000.0]
 [2 38.0 61000.0]
 [2 27.0 48000.0]
 [0 48.0 74000.0]
 [1 50.0 83000.0]
 [0 35.0 58000.0]]


In [22]:
print(x_test)


[[1 30.0 54000.0]
 [0 37.0 67000.0]
 [2 38.77777777777778 52000.0]]


In [23]:
print(y_train)


[1 0 0 1 1 0 1]


In [24]:
print(y_test)

[0 1 0]


## Step 7 : FEATURE SCALING


In [25]:
from sklearn.preprocessing import StandardScaler

In [26]:
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.fit_transform(x_test)

In [27]:
print(x_train)


[[ 0.17149859 -0.03891021 -0.22152828]
 [-1.02899151  0.50583275  0.59517263]
 [ 1.37198868 -0.31128169 -0.428288  ]
 [ 1.37198868 -1.80932482 -1.63783238]
 [-1.02899151  1.0505757   0.78125639]
 [ 0.17149859  1.32294718  1.61863327]
 [-1.02899151 -0.71983891 -0.70741363]]


In [28]:
print(x_test)

[[ 0.         -1.38802721 -0.55138018]
 [-1.22474487  0.45941746  1.40351318]
 [ 1.22474487  0.92860975 -0.852133  ]]
