# Day 1 - Data Preprocessing

1. Import libraries
2. Import the dataset
3. Handling missing data
4. Encoding Categorical Data
5. Splitting the dataset into training and test sets
6. Feature scaling

### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np

### 2. Import the dataset

In [2]:
# Load the raw CSV into a DataFrame, then show its first five rows as a quick sanity check.
df_data = pd.read_csv(filepath_or_buffer='datasets/Data.csv')
df_data.head(n=5)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


#### Check if there are any missing values

In [3]:
# True as soon as a single cell anywhere in the frame is missing (NaN/None).
df_data.isna().values.any()

True

#### Find the columns that have missing values

In [4]:
# Names of the columns that contain at least one missing value, in frame order.
[column for column in df_data.columns if df_data[column].isna().any()]

['Age', 'Salary']

#### Split the dataset into features and labels

In [5]:
# Features: every column except the last one (Country, Age, Salary).
X = df_data.iloc[:, :-1].values
# Labels: the last column (Purchased). Indexing with -1 instead of a hard-coded
# position keeps X and Y complementary if columns are ever added to the CSV.
Y = df_data.iloc[:, -1].values

### 3. Handling the missing values

In [6]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 of X are Age and Salary, the two columns reported as having
# missing values. Learn each column's mean so NaNs can be replaced with it.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
imp_mean.fit(X[:, 1:3])

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [7]:
# Overwrite the Age and Salary columns with their imputed versions
# (each NaN is replaced by the column mean learned by imp_mean).
X[:, [1, 2]] = imp_mean.transform(X[:, [1, 2]])

### 4. Encoding categorical variables

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Country (column 0 of X) is a nominal feature: its categories have no order.
# Integer codes from LabelEncoder would make a model treat them as ordered,
# so the column is one-hot encoded instead. The remaining columns (Age, Salary)
# are passed through unchanged and end up after the one-hot columns.
# sparse_threshold=0 forces a dense result.
# Note: this rebinds X, so re-run the earlier cells before re-running this one.
country_encoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Every column is numeric after encoding, so store X as a float array
# rather than an object array.
X = np.asarray(country_encoder.fit_transform(X), dtype=float)

# LabelEncoder is intended for the target: map the Purchased labels to integers.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

### 5. Split the dataset into training and test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=22,
)

### 6. Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and standard deviation from the training split only,
# and scale the training features with them.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
# Apply the statistics learned on the training split to the test split.
# Calling fit_transform here would re-fit the scaler on the test data, so the
# two splits would be scaled differently and test information would leak in.
X_test = sc_X.transform(X_test)