### this is a Demo file to learn how to import a dataset in  *CSV* format for ML ###

### importing libaries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

print("libraries imported")

libraries imported


### importing the dataset

In [2]:
dataset = pd.read_csv('Data.csv')
x = dataset.iloc[: , :-1].values  #indexes all the rows ':' & all columns except last':-1'
y = dataset.iloc[:,-1].values #indexes for all rows of last column

print(x)
print('y = ',y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
y =  ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


### removing empty cells

In [3]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# replacing the null values.
imputer.fit(x[:,1:3])
x[:,1:3] = imputer.transform(x[:,1:3])
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data
- the process of encoding categorical data(usually names) into numerical data for individual categories.
- numerical data is more suitable for Machine learning training models.
- It is done so that the machine can identify the possible re-occurring name(A-Z chars.) and can be classified into categories.

*One Hot Encoding*
- used for datasets containing **more than 2** unique values.
- a technique that we use to represent categorical variables as numerical values in a machine-learning model.
- the categorical data is divided into multiple columns of 1's and 0's where unique combinations in rows represent the unique category.
- Non encoded data can lead to increased dimensionality, as a separate column is created for each category in the variable.
- make the model more complex and slow to train.

### Encoding the Independent variable

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

#object                 |transformers=(operation, Methord of OP, columns)|     |remainder= columns not to be transformed|
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],   remainder='passthrough')

# fit_transform dosen't return an array but our ML model needs an array to train.
x = np.array(ct.fit_transform(x))

print(x)
# here the country names(in char) has been transformed to 3 column repesented by  combiantions of 0 & 1(float).

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding Dependent variables
*Label Encoding*
- Used for dataset containg **2 category** that can be represented as 0 or 1.
-  process of encoding categorical data(ususally names) into numerical data for individual category

In [5]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

# here (yes/no) from 'y' has been transformed to 0's & 1's.

[0 1 0 0 1 1 0 1 0 1]


### Splitting the dataset into Training set


In [6]:
from sklearn.model_selection import train_test_split

#   x_train -> Independent dataset from training set
#   x_test -> Indpendent dataset from test set

#   y_train -> Dependent dataset from training set
#   y_test -> Dependent dataset from test set
#    
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
#                               train_test_split(independent vector, dependent vector, scale , state)
# random_state selects random rows from the dataset
print("x_test =>\n",x_test)
print("y_test =>\n",y_test)

print("x_train =>\n",x_train)
print("y_train =>\n",y_train)

x_test =>
 [[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]
y_test =>
 [0 1]
x_train =>
 [[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]
y_train =>
 [0 1 0 0 1 1 0 1]


### Feature scaling
- Feature Scaling is a technique to standardize the independent features(variables) present in the data in a fixed range. 
- performed to handle highly varying magnitudes or values or units.

![alt text](./feature_scaling.jpg)

- standardisation works all the time, Normalisation works well with dataset that have normal distribution.

*matrix of feature*
- a term used in machine learning to describe the list of columns that contain independent variables to be processed, including all lines in the dataset. in his code, we have 2 matrix of feature, they are **x_train & x_test**
- Feature scaling is not applied to whole dataset, It is appllied to **training set & test set** seperatly.
- **The Scaler** is fitted only to **x_train**.
---
- we are **not allowed** to fit our Feature scaling on the **test set** with **traing set** because we would get **mean & standard deviation** in the features(variables values) and lead to leakege of data.

- ![training and test set](./train_test_set.png)

- we are **not allowed** to fit feature scaling on **dumy variables**(Categories that have been transformed using One Hot Encoding) as it wound disrupt the Encoding of data.
---




In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

st = StandardScaler()
nr = Normalizer()

# fit methord calculates, Transform methord replaces.

# applying Feature scaling to Independent variables only.

# Feature scaling for Training set.
# First 3 colls are Dummy variables(One Hot Encoded), excluded

# object for Normalization methord ---> nr
t_x_train = nr.fit_transform(x_train[:,3:])

for i in range(len(t_x_train)):
    print(x_train[i][3:] ,"  --->  ", t_x_train[i])

[38.77777777777778 52000.0]   --->   [7.45726288e-04 9.99999722e-01]
[40.0 63777.77777777778]   --->   [6.27177577e-04 9.99999803e-01]
[44.0 72000.0]   --->   [6.11110997e-04 9.99999813e-01]
[38.0 61000.0]   --->   [6.22950699e-04 9.99999806e-01]
[27.0 48000.0]   --->   [5.62499911e-04 9.99999842e-01]
[48.0 79000.0]   --->   [6.07594825e-04 9.99999815e-01]
[50.0 83000.0]   --->   [6.02409529e-04 9.99999819e-01]
[35.0 58000.0]   --->   [6.03448166e-04 9.99999818e-01]


In [8]:
# object for Standardization methord --> st
t_x_train = st.fit_transform(x_train[:,3:])

for i in range(len(t_x_train)):
    print(x_train[i][3:] ,"  --->  ", t_x_train[i])

[38.77777777777778 52000.0]   --->   [-0.19159184 -1.07812594]
[40.0 63777.77777777778]   --->   [-0.01411729 -0.07013168]
[44.0 72000.0]   --->   [0.56670851 0.63356243]
[38.0 61000.0]   --->   [-0.30453019 -0.30786617]
[27.0 48000.0]   --->   [-1.90180114 -1.42046362]
[48.0 79000.0]   --->   [1.14753431 1.23265336]
[50.0 83000.0]   --->   [1.43794721 1.57499104]
[35.0 58000.0]   --->   [-0.74014954 -0.56461943]


In [9]:
# feature scaling for test set.
x_test[:,3:] = st.transform(x_test[:, 3:])
print(x_test)

[[0.0 1.0 0.0 -1.4661817944830124 -0.9069571034860727]
 [1.0 0.0 0.0 -0.44973664397484414 0.2056403393225306]]
