# Data Preparation Methods

In [None]:
# Missing Values
# Importing the libraries
import numpy as np
import pandas as pd

In [None]:
# Load the horse colic CSV: the file has no header row, and '?' marks a missing entry
dataframe = pd.read_csv(
    '/content/horsecolic.csv',
    na_values=['?'],
    header=None,
)

# Display the first rows as a quick sanity check
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [None]:
# Convert the DataFrame into a plain NumPy array for the steps that follow
data = dataframe.to_numpy()
print(data)

[[2.00000e+00 1.00000e+00 5.30101e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [1.00000e+00 1.00000e+00 5.34817e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [2.00000e+00 1.00000e+00 5.30334e+05 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 ...
 [1.00000e+00 1.00000e+00 5.29386e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [1.00000e+00 1.00000e+00 5.30612e+05 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 [1.00000e+00 1.00000e+00 5.34618e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]]


In [None]:
# Split the array into input and output variables: column 23 is the output,
# every other column is an input feature
ix = [i for i in range(data.shape[1]) if i != 23]
X, y = data[:, ix], data[:, 23]
# Count the missing (NaN) entries in the inputs with a vectorized reduction
# instead of looping over every element with Python's built-in sum()
print(np.isnan(X).sum())

1605


## Simple Imputer

In [None]:
from sklearn.impute import SimpleImputer
# Impute missing entries using the 'mean' strategy
imputer = SimpleImputer(strategy='mean')
# Learn the fill values from X and apply them in a single step
Xtrans = imputer.fit_transform(X)
# Count the NaNs left after imputation with a vectorized reduction
# instead of looping over every element with Python's built-in sum()
print(np.isnan(Xtrans).sum())

0


In [None]:
# Show the imputed feature matrix (large arrays are abbreviated with "..." when printed)
print(Xtrans)

[[2.00000e+00 1.00000e+00 5.30101e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [1.00000e+00 1.00000e+00 5.34817e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [2.00000e+00 1.00000e+00 5.30334e+05 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 ...
 [1.00000e+00 1.00000e+00 5.29386e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]
 [1.00000e+00 1.00000e+00 5.30612e+05 ... 0.00000e+00 0.00000e+00
  1.00000e+00]
 [1.00000e+00 1.00000e+00 5.34618e+05 ... 0.00000e+00 0.00000e+00
  2.00000e+00]]


## Recursive Feature Elimination

In [None]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
# Synthetic dataset: 1000 samples, 10 features (5 informative, 5 redundant)
X, y = make_classification(n_samples=1000,n_features=10,n_informative=5,n_redundant=5,random_state=22)
# RFE keeps the 5 best features. The tree is seeded because an unseeded
# DecisionTreeClassifier permutes features randomly at each split, which makes
# the selected features and their ranks change from run to run.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=22),n_features_to_select=5)
# Fit the RFE
rfe.fit(X,y)
# Summarize all the features: whether each column was kept, and its rank
for i, (selected, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
  print(f"Column: {i}, Selected: {selected}, Rank: {rank}")

Column: 0, Selected: True, Rank: 1
Column: 1, Selected: True, Rank: 1
Column: 2, Selected: False, Rank: 4
Column: 3, Selected: False, Rank: 5
Column: 4, Selected: False, Rank: 6
Column: 5, Selected: False, Rank: 3
Column: 6, Selected: True, Rank: 1
Column: 7, Selected: False, Rank: 2
Column: 8, Selected: True, Rank: 1
Column: 9, Selected: True, Rank: 1


## Normalization


In [None]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
# Build a reproducible synthetic dataset (seeded with random_state)
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=5,
    n_redundant=5,
    random_state=22,
)
# Show the first three rows and the overall value range before scaling
print(X[:3])
print(f"Minimum Value: {X.min()}")
print(f"Maximum Value: {X.max()}")

[[-2.87309797  0.5276052  -3.52488704  3.49606085  0.08363016 -1.43624509
  -2.42444429  1.26384926 -0.21877207  0.74197021]
 [ 2.66585573  0.21306628  0.94458444  0.77189566 -1.00515254 -0.4778104
  -1.49743323  1.86377788 -0.88614443 -0.77788933]
 [-2.56661563 -0.46985872  2.03732313 -2.77017709  4.18632811  3.09108833
  -0.78878964 -0.73625653  0.37879393  1.94943241]]
Minimum Value: -9.486723344958765
Maximum Value: 11.046877437158985


In [None]:
# Min-max scale the features: learn the per-feature range first, then apply it
transformation = MinMaxScaler()
transformation.fit(X)
X_norm = transformation.transform(X)
# Show the first three rows and the overall value range after scaling
print(X_norm[:3])
print(f"Minimum Value: {X_norm.min()}")
print(f"Maximum Value: {X_norm.max()}")

[[0.32208795 0.63655227 0.11507324 0.85164406 0.49596434 0.33245595
  0.3661656  0.61768649 0.50640415 0.52536847]
 [0.59183867 0.60775031 0.50434422 0.6195653  0.39965515 0.45000538
  0.43680037 0.68023042 0.44555056 0.4326562 ]
 [0.33701384 0.54521565 0.59951688 0.31780685 0.85887191 0.88772115
  0.49079635 0.40917086 0.56089253 0.59902434]]
Minimum Value: 0.0
Maximum Value: 1.0


## Encoding the Categorical Features

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Load the breast-cancer dataset (the file has no header row) from its URL
dataset = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/breast-cancer.csv',header = None)
data = dataset.values
# Separate the dataset into independent variables (all but the last column)
# and the dependent variable (the last column), casting every value to a string
X = data[:,:-1].astype(str)
y = data[:,-1].astype(str)
# Summarize the raw data
print(X[:3,:])
# Encode data with One Hot Encoding, requesting a dense array.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
X_encod = encoder.fit_transform(X)
# Summarize the transformed data
print(X_encod[:3,:])

[["'40-49'" "'premeno'" "'15-19'" "'0-2'" "'yes'" "'3'" "'right'"
  "'left_up'" "'no'"]
 ["'50-59'" "'ge40'" "'15-19'" "'0-2'" "'no'" "'1'" "'right'" "'central'"
  "'no'"]
 ["'50-59'" "'ge40'" "'35-39'" "'0-2'" "'no'" "'2'" "'left'" "'left_low'"
  "'no'"]]
[[0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0. 0. 0. 1. 0.]]


## Encoding the Numerical Features (KBins)

In [None]:
# Transform numerical features into ordinal categories by binning them
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_informative=5,
    n_redundant=0,
    random_state=22,
)
# First three rows before binning
print(X[:3])
# Ten uniform bins per feature, each value replaced by its ordinal bin index
transform = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=10)
transform.fit(X)
X_discretize = transform.transform(X)
# First three rows after binning
print(X_discretize[:3])

[[-1.08110395 -2.01407779  0.61952687  1.82477824 -0.04349306]
 [ 3.64887281  3.50137103 -4.16840304  1.59734457  1.54611466]
 [-0.95698931 -3.49222807  1.35015752  1.9568312  -0.63787709]]
[[3. 3. 6. 6. 4.]
 [8. 8. 2. 6. 5.]
 [3. 2. 7. 6. 3.]]


## Dimensionality Reduction

In [None]:
# Import make_classification here as well, as the other cells do, so this cell
# does not depend on an earlier cell having been run first
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
# Define the dataset: 1000 samples, 10 features (7 informative, 3 redundant)
X, y = make_classification(n_samples=1000,n_features=10,n_informative=7,n_redundant=3,random_state=22)
# Summarize the data before the transform
print(X[:3,:])
# Define the transform: project the 10 features onto 4 principal components
transform = PCA(n_components=4)
# Fit the transform and apply it to the data
X_dim = transform.fit_transform(X)
# Summarize the data after the transform
print(X_dim[:3,:])

[[ 2.03713682  3.99256212 -2.65363111  2.61231706 -1.54012427  0.1737487
   0.03513394 -0.37521819  1.9277916   3.33807757]
 [-0.55424178 -2.66923187 -1.05914652 -3.66304042  5.66259175  0.71169513
  -1.32156386 -3.62398943 -2.00158266 -0.45255225]
 [ 0.84775859  2.36974653 -1.66575563  3.25876278 -2.66704     0.7307867
   2.26278966  2.13423294  0.97902913  0.95832185]]
[[ 3.02398757  3.91436325  4.25154504  0.13826606]
 [-6.71178478 -4.52254967  3.76427906 -2.08852648]
 [ 3.61223314  1.8709214   1.85206707 -0.1697265 ]]
