# Preprocesamiento y escalado

In [16]:
import pandas as pd
import numpy as np
 
# Sample records: gender, two exam percentages, age, height, salary and a
# yes/no disease flag. Because the rows mix text and numbers, NumPy stores
# every entry as a string, so the numeric columns are cast back below.
arr = np.array([
    ['M', 81.4, 82.2, 44, 6.1, 120000, 'no'],
    ['M', 75.2, 86.2, 40, 5.9, 80000, 'no'],
    ['F', 80.0, 83.2, 34, 5.4, 210000, 'yes'],
    ['F', 85.4, 72.2, 46, 5.6, 50000, 'yes'],
    ['M', 68.4, 87.2, 28, 5.11, 70000, 'no'],
])

# Build the labelled frame and restore the numeric dtypes in one pass:
# whole-number columns become int, measured columns become float.
df = pd.DataFrame(
    arr,
    columns=['gender', 'hsc_p', 'ssc_p', 'age', 'height', 'salary', 'suffer_from_disease'],
).astype({
    'age': int,
    'salary': int,
    'ssc_p': float,
    'hsc_p': float,
    'height': float,
})

df.head()


Unnamed: 0,gender,hsc_p,ssc_p,age,height,salary,suffer_from_disease
0,M,81.4,82.2,44,6.1,120000,no
1,M,75.2,86.2,40,5.9,80000,no
2,F,80.0,83.2,34,5.4,210000,yes
3,F,85.4,72.2,46,5.6,50000,yes
4,M,68.4,87.2,28,5.11,70000,no


## Normalización

In [8]:
def normalize(values):
    """Rescale ``values`` linearly onto the [0, 1] range (min-max scaling).

    Every element ``x`` becomes ``(x - min) / (max - min)``.

    Parameters
    ----------
    values : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric data exposing ``.min()`` and ``.max()``. A DataFrame is
        scaled column by column, because its ``min``/``max`` are per-column.

    Returns
    -------
    Same container type as ``values``
        The rescaled data. Constant input (``max == min``) yields zeros
        instead of the NaN that a 0/0 division would produce, which is the
        same convention scikit-learn's ``MinMaxScaler`` uses for constant
        features.
    """
    lowest = values.min()
    span = values.max() - lowest

    # Guard against a zero range: dividing by 1 leaves the (all-zero)
    # numerator untouched rather than turning it into NaN.
    if isinstance(span, pd.Series):
        # DataFrame input: one range per column, patch only the zero ones.
        span = span.where(span != 0, 1)
    elif not pd.isna(span) and span == 0:
        span = 1

    return (values - lowest) / span

# Numeric feature columns to rescale; `gender` and `suffer_from_disease`
# are not listed and therefore stay untouched.
cols = ['hsc_p', 'ssc_p', 'age', 'height', 'salary']

# Run `normalize` on each selected column and write the results back
# into `df`, replacing the original values.
df[cols] = df.loc[:, cols].apply(normalize, axis=0)

# Preview the first rows of the rescaled frame.
df.head()


Unnamed: 0,gender,hsc_p,ssc_p,age,height,salary,suffer_from_disease
0,M,0.764706,0.666667,0.888889,1.0,0.4375,no
1,M,0.4,0.933333,0.666667,0.79798,0.1875,no
2,F,0.682353,0.733333,0.333333,0.292929,1.0,yes
3,F,1.0,0.0,1.0,0.494949,0.0,yes
4,M,0.0,1.0,0.0,0.0,0.125,no


In [13]:
#Utilizando la clase MinMaxScaler 
from sklearn.preprocessing import MinMaxScaler
 
# Same min-max rescaling as above, delegated to scikit-learn's MinMaxScaler.
mmscaler = MinMaxScaler()
cols = ['hsc_p', 'ssc_p', 'age', 'height', 'salary']

# Learn each column's min/max, then overwrite the columns of `df` with
# their scaled values.
mmscaler.fit(df[cols])
df[cols] = mmscaler.transform(df[cols])

df.head()

Unnamed: 0,gender,hsc_p,ssc_p,age,height,salary,suffer_from_disease
0,M,0.764706,0.666667,0.888889,1.0,0.4375,no
1,M,0.4,0.933333,0.666667,0.79798,0.1875,no
2,F,0.682353,0.733333,0.333333,0.292929,1.0,yes
3,F,1.0,0.0,1.0,0.494949,0.0,yes
4,M,0.0,1.0,0.0,0.0,0.125,no


Ejemplo de normalización en datos de prueba y entrenamiento

In [15]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
 
# Load the iris dataset and separate the feature matrix from the labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing; `stratify=y` keeps the class
# proportions equal in both subsets and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Learn the per-feature min/max from the training rows only, then apply
# that same scaling to both subsets so the test data is never fitted on.
mmscaler = MinMaxScaler()
mmscaler.fit(X_train)
X_train_norm = mmscaler.transform(X_train)
X_test_norm = mmscaler.transform(X_test)

## Estandarización

In [11]:
def standardize(values, ddof=None):
    """Standardize ``values`` to zero mean and unit standard deviation (z-score).

    Every element ``x`` becomes ``(x - mean) / std``.

    Parameters
    ----------
    values : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric data exposing ``.mean()`` and ``.std()``. A DataFrame is
        standardized column by column.
    ddof : int, optional
        Delta degrees of freedom forwarded to ``values.std``. When omitted,
        the container's own default applies: pandas uses the sample standard
        deviation (``ddof=1``) while NumPy arrays use the population standard
        deviation (``ddof=0``). Pass ``ddof=0`` to reproduce the values
        produced by scikit-learn's ``StandardScaler``.

    Returns
    -------
    Same container type as ``values``
        The standardized data. Input with zero standard deviation yields
        zeros instead of the NaN/inf that dividing by zero would produce,
        which is the same convention ``StandardScaler`` uses for
        zero-variance features.
    """
    centre = values.mean()
    spread = values.std() if ddof is None else values.std(ddof=ddof)

    # Guard against a zero spread: dividing by 1 leaves the (all-zero)
    # centred values untouched. A NaN spread (e.g. a single-element Series
    # with ddof=1) is deliberately left alone.
    if isinstance(spread, pd.Series):
        # DataFrame input: one spread per column, patch only the zero ones.
        spread = spread.where(spread != 0, 1)
    elif not pd.isna(spread) and spread == 0:
        spread = 1

    return (values - centre) / spread

# Numeric feature columns to standardize; `gender` and
# `suffer_from_disease` are not listed and therefore stay untouched.
cols = ['hsc_p', 'ssc_p', 'age', 'height', 'salary']

# Run `standardize` on each selected column and write the results back
# into `df`. NOTE (from the original author): the DataFrame should be
# recreated before running this cell, since earlier cells overwrite
# these same columns.
df[cols] = df.loc[:, cols].apply(standardize, axis=0)

# Preview the first rows of the standardized frame.
df.head()

Unnamed: 0,gender,hsc_p,ssc_p,age,height,salary,suffer_from_disease
0,M,0.508834,2.2204460000000003e-17,0.756481,1.216399,0.220534,no
1,M,-0.441398,0.6713451,0.216137,0.707446,-0.409563,no
2,F,0.294265,0.1678363,-0.594378,-0.564939,1.638252,yes
3,F,1.121887,-1.678363,1.026652,-0.055985,-0.882136,yes
4,M,-1.483588,0.8391814,-1.404892,-1.302921,-0.567087,no


In [17]:
#Utilizando la clase StandardScaler
from sklearn.preprocessing import StandardScaler
 
# Same z-score standardization as above, delegated to scikit-learn's
# StandardScaler.
sc = StandardScaler()
cols = ['hsc_p', 'ssc_p', 'age', 'height', 'salary']

# Learn each column's mean and spread, then overwrite the columns of `df`
# with their standardized values.
sc.fit(df[cols])
df[cols] = sc.transform(df[cols])

df.head()

Unnamed: 0,gender,hsc_p,ssc_p,age,height,salary,suffer_from_disease
0,M,0.568894,0.0,0.845771,1.359976,0.246564,no
1,M,-0.493498,0.750587,0.241649,0.790948,-0.457905,no
2,F,0.328999,0.187647,-0.664534,-0.63162,1.831622,yes
3,F,1.254308,-1.876467,1.147832,-0.062593,-0.986258,yes
4,M,-1.658702,0.938233,-1.570717,-1.45671,-0.634023,no


Estandarizando los subconjuntos de entrenamiento y prueba

In [19]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
 
# Load the iris dataset and separate the feature matrix from the labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 30% of the rows for testing; `stratify=y` keeps the class
# proportions equal in both subsets and `random_state` fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

# Learn the per-feature statistics from the training rows only, then apply
# that same transform to both subsets. The `_norm` suffix is kept from the
# min-max example, but these arrays hold StandardScaler output.
sc = StandardScaler()
sc.fit(X_train)
X_train_norm = sc.transform(X_train)
X_test_norm = sc.transform(X_test)