In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing

In [2]:
raw_data={'year':['1986','2003','2007','2009','2010','2012','2016','2018'],
         'state':['born','Btech','Infosys','CAT','Symbi','Tally','Sapient','AI'],
         'income':[0,0,25,0,0,75,120,145],
         'age':[0,17,21,23,24,26,30,32]}

In [3]:
df=pd.DataFrame(raw_data,columns=['year','state','income','age'])

In [4]:
df

Unnamed: 0,year,state,income,age
0,1986,born,0,0
1,2003,Btech,0,17
2,2007,Infosys,25,21
3,2009,CAT,0,23
4,2010,Symbi,0,24
5,2012,Tally,75,26
6,2016,Sapient,120,30
7,2018,AI,145,32


In [5]:
#fit the label encoder
le=preprocessing.LabelEncoder()

In [6]:
#fit the label encoder to the df column
le.fit(df['state'])

LabelEncoder()

In [7]:
#view the labels
list(le.classes_)

['AI', 'Btech', 'CAT', 'Infosys', 'Sapient', 'Symbi', 'Tally', 'born']

In [8]:
#transform catgories into integers
df['state_label']=le.transform(df['state'])

In [9]:
df

Unnamed: 0,year,state,income,age,state_label
0,1986,born,0,0,7
1,2003,Btech,0,17,1
2,2007,Infosys,25,21,3
3,2009,CAT,0,23,2
4,2010,Symbi,0,24,5
5,2012,Tally,75,26,6
6,2016,Sapient,120,30,4
7,2018,AI,145,32,0


In [10]:
zip(df['state_label'],df['state'])

[(7, 'born'),
 (1, 'Btech'),
 (3, 'Infosys'),
 (2, 'CAT'),
 (5, 'Symbi'),
 (6, 'Tally'),
 (4, 'Sapient'),
 (0, 'AI')]

In [11]:
#transform integers into categories
le.inverse_transform(df['state_label'])

array(['born', 'Btech', 'Infosys', 'CAT', 'Symbi', 'Tally', 'Sapient', 'AI'], dtype=object)

### Delete observations with missing values

In [12]:
raw_data={'year':['1986','2003','2007','2009','2010','2012','2016','2018'],
         'state':['born','Btech','Infosys','CAT','Symbi','Tally','Sapient','AI'],
         'income':[0,0,25,np.nan,0,75,120,145],
         'age':[0,17,21,23,24,26,30,32]}

In [13]:
df=pd.DataFrame(raw_data,columns=['year','state','income','age'])

In [14]:
df

Unnamed: 0,year,state,income,age
0,1986,born,0.0,0
1,2003,Btech,0.0,17
2,2007,Infosys,25.0,21
3,2009,CAT,,23
4,2010,Symbi,0.0,24
5,2012,Tally,75.0,26
6,2016,Sapient,120.0,30
7,2018,AI,145.0,32


In [15]:
df.dropna()

Unnamed: 0,year,state,income,age
0,1986,born,0.0,0
1,2003,Btech,0.0,17
2,2007,Infosys,25.0,21
4,2010,Symbi,0.0,24
5,2012,Tally,75.0,26
6,2016,Sapient,120.0,30
7,2018,AI,145.0,32


### Detect Outliers

In [16]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [17]:
# Create simulated data
X, _ = make_blobs(n_samples = 10,
                  n_features = 2,
                  centers = 1,
                  random_state = 1)

# Replace the first observation's values with extreme values
X[0,0] = 10000
X[0,1] = 10000

EllipticEnvelope assumes the data is normally distributed and based on that assumption “draws” an ellipse around the data, classifying any observation inside the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1). A major limitation of this approach is the need to specify a contamination parameter which is the proportion of observations that are outliers, a value that we don’t know

In [18]:
# Create detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit detector
outlier_detector.fit(X)

# Predict outliers
outlier_detector.predict(X)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

### Discretize Features


In [19]:
from sklearn.preprocessing import Binarizer

In [20]:
df['bins']=np.digitize(df['age'], bins=[20,30,64])

In [21]:
df

Unnamed: 0,year,state,income,age,bins
0,1986,born,0.0,0,0
1,2003,Btech,0.0,17,0
2,2007,Infosys,25.0,21,1
3,2009,CAT,,23,1
4,2010,Symbi,0.0,24,1
5,2012,Tally,75.0,26,1
6,2016,Sapient,120.0,30,2
7,2018,AI,145.0,32,2


### Encoding Ordinal Categorical features

In [22]:
priority=pd.DataFrame({'state':['low','medium','high','low','medium','high']})

In [23]:
priority

Unnamed: 0,state
0,low
1,medium
2,high
3,low
4,medium
5,high


In [24]:
# create scale map
scale_mapper={'low':1,
             'medium':2,
             'high':3}

In [25]:
#map scale to features
priority['scale']=priority.state.replace(scale_mapper)

In [26]:
priority

Unnamed: 0,state,scale
0,low,1
1,medium,2
2,high,3
3,low,1
4,medium,2
5,high,3


### Outlier Treatment

Delete
Mark
Rescale

In [27]:
#Delete
new_df=df[df['income'] < 130]

In [28]:
#Mark
df['outlier']=np.where(df['income'] > 130,'outlier','normal')

In [29]:
df

Unnamed: 0,year,state,income,age,bins,outlier
0,1986,born,0.0,0,0,normal
1,2003,Btech,0.0,17,0,normal
2,2007,Infosys,25.0,21,1,normal
3,2009,CAT,,23,1,normal
4,2010,Symbi,0.0,24,1,normal
5,2012,Tally,75.0,26,1,normal
6,2016,Sapient,120.0,30,2,normal
7,2018,AI,145.0,32,2,outlier


In [30]:
#rescale
df['scaled_income']=[np.log(x)  for x in df['income']]

  


In [31]:
df

Unnamed: 0,year,state,income,age,bins,outlier,scaled_income
0,1986,born,0.0,0,0,normal,-inf
1,2003,Btech,0.0,17,0,normal,-inf
2,2007,Infosys,25.0,21,1,normal,3.218876
3,2009,CAT,,23,1,normal,
4,2010,Symbi,0.0,24,1,normal,-inf
5,2012,Tally,75.0,26,1,normal,4.317488
6,2016,Sapient,120.0,30,2,normal,4.787492
7,2018,AI,145.0,32,2,outlier,4.976734


### Impute missing values with mean

In [32]:
from sklearn.preprocessing import Imputer

In [33]:
# Create an imputer object that looks for 'Nan' values, then replaces them with the mean value of the feature by columns (axis=0)
mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)

# Train the imputor on the df dataset
mean_imputer = mean_imputer.fit(df['income'].values.reshape(-1,1))

In [34]:
# Apply the imputer to the df dataset
imputed_income = mean_imputer.transform(df['income'].values.reshape(-1,1))

In [35]:
df['income']=imputed_income

In [36]:
df

Unnamed: 0,year,state,income,age,bins,outlier,scaled_income
0,1986,born,0.0,0,0,normal,-inf
1,2003,Btech,0.0,17,0,normal,-inf
2,2007,Infosys,25.0,21,1,normal,3.218876
3,2009,CAT,52.142857,23,1,normal,
4,2010,Symbi,0.0,24,1,normal,-inf
5,2012,Tally,75.0,26,1,normal,4.317488
6,2016,Sapient,120.0,30,2,normal,4.787492
7,2018,AI,145.0,32,2,outlier,4.976734


### Impute missing class labels with max. class frequency label

In [37]:
X = np.array([[0, 2.10, 1.45], 
              [1, 1.18, 1.33], 
              [0, 1.22, 1.27],
              [0, -0.21, -1.19],
              [np.nan, 0.87, 1.31],
              [np.nan, -0.67, -0.22]])

In [42]:
#create imputer object
imputer=Imputer(strategy='most_frequent',axis=0)
imputer.fit_transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

### Normalization

In [43]:
from sklearn.preprocessing import Normalizer

In [52]:
#create normalizer object
normlaiser=Normalizer(norm='l1')

normlaiser.fit_transform(df['income'].values.reshape(-1,1))

array([[ 0.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 1.],
       [ 1.]])

### Impute classes using KNearest neighbours

In [53]:
from sklearn.neighbors import KNeighborsClassifier

In [54]:
X = np.array([[0, 2.10, 1.45], 
              [1, 1.18, 1.33], 
              [0, 1.22, 1.27],
              [1, -0.21, -1.19]])

In [57]:
#Create feature matrix with missing values in the categorical feature
X_with_nan = np.array([[np.nan, 0.87, 1.31], 
                       [np.nan, -0.67, -0.22]])

### One hot encoding with multiple features

In [58]:
from sklearn.preprocessing import MultiLabelBinarizer

In [59]:
# Create NumPy array
y = [('Texas', 'Florida'), 
    ('California', 'Alabama'), 
    ('Texas', 'Florida'), 
    ('Delware', 'Florida'), 
    ('Texas', 'Alabama')]

In [61]:
one_hot=MultiLabelBinarizer()
one_hot.fit_transform(y)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [63]:
one_hot.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'], dtype=object)

In [64]:
one_hot.inverse_transform(one_hot.fit_transform(y))

[('Florida', 'Texas'),
 ('Alabama', 'California'),
 ('Florida', 'Texas'),
 ('Delware', 'Florida'),
 ('Alabama', 'Texas')]

### Rescale

In [66]:
scaler=preprocessing.MinMaxScaler(feature_range=(0,1))
scaler.fit_transform(df['income'].values.reshape(-1,1))

array([[ 0.        ],
       [ 0.        ],
       [ 0.17241379],
       [ 0.35960591],
       [ 0.        ],
       [ 0.51724138],
       [ 0.82758621],
       [ 1.        ]])

### Standardize

In [67]:
stand=preprocessing.StandardScaler()
stand.fit_transform(df['income'].values.reshape(-1,1))

array([[ -9.80108611e-01],
       [ -9.80108611e-01],
       [ -5.10193523e-01],
       [  1.33557901e-16],
       [ -9.80108611e-01],
       [  4.29636651e-01],
       [  1.27548381e+00],
       [  1.74539890e+00]])