# Data Preprocessing

## Dealing with missing data: dropna

In [213]:
import pandas as pd
from io import StringIO
csv_data = '''A, B, C, D
1.0, 2.0, 3.0, 4.0
5.0, 6.0,, 8.0
0.0, 11.0, 12.0,'''
csv_data = unicode(csv_data)
df = pd.read_csv(StringIO(csv_data))
#df.dropna(axis=0)
#df.dropna(axis=1)

## Dealing with missing data: Imputer

In [214]:
from sklearn.preprocessing import Imputer
imr = Imputer(missing_values='NaN', strategy='mean', axis=0)
imr.fit(df)
print df
print imr.transform(df.values)

   A   B   C   D
0  1   2   3   4
1  5   6 NaN   8
2  0  11  12 NaN
[[  1.    2.    3.    4. ]
 [  5.    6.    7.5   8. ]
 [  0.   11.   12.    6. ]]


## Dealing with catagorical data: Creating an example data

In [215]:
import pandas as pd
df = pd.DataFrame([
        ['green', 'M', 10.1, 'class1'],
        ['red', 'L', 13.5, 'class2'],
        ['blue', 'XL', 15.3, 'class1']])
df.columns=['color', 'size', 'price', 'classLabel']

## Dealing with catagorical data: Mapping ordinal features

In [216]:
size_mapping = {'XL': 3,
                'L' : 2,
                'M' : 1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classLabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


## Dealing with catagorical data: Encoding class labels using map

In [217]:
#import numpy as np
#class_mapping = {v: k for k, v in enumerate(np.unique(df['classLabel']))} 
#df['classLabel'] = df['classLabel'].map(class_mapping)
#df

## Dealing with catagorical data: Encoding class labels using LebelEncoder

In [218]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(df['classLabel'].values)
df['classLabel'] = y
df

Unnamed: 0,color,size,price,classLabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


## Dealing with catagorical data: OneHotCoding on nominal features

In [219]:
from sklearn.preprocessing import OneHotEncoder
X = df.values
color_le = LabelEncoder()
colors = color_le.fit_transform(X[:,0])
X[:,0] = colors
ohe = OneHotEncoder(categorical_features=[0])
X = ohe.fit_transform(X).toarray()
print X

[[  0.    1.    0.    1.   10.1   0. ]
 [  0.    0.    1.    2.   13.5   1. ]
 [  1.    0.    0.    3.   15.3   0. ]]


## Dealing with catagorical data: get_dummies on nominal features

In [220]:
# Note that the columns for color will automaticall be at the end
pd.get_dummies(df[['color', 'price', 'size', 'classLabel']])

Unnamed: 0,price,size,classLabel,color_blue,color_green,color_red
0,10.1,1,0,0,1,0
1,13.5,2,1,0,0,1
2,15.3,3,0,1,0,0


## Wine dataset: Partitioning test and train data

In [221]:
df_wine = pd.read_csv('/home/shantanu/PycharmProjects/PythonMLBook/Chapter4/wine.csv')
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 
                   'Alcalinity of ash', 'Magnesium', 'Total phenols', 
                   'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
from sklearn.cross_validation import train_test_split
X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Wine dataset: Feature scaling with MinMaxScaler

In [222]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(X_test)
X_train_norm_mms = mms.transform(X_train)
X_test_norm_mms  = mms.transform(X_test)

## Wine dataset: Feature scaling with StandardScaler

In [223]:
from sklearn.preprocessing import StandardScaler
std = StandardScaler()
std.fit(X_test)
X_train_std = std.transform(X_train)
X_test_std  = std.transform(X_test)

## Wine dataset: Logistic regression with L1 norm regularization 

In [224]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=0.1)
lr.fit(X_train_std, y_train)
print('Training accuracy: ', lr.score(X_train_std, y_train))
print('Testing accuracy: ',  lr.score(X_test_std,  y_test))

('Training accuracy: ', 0.97580645161290325)
('Testing accuracy: ', 0.98148148148148151)


## Feature selection: Sequential Backward Selection

In [237]:
from sklearn.metrics import accuracy_score
class SBS():
    def __init__(self, estimator, k_features, 
                 scoring = accuracy_score, test_size=0.25,
                 random_state = 1):
        self.estimator      = estimator
        self.k_features     = k_features 
        self.scoring        = scoring
        self.test_size      = test_size
        self.random_state   = random_state
    
    def fit(self, X, y):
        # Finds the indeces of the features which needs to be kept
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size = self.test_size, 
                                                            random_state=self.random_state)
        std = StandardScaler()
        std.fit(X_train)
        X_train_std = std.transform(X_train)
        X_test_std  = std.transform(X_test)
        
        self.dim_ = X.shape[1]
        self.all_indices_remaining_ = np.arange(self.dim_)
        dim = self.dim_
        
        while dim > self.k_features:
            dictFeatureScore = {}
            for idx, feature in enumerate(self.all_indices_remaining_):
                currentFeatures = np.delete(self.all_indices_remaining_, idx)
                score = self._calc_score(X_train, X_test, y_train, y_test, currentFeatures)
                dictFeatureScore[feature] = score
            
            feature_with_max_score = self._get_key_with_max_score(dictFeatureScore)
            feature_index = np.where(self.all_indices_remaining_==feature_with_max_score)[0]
            self.all_indices_remaining_ = np.delete(self.all_indices_remaining_, feature_index)
            dim = dim - 1
            print self.all_indices_remaining_
            

            
    def _get_key_with_max_score(self, dictFeatureScore):
        k = list(dictFeatureScore.keys())
        v = list(dictFeatureScore.values())
        return k[v.index(max(v))]
        
    def _calc_score(self, X_train, X_test, y_train, y_test, indices):
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_pred, y_test)
        return score
            
            

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
sbs = SBS(knn, k_features=1)
#print X, y
sbs.fit(X, y)

print df_wine.columns
df_wine.columns[1:][[0, 2, 8, 10, 11]]

[ 0  1  2  3  4  5  6  7  8  9 10 11]
[ 0  1  2  3  5  6  7  8  9 10 11]
[ 0  2  3  5  6  7  8  9 10 11]
[ 0  2  5  6  7  8  9 10 11]
[ 0  2  5  6  7  8 10 11]
[ 0  2  6  7  8 10 11]
[ 0  2  7  8 10 11]
[ 0  2  8 10 11]
[ 0  2 10 11]
[ 0 10 11]
[ 0 11]
[0]
Index([u'Class label', u'Alcohol', u'Malic acid', u'Ash', u'Alcalinity of ash',
       u'Magnesium', u'Total phenols', u'Flavanoids', u'Nonflavanoid phenols',
       u'Proanthocyanins', u'Color intensity', u'Hue',
       u'OD280/OD315 of diluted wines', u'Proline'],
      dtype='object')


Index([u'Alcohol', u'Ash', u'Proanthocyanins', u'Hue',
       u'OD280/OD315 of diluted wines'],
      dtype='object')