# Example of transforming categories into numeric with Vote dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
#https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
class MultiColumnLabelEncoder:
    '''
    Label-encode several DataFrame columns, using one LabelEncoder per column.

    fit() learns the label -> integer mapping of every target column and
    transform() re-applies those learned mappings, so that different frames
    passed through the same fitted instance (e.g. the training and test data
    inside a Pipeline) receive the same codes.
    '''
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode; None means all columns

    def _target_columns(self,X):
        '''Return the names of the columns of X that are to be encoded.'''
        if self.columns is not None:
            return list(self.columns)
        return list(X.columns)

    def fit(self,X,y=None):
        '''
        Fit one LabelEncoder per target column of X and keep them in
        self.encoders_ (a dict keyed by column name).

        X is a DataFrame containing the target columns; y is ignored and only
        accepted so the class can be used as a Pipeline step.
        Returns self.
        '''
        self.encoders_ = {col: LabelEncoder().fit(X[col])
                          for col in self._target_columns(X)}
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.

        Columns for which fit() stored an encoder are transformed with that
        encoder; LabelEncoder raises ValueError for labels it did not see
        during fit. Columns without a stored encoder (including the case where
        fit() was never called) are fitted and transformed on the fly, which is
        the previous behaviour of this method.

        Returns an encoded copy; X itself is not modified.
        '''
        output = X.copy()
        encoders = getattr(self,'encoders_',{})
        for col in self._target_columns(output):
            if col in encoders:
                output[col] = encoders[col].transform(output[col])
            else:
                # No learned mapping for this column: derive one from the data at hand.
                output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self,X,y=None):
        '''Fit the per-column encoders on X and return the encoded copy of X.'''
        return self.fit(X,y).transform(X)

In [3]:
# Load the voting records from a local CSV file and preview the first rows.
# NOTE(review): the resulting column labels ('republican', 'n', 'y', 'n.1', ...) look like
# data values, so the file presumably has no header row and its first record is being
# consumed as the header - confirm, and pass header=None / names=... if so.
# NOTE(review): absolute, machine-specific path; the notebook cannot run elsewhere as-is.
df = pd.read_csv("c:/dataset/house_votes_84.csv")
df.head(3)

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
1,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y


### Alternative: convert the missing-value markers to NaN while reading
df=pd.read_csv('c:/dataset/house_votes_84.csv', na_values=['?'])

In [4]:
# Turn every '?' placeholder into a real missing value (NaN).
# np.nan is spelled in lower case: the np.NaN alias was removed in NumPy 2.0.
# Equivalent alternative: df = df.replace('?', np.nan)
df[df=='?']=np.nan
df.head(3)

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
1,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y


In [5]:
# Count the missing values (NaN) in every column.
df.isnull().sum()

republican      0
n              12
y              48
n.1            11
y.1            11
y.2            15
y.3            11
n.2            14
n.3            15
n.4            22
y.4             7
?              20
y.5            31
y.6            25
y.7            17
n.5            28
y.8           104
dtype: int64

In [28]:
# Split the columns into a numeric group and a non-numeric (categorical) group by dtype,
# then show the shape of each group.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
df_num = df.select_dtypes(include=numerics)
df_cat = df.select_dtypes(exclude=numerics)
df_num.shape,df_cat.shape

((434, 0), (434, 17))

In [6]:
# Rename the column labelled '?' to 'unknown' (df is modified in place),
# then list the resulting column labels.
df.rename(columns={'?':'unknown'},inplace=True)
df.columns.values

array(['republican', 'n', 'y', 'n.1', 'y.1', 'y.2', 'y.3', 'n.2', 'n.3',
       'n.4', 'y.4', 'unknown', 'y.5', 'y.6', 'y.7', 'n.5', 'y.8'], dtype=object)

### Approach 1: Fill missing values with the most frequent value of each column

In [75]:
# Impute each column's missing entries with that column's most frequent value.
# This is only a simple baseline: the most common value is not always a sensible fill.
# apply() builds a new frame, so df itself is left unchanged.
df1 = df.apply(lambda col: col.fillna(col.value_counts().index[0]))
df1.head(3)

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,unknown,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
1,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,n,n,y


In [17]:
# Replace the string labels in every column of df1 (label column included) by integer codes.
df1=MultiColumnLabelEncoder().fit_transform(df1)
df1.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,unknown,y.5,y.6,y.7,n.5,y.8
0,1,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
1,0,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
2,0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1
3,0,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1
4,0,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1


In [47]:
# Hold out 30% of the rows for testing. The features are all columns except the
# 'republican' label column; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df1[df1.columns.difference(['republican'])],df1[['republican']],test_size=0.3,random_state=42)
clf=SVC()
# y_train is a one-column DataFrame here, so it is flattened to 1-D before fitting.
clf.fit(X_train,y_train.values.ravel())
y_pred= clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out rows.
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.99      0.93      0.96        83
          1       0.89      0.98      0.93        48

avg / total       0.95      0.95      0.95       131



In [52]:
# Same experiment, but y (= df1['republican']) is passed as a pandas Series,
# so it is already 1-D and no ravel() is needed before fitting.
X_train, X_test, y_train, y_test = train_test_split(df1[df1.columns.difference(['republican'])],df1['republican'],test_size=0.3,random_state=42)
clf=SVC()
clf.fit(X_train,y_train)
y_pred= clf.predict(X_test)
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.99      0.93      0.96        83
          1       0.89      0.98      0.93        48

avg / total       0.95      0.95      0.95       131



In [49]:
# Accuracy of the predictions on the held-out test rows.
print(accuracy_score(y_test,y_pred))

0.946564885496


In [98]:
# Preview the first rows of df1.
# NOTE(review): the saved output shows string labels although an earlier cell replaces df1
# with integer codes, so this cell was presumably run against a re-created df1 - confirm
# the intended execution order.
df1.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,unknown,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
1,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,y,y,n,n,n,n,y,n,y,n,n,y
3,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y
4,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y


## Using pipeline

In [89]:
# Ingredients of the pipeline: label-encode the columns, standardize them, then fit an SVM.
mEn= MultiColumnLabelEncoder()
clf = SVC()

# Setup the pipeline with the required steps: steps
steps = [('encoding',mEn),('scaler', StandardScaler()),
         ('SVM',clf)]  
# Candidate SVM hyper-parameters; the 'SVM__' prefix refers to the step named 'SVM'.
# NOTE(review): neither steps nor para is used inside this cell.
para = {'SVM__C':[1, 10, 100], 'SVM__gamma':[0.1, 0.01]}

## Using pipeline without hyper parameter search

In [101]:
# Encode -> scale -> SVM as a single pipeline, fitted once with the default SVC settings.
mEn= MultiColumnLabelEncoder()
clf = SVC()

# Setup the pipeline with the required steps: steps
steps = [('encoding',mEn),('scaler', StandardScaler()),
         ('SVM',clf)] 

pipeline = Pipeline(steps)

# Create training and test sets (30% held out, fixed random_state for repeatability)
X_train, X_test, y_train, y_test = train_test_split(df1[df1.columns.difference(['republican'])],df1['republican'],test_size=0.3,random_state=42)
# Fit every step of the pipeline on the training rows only
pipeline.fit(X_train,y_train)
# Predict the labels of the test set
y_pred = pipeline.predict(X_test)
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.99      0.93      0.96        83
          1       0.89      0.98      0.93        48

avg / total       0.95      0.95      0.95       131



### Using pipeline  with gridSearch
Except for tree-based models, many ML algorithms require the features to be on the same scale; the common methods are **normalization** and **standardization**. Normalization rescales each feature to the range [0,1]; it is a special case of **min-max** scaling, $x_{scaled}=\frac{x-x_{min}}{x_{max}-x_{min}}$. E.g.:      
> from sklearn.preprocessing import MinMaxScaler   
> mms= MinMaxScaler()   
> X_train_norm = mms.fit_transform(X_train)   
> X_test_norm = mms.transform(X_test)   

**Standardization** is computed as $x_{std}=\frac{x-\mu}{\sigma}$ in following steps   
> from sklearn.preprocessing import StandardScaler     
> stdsc = StandardScaler()   
> X_train_std = stdsc.fit_transform(X_train)   
> X_test_std = stdsc.transform(X_test)  

In [103]:
# Encode all columns of df1 as integer codes up front; the pipeline below only scales and classifies.
df1=MultiColumnLabelEncoder().fit_transform(df1)

clf = SVC()
# Setup the pipeline with the required steps: steps
steps = [('scaler', StandardScaler()),
         ('SVM',clf)]  
# Grid of SVM hyper-parameters to try; the 'SVM__' prefix refers to the step named 'SVM'.
para = {'SVM__C':[0.1,1, 10, 100], 'SVM__gamma':[0.5,0.1, 0.01]}
pipeline = Pipeline(steps)

# Create training and test sets; stratify keeps the class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(df1[df1.columns.difference(['republican'])],df1['republican'],\
                                                    test_size=0.3,random_state=42, stratify = df1['republican'])
# Search the grid with 3-fold cross-validation on the training rows,
# then predict the labels of the test set with the fitted search object
cv = GridSearchCV(pipeline, para,cv=3)
cv.fit(X_train,y_train)
y_pred = cv.predict(X_test)
print(classification_report(y_test,y_pred))

             precision    recall  f1-score   support

          0       0.96      0.95      0.96        83
          1       0.92      0.94      0.93        48

avg / total       0.95      0.95      0.95       131



In [104]:
# Summarize the grid search: score on the test rows, per-class report, and the
# hyper-parameter combination that was selected.
print("Accuracy: {}".format(cv.score(X_test, y_test)))
print(classification_report(y_test, y_pred))
print("Tuned Model Parameters: {}".format(cv.best_params_))


Accuracy: 0.9465648854961832
             precision    recall  f1-score   support

          0       0.96      0.95      0.96        83
          1       0.92      0.94      0.93        48

avg / total       0.95      0.95      0.95       131

Tuned Model Parameters: {'SVM__C': 10, 'SVM__gamma': 0.1}


## Approach 2: fill missing values by group

In [7]:
# Fill missing values by its party
# Work on a copy of df and collect the names of the columns that contain at least one NaN.
df2=df.copy()
missing_cols= list(df2.columns[df2.isnull().any()])

In [8]:
# Within each group of the 'republican' column, fill a column's NaNs with the first
# mode of that column inside the group.
# NOTE(review): the grouping key is the label column, so the filled values depend on the label.
df2[missing_cols]=df2.groupby('republican')[missing_cols].\
transform(lambda x:x.fillna(x.mode()[0]))
df2.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,unknown,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
1,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,n,y,n,n,n,n,y,n,y,n,n,y
3,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y
4,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y


* Using Label Encoder

In [63]:
# One LabelEncoder per column: the defaultdict creates an encoder the first time a column
# name is looked up and keeps it in d under that name.
d = defaultdict(LabelEncoder)
# (df2.apply(LabelEncoder().fit_transform) would also encode, but without keeping the encoders.)

# Fit each column's own encoder and replace the column by its integer codes.
fit = df2.apply(lambda x: d[x.name].fit_transform(x))
fit

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,unknown,y.5,y.6,y.7,n.5,y.8
0,1,0,1,0,1,1,1,0,0,0,0,1,1,1,0,1,0
1,0,1,1,1,0,1,1,0,0,0,0,0,1,1,0,0,1
2,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,1
3,0,1,1,1,0,1,1,0,0,0,0,0,1,1,1,1,1
4,0,0,1,1,0,1,1,0,0,0,0,0,1,1,1,1,0
5,0,0,1,0,1,1,1,0,0,0,0,0,0,1,1,1,0
6,1,0,1,0,1,1,1,0,0,0,0,0,1,1,0,1,0
7,1,0,1,0,1,1,1,0,0,0,0,1,1,1,0,1,0
8,0,1,1,1,0,0,0,1,1,1,0,0,0,0,1,1,0
9,1,0,1,0,1,1,0,0,0,0,0,1,1,1,0,0,0


# Approach 3: Using category type with dummy variable from pandas
## Note: pandas provides a `category` dtype to convert a label column (of object type) into a categorical one   
e.g.: `df.label.astype('category')`  


In [9]:
# Reload the file, this time letting read_csv turn the '?' markers into NaN directly.
# NOTE(review): absolute, machine-specific path.
df=pd.read_csv('c:/dataset/house_votes_84.csv', na_values=['?'])
df.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,
1,democrat,,y,y,,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,,y,n,n,n,n,y,n,y,n,n,y
3,democrat,y,y,y,n,y,y,n,n,n,n,y,,y,y,y,y
4,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y


### Here, we fill the missing values with the most frequent value within each party

In [10]:
# Every column except 'republican' is a feature; within each 'republican' group, fill a
# feature's NaNs with the first mode of that feature inside the group.
# NOTE(review): 'republican' is the label that is predicted later in the notebook, and this
# imputation runs before the train/test split, so label information flows into the
# features; the scores reported for this approach may be optimistic.
features = df.drop('republican',axis=1).columns
df[features]=df.groupby('republican')[features].transform(lambda x:x.fillna(x.mode()[0]))
df.head()

Unnamed: 0,republican,n,y,n.1,y.1,y.2,y.3,n.2,n.3,n.4,y.4,?,y.5,y.6,y.7,n.5,y.8
0,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
1,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,n,n
2,democrat,n,y,y,n,n,y,n,n,n,n,y,n,y,n,n,y
3,democrat,y,y,y,n,y,y,n,n,n,n,y,n,y,y,y,y
4,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y


In [11]:
# Give the label column and the '?' column readable names (df is modified in place),
# then show the resulting column index.
df.rename(columns={'republican':'party','?':'unknown'},inplace=True)
df.columns

Index(['party', 'n', 'y', 'n.1', 'y.1', 'y.2', 'y.3', 'n.2', 'n.3', 'n.4',
       'y.4', 'unknown', 'y.5', 'y.6', 'y.7', 'n.5', 'y.8'],
      dtype='object')

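> The pandas function `get_dummies` converts all string columns into numeric indicator columns and leaves the other columns unchanged, but notice that every original column is expanded into one indicator column per category.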

In [51]:
# One-hot encode every vote column (one indicator column per category), for inspection only.
# The result is kept in its own variable instead of overwriting df: after an overwrite df
# would hold only numeric indicator columns, and pd.get_dummies converts only
# object/string/category columns, so the drop_first=True encoding in the following cell
# would find nothing left to encode or drop.
df_onehot=pd.concat([df[['party']],pd.get_dummies(df.drop('party',axis=1))],axis=1)
df_onehot.head()

Unnamed: 0,party,n_n,n_y,y_n,y_y,n.1_n,n.1_y,y.1_n,y.1_y,y.2_n,...,y.5_n,y.5_y,y.6_n,y.6_y,y.7_n,y.7_y,n.5_n,n.5_y,y.8_n,y.8_y
0,republican,1,0,0,1,1,0,0,1,0,...,0,1,0,1,0,1,1,0,0,1
1,democrat,0,1,0,1,0,1,1,0,0,...,1,0,0,1,0,1,1,0,1,0
2,democrat,1,0,0,1,0,1,1,0,1,...,1,0,0,1,1,0,1,0,0,1
3,democrat,0,1,0,1,0,1,1,0,0,...,1,0,0,1,0,1,0,1,0,1
4,democrat,1,0,0,1,0,1,1,0,0,...,1,0,0,1,0,1,0,1,0,1


> Recommendation: the one-hot encoding approach above may introduce multicollinearity, which can be an issue for matrix inversion. We can reduce the correlation among the variables by dropping one indicator column per feature after one-hot encoding. The remaining columns are still guaranteed to represent the original information.

In [13]:
# One-hot encode the vote columns with drop_first=True, i.e. the first indicator column of
# every feature is dropped; the 'party' label column is kept unchanged in front.
df=pd.concat([df[['party']],pd.get_dummies(df.drop('party',axis=1),drop_first=True)],axis=1)
df.head()

Unnamed: 0,party,n_y,y_y,n.1_y,y.1_y,y.2_y,y.3_y,n.2_y,n.3_y,n.4_y,y.4_y,unknown_y,y.5_y,y.6_y,y.7_y,n.5_y,y.8_y
0,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
1,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
2,democrat,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,1
3,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1
4,democrat,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1


In [14]:
# Store the party label with the pandas 'category' dtype and preview it.
df.party=df.party.astype('category')
df.party.head(2)

0    republican
1      democrat
Name: party, dtype: category
Categories (2, object): [democrat, republican]

### Note: the stratify option keeps the same proportion of class labels in both the training and the test set

In [17]:
# Scale -> SVM pipeline on the indicator features, tuned by grid search.
clf = SVC()
# Setup the pipeline with the required steps: steps
steps = [('scaler', StandardScaler()),
         ('SVM',clf)]  
# Grid of SVM hyper-parameters to try; the 'SVM__' prefix refers to the step named 'SVM'.
para = {'SVM__C':[0.1,1, 10, 100], 'SVM__gamma':[0.5,0.1, 0.01]}
pipeline = Pipeline(steps)

# Create training and test sets; the features are all columns except 'party', and
# stratify keeps the class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['party'])],df['party'],test_size=0.3,\
                                                    random_state=42,stratify=df['party'])
# Search the grid with 3-fold cross-validation on the training rows,
# then predict the labels of the test set with the fitted search object
cv = GridSearchCV(pipeline, para,cv=3)
cv.fit(X_train,y_train)
y_pred = cv.predict(X_test)
print(classification_report(y_test,y_pred))          

             precision    recall  f1-score   support

   democrat       0.98      0.99      0.98        81
 republican       0.98      0.96      0.97        50

avg / total       0.98      0.98      0.98       131

