
[Origin]("https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn")
Futher reading LabelEncoderwithScikitlearn.ipynb 


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from fancyimpute import MICE
import fancyimpute as fi

In [2]:
# this class to apply for multi columns
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

### Load titanic datasets

In [179]:
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url)
#train=pd.read_csv('train.csv')
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test= pd.read_csv(test_url)
df=pd.concat([train,test],axis=0)
df.head(3)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0.0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1.0,PC 17599
2,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1.0,STON/O2. 3101282


In [180]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


## Step 1: imputing missing values, see complete reference preprocessing_data.ipynb

In [181]:
# Drop ticket and passengerId columns
df.drop('Ticket',axis=1,inplace=True)
#df.drop('PassengerId',axis=1,inplace=True)
key=df[['PassengerId']]
df.isnull().sum()

Age             263
Cabin          1014
Embarked          2
Fare              1
Name              0
Parch             0
PassengerId       0
Pclass            0
Sex               0
SibSp             0
Survived        418
dtype: int64

In [182]:
# Fill missing value of Fare attribe by mean on group Pclass
df['Fare'].fillna(df.groupby("Pclass")["Fare"].transform("mean"), inplace=True)
#Fill missing values of embarked with a most frequence columns' value
df['Embarked'].fillna(df['Embarked'].value_counts().index[0],inplace=True)
# Extract left character from Cabin column
df['Cabin']=df['Cabin'].str[0]

In [183]:
df['Name']=df['Name'].str.partition(',')[2]
df['Name']=df['Name'].str.partition('.')[0]
df['Name']=df['Name'].str.strip()
df.rename(columns={'Name':'Title'},inplace=True)
df['Title'].value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Mlle              2
Major             2
Ms                2
Jonkheer          1
Mme               1
the Countess      1
Lady              1
Sir               1
Dona              1
Capt              1
Don               1
Name: Title, dtype: int64

In [184]:
df['Title'].replace({'Dr':'Mr','Rev':'Mr'},inplace=True)
df['Title'].replace(to_replace=['Dona', 'Lady', 'the Countess'],value='Mrs',inplace=True)
df['Title'].replace(to_replace=['Jonkheer','Master','Capt', 'Don', 'Major', 'Col'],value='Sir',inplace=True)
df['Title'].replace(to_replace=['Mlle', 'Ms','Mme'],value='Miss',inplace=True)
df['Title'].value_counts()

Mr      773
Miss    265
Mrs     200
Sir      71
Name: Title, dtype: int64

In [185]:
# split data into numeric and categorical sets
def split_categories(df):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    df_n = df.select_dtypes(include=numerics)
    df_c = df.select_dtypes(exclude=numerics)
    
    return df_n, df_c
df_n,df_c= split_categories(df)

In [186]:
# Check numeric set
df_n.head(3)

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
0,22.0,7.25,0,1,3,1,0.0
1,38.0,71.2833,0,2,1,1,1.0
2,26.0,7.925,0,3,3,0,1.0


In [187]:
# Transform a whole categorical subset
temp=df_c[['Cabin']]
df_c=df_c[['Embarked','Title','Sex']].apply(LabelEncoder().fit_transform)
df_c=pd.concat([df_c,temp,key],axis=1)
df_c.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId
0,2,1,1,,1
1,0,2,0,C,2
2,2,0,0,,3


In [188]:
# split into train and test set with Cabin
df_c_train= df_c[df_c['Cabin'].notnull()]
df_c_test = df_c[df_c['Cabin'].isnull()]
df_c_y= df_c_train[['Cabin']]  # get train label
df_c_y.head(3)

Unnamed: 0,Cabin
1,C
3,C
6,E


In [189]:
df_c_train.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId
1,0,2,0,C,2
3,2,2,0,C,4
6,2,1,1,E,7


In [190]:
#df['Cabin'].loc[df['Cabin'].notnull()]=
df_c_test.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId
0,2,1,1,,1
2,2,0,0,,3
4,2,1,1,,5


In [191]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(df_c_train[df_c_train.columns.difference(['Cabin','PassengerId'])], df_c_train['Cabin'].str[0])
pred=rf.predict(df_c_test[df_c_test.columns.difference(['Cabin','PassengerId'])])

In [192]:
# Need to modify 
df_c_test.loc[:,'Cabin']=pred


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [193]:
df_c_test.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId
0,2,1,1,C,1
2,2,0,0,C,3
4,2,1,1,C,5


In [194]:
df_c=pd.concat([df_c_train,df_c_test],ignore_index=True)
df_c.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId
0,0,2,0,C,2
1,2,2,0,C,4
2,2,1,1,E,7


In [195]:
df_n.head(3)

Unnamed: 0,Age,Fare,Parch,PassengerId,Pclass,SibSp,Survived
0,22.0,7.25,0,1,3,1,0.0
1,38.0,71.2833,0,2,1,1,1.0
2,26.0,7.925,0,3,3,0,1.0


In [196]:
df=pd.merge(df_c,df_n,on="PassengerId")
df['Cabin']=df[['Cabin']].apply(LabelEncoder().fit_transform)
df.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,PassengerId,Age,Fare,Parch,Pclass,SibSp,Survived
0,0,2,0,2,2,38.0,71.2833,0,1,1,1.0
1,2,2,0,2,4,35.0,53.1,0,1,1,1.0
2,2,1,1,4,7,54.0,51.8625,0,1,0,0.0


In [197]:
df.isnull().sum()

Embarked         0
Title            0
Sex              0
Cabin            0
PassengerId      0
Age            263
Fare             0
Parch            0
Pclass           0
SibSp            0
Survived       418
dtype: int64

In [198]:
# Now we drop PassengerId 
df.drop('PassengerId',axis=1,inplace=True)

In [199]:
df.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,Age,Fare,Parch,Pclass,SibSp,Survived
0,0,2,0,2,38.0,71.2833,0,1,1,1.0
1,2,2,0,2,35.0,53.1,0,1,1,1.0
2,2,1,1,4,54.0,51.8625,0,1,0,0.0


## Fill missing age with fancyimpute

In [200]:
from fancyimpute import KNN

In [201]:
df.columns.values

array(['Embarked', 'Title', 'Sex', 'Cabin', 'Age', 'Fare', 'Parch',
       'Pclass', 'SibSp', 'Survived'], dtype=object)

In [202]:
target=df[['Survived']]
target.head(3)

Unnamed: 0,Survived
0,1.0
1,1.0
2,0.0


In [203]:
df.drop('Survived',axis=1,inplace=True)
df.head(3)

Unnamed: 0,Embarked,Title,Sex,Cabin,Age,Fare,Parch,Pclass,SibSp
0,0,2,0,2,38.0,71.2833,0,1,1
1,2,2,0,2,35.0,53.1,0,1,1
2,2,1,1,4,54.0,51.8625,0,1,0


In [204]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 1308
Data columns (total 9 columns):
Embarked    1309 non-null int64
Title       1309 non-null int64
Sex         1309 non-null int64
Cabin       1309 non-null int64
Age         1046 non-null float64
Fare        1309 non-null float64
Parch       1309 non-null int64
Pclass      1309 non-null int64
SibSp       1309 non-null int64
dtypes: float64(2), int64(7)
memory usage: 102.3 KB


In [205]:
# Filter numeric data from missing value data frame, convert to numpy array with as_matrix required by fancyimpute

df_numeric = df.select_dtypes(include=['int16', 'int32', 'int64', 'float16', 'float32', 'float64'])
df_filled = pd.DataFrame(KNN(3).complete(df_numeric.as_matrix()))

Imputing row 1/1309 with 0 missing, elapsed time: 0.425
Imputing row 101/1309 with 0 missing, elapsed time: 0.426
Imputing row 201/1309 with 0 missing, elapsed time: 0.427
Imputing row 301/1309 with 0 missing, elapsed time: 0.427
Imputing row 401/1309 with 0 missing, elapsed time: 0.428
Imputing row 501/1309 with 0 missing, elapsed time: 0.429
Imputing row 601/1309 with 0 missing, elapsed time: 0.430
Imputing row 701/1309 with 0 missing, elapsed time: 0.431
Imputing row 801/1309 with 0 missing, elapsed time: 0.432
Imputing row 901/1309 with 0 missing, elapsed time: 0.433
Imputing row 1001/1309 with 0 missing, elapsed time: 0.434
Imputing row 1101/1309 with 0 missing, elapsed time: 0.435
Imputing row 1201/1309 with 0 missing, elapsed time: 0.436
Imputing row 1301/1309 with 0 missing, elapsed time: 0.437


In [206]:
df_filled.columns= df_numeric.columns
df_filled.index= df_numeric.index
df_filled.head()

Unnamed: 0,Embarked,Title,Sex,Cabin,Age,Fare,Parch,Pclass,SibSp
0,0.0,2.0,0.0,2.0,38.0,71.2833,0.0,1.0,1.0
1,2.0,2.0,0.0,2.0,35.0,53.1,0.0,1.0,1.0
2,2.0,1.0,1.0,4.0,54.0,51.8625,0.0,1.0,0.0
3,2.0,0.0,0.0,6.0,4.0,16.7,1.0,3.0,1.0
4,2.0,0.0,0.0,2.0,58.0,26.55,0.0,1.0,0.0


In [212]:
df=pd.concat([df_filled,target],axis=1)
df

Unnamed: 0,Embarked,Title,Sex,Cabin,Age,Fare,Parch,Pclass,SibSp,Survived
0,0.0,2.0,0.0,2.0,38.000000,71.2833,0.0,1.0,1.0,1.0
1,2.0,2.0,0.0,2.0,35.000000,53.1000,0.0,1.0,1.0,1.0
2,2.0,1.0,1.0,4.0,54.000000,51.8625,0.0,1.0,0.0,0.0
3,2.0,0.0,0.0,6.0,4.000000,16.7000,1.0,3.0,1.0,1.0
4,2.0,0.0,0.0,2.0,58.000000,26.5500,0.0,1.0,0.0,1.0
5,2.0,1.0,1.0,3.0,34.000000,13.0000,0.0,2.0,0.0,1.0
6,2.0,1.0,1.0,0.0,28.000000,35.5000,0.0,1.0,0.0,1.0
7,2.0,1.0,1.0,2.0,19.000000,263.0000,2.0,1.0,3.0,0.0
8,0.0,2.0,0.0,1.0,55.994543,146.5208,0.0,1.0,1.0,1.0
9,0.0,2.0,0.0,3.0,49.000000,76.7292,0.0,1.0,1.0,1.0
