In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
# Read training data
TRAIN_FILEPATH="../input/train.csv"
TEST_FILEPATH="../input/train.csv"

# Read in train and test data
df_train=pd.read_csv(TRAIN_FILEPATH)
df_test=pd.read_csv(TEST_FILEPATH)

In [3]:
# Create fake target column for test data since this column
# doesnt exist
df_test.loc[:,'target']=-1

# Concatenate training and test data
df_all = pd.concat([df_train, df_test]).reset_index(drop=True)

In [5]:
# Make a list of features we are interested in and 
# id and target are not columns we want to encode
features = [col for col in df_all.columns if col not in ['id', 'target']]
features

['bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month']

In [6]:
# Loop over the features list
for feat in features:
    # Create new instance of label encoder for each feature
    lbl_enc=preprocessing.LabelEncoder()
    
    # We fillna with a string, and then convert all data, whether int or 
    # float or string, to a string type
    temp_col=df_all[feat].fillna('NONE').astype(str).values

    
    # We can use any fit transform here as we do not have any
    # extra test data here that we need to transform on separately
    # because we concatenated the data
    df_all.loc[:,feat]=lbl_enc.fit_transform(temp_col)
    

In [7]:
# Split the training and test data again

df_train = df_all[df_all.target != -1].reset_index(drop=True)
df_test = df_all[df_all.target == -1].reset_index(drop=True)

In [33]:
# This will not work in a live setting, in a live setting, we can set
# unrecognized types to a RARE data type

# First, we need to have a criterion for what counts as rare
# lets say that it's count less than 2000 (for this data set)

df_all.ord_4.fillna("NONE").reset_index(drop=True)


0          21
1          24
2          16
3           2
4           2
           ..
1199995    18
1199996    13
1199997     7
1199998    24
1199999    15
Name: ord_4, Length: 1200000, dtype: int64

In [34]:
df_all.loc[
    df_all['ord_4'].value_counts()[df_all['ord_4']] < 2000,
    'ord_4'
] = "RARE"


ValueError: cannot reindex from a duplicate axis

In [31]:
df

21    False
24    False
16    False
2     False
2     False
      ...  
18    False
13    False
7     False
24    False
15    False
Name: ord_4, Length: 1200000, dtype: bool