In [1]:
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Locate the data. The directory defaults to the original author's local folder
# but can be overridden with the ROBI_DATA_DIR environment variable, so the
# notebook is not tied to a single machine.
DATA_DIR = Path(os.environ.get(
    "ROBI_DATA_DIR",
    "/Users/shahriar/Desktop/Code/DATA Sci/robi-datathon-2-pre-assessment",
))

# Read the data, using the 'id' column as the row index
X = pd.read_csv(DATA_DIR / "train.csv", index_col='id')
X_test = pd.read_csv(DATA_DIR / "test.csv", index_col='id')

# Remove rows with missing target, then separate target from predictors.
# Reassignment is used instead of inplace=True so every step yields a new frame.
X = X.dropna(axis=0, subset=['label'])
y = X['label']
X = X.drop(columns=['label'])

# To keep things simple, drop every column that has a missing value in the
# training data; the same columns are removed from the test data.
# NOTE(review): columns whose values are missing only in the test set are not
# detected here - confirm X_test has no remaining NaNs before predicting on it.
cols_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=cols_with_missing)
X_test = X_test.drop(columns=cols_with_missing)

# Break off a validation set from the training data (80/20 split, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [2]:
# Preview the first five rows of the training split
X_train.head(n=5)

Unnamed: 0_level_0,gender,s11,s12,s13,s16,s17,s18,s48,s52,s53,...,n6,n7,n8,n9,n10,n11,n12,n13,n14,n15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b'gAAAAABinOiWIha1c_SbUb_NQBfjuB0TIIZ2xoTn53btfCZgNPVZZRY-hCJP5Re6WTd1xWtZEMzfZb2bxjkdmxFYUUO31lg_zw==',F,Y,Y,1,C,D,B,0,1,,...,0.018791,-9.375363,1.399308,7.822798,3.480364,1.861671,0,0,0.973902,4
b'gAAAAABinOiZb5nXh2weXxI1-6ZKWFnkSuR_bEMzHv7ltnffCMm_AVAiPe_hBcuuPCHimy2hR-9ySM-uUOl0DSDVpKg1bSza_w==',M,N,Y,0,B,D,B,1,1,,...,0.006907,-9.230582,1.462323,2.87999,4.578135,1.795965,0,0,0.410341,1
b'gAAAAABinOiYvpHr2ws-N-KCoIzytjLPWKRf1wcuNanrSg7HXRVaVXEwcr0DjYVkC7V7s1Cjv4Xyq4hvTMmw8BN2zHeKdLWPcg==',M,Y,Y,1,D,D,B,1,1,,...,0.005607,-9.058071,1.495906,3.146324,9.531869,1.787582,0,1,0.360592,6
b'gAAAAABinOiWPzrC_h6nH0wu3k6S09WGtcxz8LWV3Vn9Sz48Gw9OITVKhsICIRaWJWqNAbXTByvIrFmxGCjgXimwudufM-WKuA==',M,N,Y,1,B,C,B,1,1,,...,0.015804,-9.284329,1.542638,6.272382,4.872482,1.569813,0,0,0.948543,2
b'gAAAAABinOiX2ss5U2ZT5zFd8zf7HcBDWPvbivNz5wAy6zrd3qi2I3cE2DHE2japRI_8efkRXs2QiAWxsET85tn6ifcWYCjfdA==',M,Y,Y,1,D,D,B,0,1,,...,0.007216,-8.651173,1.87746,3.637551,11.404942,1.862441,0,0,0.019793,5


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Shared scoring helper used to compare the different preprocessing approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A RandomForestRegressor with 100 trees and random_state=0 is fitted on
    (X_train, y_train); its predictions for X_valid are then compared with
    y_valid using mean_absolute_error.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [4]:
# Approach 1: keep only the non-object columns of each split
drop_X_train, drop_X_valid = (
    split.select_dtypes(exclude=['object']) for split in (X_train, X_valid)
)

In [5]:
# Report the validation MAE obtained with the object columns removed
print("MAE from Approach 1 (Drop categorical variables):")
approach_1_mae = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print(approach_1_mae)

MAE from Approach 1 (Drop categorical variables):
0.21903971756398943


In [8]:
# Object-dtype (categorical) columns of the training split, in column order
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely ordinal encoded: every value that occurs in the
# validation split also occurs in the training split, so an encoder fitted on
# the training split has seen all of them.
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

# Problematic columns that will be dropped from the dataset.
# Filtering object_cols (instead of taking a set difference) keeps the result
# in column order, so the printed list is identical on every run.
bad_label_cols = [col for col in object_cols if col not in good_label_cols]

print('Categorical columns that will be ordinal encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be ordinal encoded: ['gender', 's11', 's12', 's16', 's17', 's18', 's52', 's53', 's58', 's69', 's70', 's71']

Categorical columns that will be dropped from the dataset: []


In [10]:
from sklearn.preprocessing import OrdinalEncoder

# Build new frames for both splits without the columns that will not be encoded
label_X_train = X_train.drop(columns=bad_label_cols)
label_X_valid = X_valid.drop(columns=bad_label_cols)

# Fit the ordinal encoder on the training split only, then apply the fitted
# encoder to the encodable columns of both splits
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[good_label_cols])
label_X_train[good_label_cols] = ordinal_encoder.transform(X_train[good_label_cols])
label_X_valid[good_label_cols] = ordinal_encoder.transform(X_valid[good_label_cols])
     

In [11]:
# Report the validation MAE obtained with ordinal-encoded categorical columns
print("MAE from Approach 2 (Ordinal Encoding):")
approach_2_mae = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print(approach_2_mae)

MAE from Approach 2 (Ordinal Encoding):
0.18982347749338044
