### Team Member

*   Simaa Abumousa



In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Read the train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/cat-in-the-dat/train.csv",
                     "../input/cat-in-the-dat/test.csv")
)

##### Dropping bin_0
##### Ordinal encoding for the "ord" columns (ord_1 to ord_5); bin_3 and bin_4 are mapped to 0/1
##### One-hot encoding for the "nom", "day" and "month" columns

In [3]:
# Pull the label and the row identifiers out of the frames. `pop` returns the
# column and removes it from the frame in place, so afterwards `train` no
# longer contains 'target'/'id' and `test` no longer contains 'id'.
target = train.pop('target')
train_id = train.pop('id')
test_id = test.pop('id')

# Letter-coded binary flags -> 0/1.
mapper_bin_3 = dict(T=1, F=0)
mapper_bin_4 = dict(Y=1, N=0)

# Ordered categories -> 1-based rank. Each sequence below is written from the
# lowest rank to the highest, so the position in the sequence is the code.
mapper_ord_1 = {
    level: rank
    for rank, level in enumerate(
        ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'], start=1)
}
mapper_ord_2 = {
    level: rank
    for rank, level in enumerate(
        ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'], start=1)
}
# ord_3 uses the lowercase letters a..o, ord_4 the uppercase letters A..Z.
mapper_ord_3 = {
    letter: rank for rank, letter in enumerate('abcdefghijklmno', start=1)
}
mapper_ord_4 = {
    letter: rank
    for rank, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', start=1)
}

# Stack train on top of test so that both parts go through the same encoding
# steps. Row labels are not renumbered here: each part keeps its own labels.
all_data = pd.concat(objs=[train, test], axis=0)

In [4]:
# `day` and `month` are converted to strings so that they are treated as
# categorical labels rather than as numbers.
all_data['day'] = all_data['day'].apply(str)
all_data['month'] = all_data['month'].apply(str)

# Source column -> lookup table; the encoded copy is stored as '<column>_oe'.
value_maps = {
    'bin_3': mapper_bin_3,
    'bin_4': mapper_bin_4,
    'ord_1': mapper_ord_1,
    'ord_2': mapper_ord_2,
    'ord_3': mapper_ord_3,
    'ord_4': mapper_ord_4,
}

for source_col, mapping in value_maps.items():
    source = all_data[source_col]
    # `Series.map` does a direct dictionary lookup per value. It is the
    # idiomatic (and much faster) way to recode a whole column, and unlike
    # `Series.replace` it does not depend on the silent object -> int
    # downcasting that newer pandas versions deprecate.
    encoded = source.map(mapping)

    # `map` yields NaN for any value missing from the lookup table. Fail loudly
    # instead of letting an unexpected category slip through; values that were
    # already missing in the source are left as missing.
    unmapped = (encoded.isna() & source.notna()).to_numpy()
    if unmapped.any():
        unknown = sorted(set(source[unmapped]))
        raise ValueError(
            f"Column {source_col!r} has values not covered by its mapper: {unknown}"
        )

    all_data[source_col + '_oe'] = encoded

In [5]:
#all_data['day']

In [6]:
# Integer-encode ord_5 with scikit-learn's OrdinalEncoder and keep the result
# as a one-column int64 frame named 'ord_5' (with default 0..n-1 row labels).
ordinal_encoder = OrdinalEncoder()
data_ord_encoded = ordinal_encoder.fit_transform(all_data[['ord_5']])
data_ord_encoded_PD = pd.DataFrame(
    data_ord_encoded,
    columns=['ord_5'],
    dtype="int64",
)

In [7]:
# Drop bin_0 together with the raw columns that now have encoded replacements,
# renumber the rows 0..n-1 so they line up with the encoded ord_5 frame, and
# attach that frame as the last column.
superseded_cols = ["bin_0", "bin_3", "bin_4", 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
all_data = pd.concat(
    [
        all_data.drop(columns=superseded_cols).reset_index(drop=True),
        data_ord_encoded_PD,
    ],
    axis=1,
)

In [8]:
%%time
# One Hot Encode
dummies = pd.get_dummies(all_data, drop_first=False, sparse=True, 
                        columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5','nom_7', 'nom_6', 'nom_8','nom_9',
                                'day', 'month'])
train_ohe = dummies.iloc[:train.shape[0], :]
test_ohe = dummies.iloc[train.shape[0]:, :]

print(train_ohe.shape)
print(test_ohe.shape)

train_ohe = train_ohe.astype(pd.SparseDtype("int", 0))
test_ohe = test_ohe.astype(pd.SparseDtype("int", 0))

train_ohe = train_ohe.sparse.to_coo().tocsr()
test_ohe = test_ohe.sparse.to_coo().tocsr()

(300000, 16305)
(200000, 16305)
CPU times: user 5min 9s, sys: 739 ms, total: 5min 9s
Wall time: 5min 10s


In [9]:
# Hyper-parameter dictionary.
# NOTE(review): the key names resemble gradient-boosting (LightGBM-style)
# settings, and no cell visible in this notebook reads `param` - confirm
# whether it is still needed.
param = dict(
    n_estimators=100000,
    boosting_type='gbdt',
    learning_rate=0.1,
    max_depth=3,
    metric='auc',
    min_data_in_leaf=30,
    min_split_gain=0.5,
    num_leaves=10,
    objective='binary',
    random_state=42,
    subsample=0.7,
)

In [10]:
# Settings of the final classifier, pulled out into names for readability.
inverse_reg_strength = 0.09968474250024324
class_weights = {0: 1, 1: 1.3267279323409777}

model = LogisticRegression(
    C=inverse_reg_strength,
    class_weight=class_weights,
    max_iter=10000,
    solver='liblinear',
)
model.fit(train_ohe, target)

# Keep only the second probability column, i.e. the probability of label 1.
predictions = model.predict_proba(test_ohe)[:, 1]

# Write the submission file (one row per test id) and preview the first rows.
submission = pd.DataFrame({'id': test_id, 'target': predictions})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,target
0,300000,0.407673
1,300001,0.748431
2,300002,0.154437
3,300003,0.50489
4,300004,0.895854
