In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm_notebook
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
train = pd.read_csv('cat_in_dat/train.csv')
test = pd.read_csv('cat_in_dat/test.csv')
test["target"] = -1
data = pd.concat([train, test]).reset_index(drop=True)

In [3]:
def convert_data_to_numeric(df):
    
    bin_3_mapping = {'T':1 , 'F':0}
    bin_4_mapping = {'Y':1 , 'N':0}
    nom_0_mapping = {'Red' : 0, 'Blue' : 1, 'Green' : 2}
    nom_1_mapping = {'Trapezoid' : 0, 'Star' : 1, 'Circle': 2, 'Triangle' : 3, 'Polygon' : 4}
    nom_2_mapping = {'Hamster' : 0 , 'Axolotl' : 1, 'Lion' : 2, 'Dog' : 3, 'Cat' : 4, 'Snake' : 5}
    nom_3_mapping = {'Russia' : 0, 'Canada' : 1, 'Finland' : 2, 'Costa Rica' : 3, 'China' : 4, 'India' : 5}
    nom_4_mapping = {'Bassoon' : 0, 'Theremin' : 1, 'Oboe' : 2, 'Piano' : 3}
    nom_5_mapping = dict(zip((df.nom_5.dropna().unique()), range(len((df.nom_5.dropna().unique())))))
    nom_6_mapping = dict(zip((df.nom_6.dropna().unique()), range(len((df.nom_6.dropna().unique())))))
    nom_7_mapping = dict(zip((df.nom_7.dropna().unique()), range(len((df.nom_7.dropna().unique())))))
    nom_8_mapping = dict(zip((df.nom_8.dropna().unique()), range(len((df.nom_8.dropna().unique())))))
    nom_9_mapping = dict(zip((df.nom_9.dropna().unique()), range(len((df.nom_9.dropna().unique())))))
    ord_1_mapping = {'Novice' : 0, 'Contributor' : 1, 'Expert' : 2, 'Master': 3, 'Grandmaster': 4}
    ord_2_mapping = { 'Freezing': 0, 'Cold': 1, 'Warm' : 2, 'Hot': 3, 'Boiling Hot' : 4, 'Lava Hot' : 5}
    ord_3_mapping = {'a':0, 'b':1, 'c':2 ,'d':3 ,'e':4, 'f':5, 'g':6, 'h':7, 'i':8, 'j':9, 'k':10, 'l':11, 'm':12, 'n':13, 'o':14}
    ord_4_mapping = {'A':0, 'B':1, 'C':2, 'D':3, 'E':4, 'F':5, 'G':6, 'H':7, 'I':8, 'J':9, 'K':10,'L':11,'M':12,
                 'N':13,'O':14,'P':15,'Q':16,'R':17,'S':18,'T':19,'U':20,'V':21,'W':22,'X':23,'Y':24,'Z':25}
    sorted_ord_5 = sorted(df.ord_5.dropna().unique())
    ord_5_mapping = dict(zip(sorted_ord_5, range(len(sorted_ord_5))))

    df['bin_3'] = df.loc[df.bin_3.notnull(), 'bin_3'].map(bin_3_mapping)
    df['bin_4'] = df.loc[df.bin_4.notnull(), 'bin_4'].map(bin_4_mapping)
    df['nom_0'] = df.loc[df.nom_0.notnull(), 'nom_0'].map(nom_0_mapping)
    df['nom_1'] = df.loc[df.nom_1.notnull(), 'nom_1'].map(nom_1_mapping)
    df['nom_2'] = df.loc[df.nom_2.notnull(), 'nom_2'].map(nom_2_mapping)
    df['nom_3'] = df.loc[df.nom_3.notnull(), 'nom_3'].map(nom_3_mapping)
    df['nom_4'] = df.loc[df.nom_4.notnull(), 'nom_4'].map(nom_4_mapping)
    df['nom_5'] = df.loc[df.nom_5.notnull(), 'nom_5'].map(nom_5_mapping)
    df['nom_6'] = df.loc[df.nom_6.notnull(), 'nom_6'].map(nom_6_mapping)
    df['nom_7'] = df.loc[df.nom_7.notnull(), 'nom_7'].map(nom_7_mapping)
    df['nom_8'] = df.loc[df.nom_8.notnull(), 'nom_8'].map(nom_8_mapping)
    df['nom_9'] = df.loc[df.nom_9.notnull(), 'nom_9'].map(nom_9_mapping)
    df['ord_1'] = df.loc[df.ord_1.notnull(), 'ord_1'].map(ord_1_mapping)
    df['ord_2'] = df.loc[df.ord_2.notnull(), 'ord_2'].map(ord_2_mapping)
    df['ord_3'] = df.loc[df.ord_3.notnull(), 'ord_3'].map(ord_3_mapping)
    df['ord_4'] = df.loc[df.ord_4.notnull(), 'ord_4'].map(ord_4_mapping)
    df['ord_5'] = df.loc[df.ord_5.notnull(), 'ord_5'].map(ord_5_mapping)
    
    return df

In [4]:
features = [feat for feat in train.columns if feat not in ['id','target']]

for col in features:
    train_unique_values = set(train[col].dropna().unique())
    test_unique_values  = set(test[col].dropna().unique())

    symmetric_difference_values = train_unique_values.symmetric_difference(test_unique_values)
    if symmetric_difference_values:
        print(f'{len(symmetric_difference_values)} values in {col}, {symmetric_difference_values} Replaced with nan')
        data.loc[data[col].isin(symmetric_difference_values), col] = np.nan

1 values in nom_5, {'b3ad70fcb'} Replaced with nan
4 values in nom_6, {'ee6983c6d', 'a885aacec', '3a121fefb', 'f0732a795'} Replaced with nan
2 values in nom_9, {'3d19cd31d', '1065f10dd'} Replaced with nan


In [5]:
data = data.fillna('miiissing')
train = data[data.target != -1].reset_index(drop=True)
test  = data[data.target == -1].reset_index(drop=True)

# train, valid = train_test_split(train, test_size=0.20, random_state=42)

In [6]:
for col in features:
    mapping = pd.DataFrame(pd.crosstab(train[col], train.target, normalize='index')[1]).to_dict()[1]
    train[col] = train[col].map(mapping)
    #valid[col] = valid[col].map(mapping)
    test[col] = test[col].map(mapping)

In [None]:
score_train = train[features].mean(axis=1)
score_valid = valid[features].mean(axis=1)

In [None]:
print('score train : '+ str(roc_auc_score(train.target.values, score_train.values)))
print('score valid : '+ str(roc_auc_score(valid.target.values, score_valid.values)))

In [None]:
score_train = train[features].product(axis=1)
score_valid = valid[features].product(axis=1)

In [None]:
print('score train : '+ str(roc_auc_score(train.target.values, score_train.values)))
print('score valid : '+ str(roc_auc_score(valid.target.values, score_valid.values)))

In [31]:
N_Splits = 10
SEED = 5
target = 'target'

In [17]:
oof_pred = np.zeros((len(train), ))
y_pred = np.zeros((len(test),))
X_test = test.copy()

skf = StratifiedKFold(n_splits=N_Splits, shuffle=True, random_state=SEED)
for fold, (tr_ind, val_ind) in tqdm_notebook(enumerate(skf.split(train, train[target]))):
    X_train, X_val = train[features].iloc[tr_ind], train[features].iloc[val_ind]
    y_train, y_val = train[target].iloc[tr_ind], train[target].iloc[val_ind]
    for col in features:
        mapping = pd.DataFrame(pd.crosstab(X_train[col], y_train, normalize='index')[1]).to_dict()[1]
        X_val[col] = X_val[col].map(mapping)
        X_test[col] = X_test[col].map(mapping)
    val_pred = X_val[features].product(axis=1)
    oof_pred[val_ind] = val_pred.ravel()
    y_pred += X_test[features].product(axis=1).ravel() / (N_Splits)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [18]:
print('score oof : '+ str(roc_auc_score(train.target.values, oof_pred)))

score oof : 0.7843426241522173


In [32]:
oof_pred = np.zeros((len(train), ))
y_pred = np.zeros((len(test),))

skf = StratifiedKFold(n_splits=N_Splits, shuffle=True, random_state=SEED)
for fold, (tr_ind, val_ind) in tqdm_notebook(enumerate(skf.split(train, train[target]))):
    X_test = test.copy()
    X_train, X_val = train[features].iloc[tr_ind], train[features].iloc[val_ind]
    y_train, y_val = train[target].iloc[tr_ind], train[target].iloc[val_ind]
    for col in features:
        mapping = pd.DataFrame(pd.crosstab(X_train[col], y_train, normalize='index')[1]).to_dict()[1]
        X_train[col] = X_train[col].map(mapping)
        X_val[col] = X_val[col].map(lambda x: mapping[x] if x in mapping.keys() else next(iter(mapping.values())))
        X_test[col] = X_test[col].map(lambda x: mapping[x] if x in mapping.keys() else next(iter(mapping.values())))
    lr = LogisticRegression(max_iter=300)
    lr.fit(X_train, y_train)
    val_pred = lr.predict_proba(X_val)[:,1]
    oof_pred[val_ind] = val_pred.ravel()
    y_pred += lr.predict_proba(X_test[features])[:,1].ravel() / (N_Splits)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [33]:
print('score oof : '+ str(roc_auc_score(train.target.values, oof_pred)))

score oof : 0.7849254570510114
