# [Categorical Feature Encoding Challenge II](https://www.kaggle.com/c/cat-in-the-dat-ii)

Author: Raquel Aoki

This is a binary classification problem that uses a large dataset made of categorical features.
It is based on the "Cat in the Dat" problem.

My pipeline is:
1. Join the training and testing sets to transform the features from categorical to dummies;
2. Split the datasets again;
3. Test several models (SVM, Random Forest, NN, Logistic Regression);
4. Compare their results.

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
import tensorflow as tf
from tensorflow import keras
    
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
'''Loading files'''
# All competition files are read from the same input directory.
input_dir = "../input/cat-in-the-dat-ii"

# A subsample was used to speed up the tests: when `testing` is True only a
# fixed-seed sample of 300000 training rows is kept. The test set and the
# sample submission are loaded in full in both modes.
testing = True
train = pd.read_csv(os.path.join(input_dir, "train.csv"))
if testing:
    train = train.sample(n=300000, random_state=1)
test = pd.read_csv(os.path.join(input_dir, "test.csv"))
sample = pd.read_csv(os.path.join(input_dir, "sample_submission.csv"))
print(train.shape, test.shape)

# Aggregating the features to transform the categorical variables:
# train and test rows are stacked into one frame so they are encoded together.
# Test rows carry the sentinel target -1 so they can be told apart afterwards.
# `assign` returns a new frame, so the raw `test` frame is not modified.
df_train = train
df_test = test.assign(target=-1)
data = pd.concat([df_train, df_test], ignore_index=True)
print(data.shape, df_train.shape, df_test.shape)

In [None]:
# Transforming the categorical variables in dummies.
# `id` and `target` are set aside first so they are not one-hot encoded.
did = data['id'].values
dtarget = data['target'].values
data.drop(['id', 'target'], axis=1, inplace=True)
columns = list(data.columns)
# Every remaining column is encoded; the first level of each is dropped.
# Because `dummy_na` is left at its default, a missing value gets no indicator
# column and is encoded as all zeros for that feature, so the result holds no
# NaN. The previous `data_new.fillna(0)` call discarded its return value (it
# changed nothing) and is therefore removed.
data_new = pd.get_dummies(data, columns=columns, drop_first=True, sparse=True)
del data  # release the un-encoded frame before continuing

# adding back the id and target variables
data_new['id'] = did
data_new['target'] = dtarget
print(data_new.shape)

In [None]:
'''Splitting the train and testing set after the data transformation'''
# Rows whose target is the sentinel -1 came from the test file; all other rows
# are labelled training rows.
is_labelled = data_new.target != -1
labelled = data_new[is_labelled].reset_index(drop=True)
unlabelled = data_new[~is_labelled].reset_index(drop=True)

# y: labels of the training rows; X / X_: feature matrices (without the
# `target` and `id` columns) of the training rows and the test rows.
y = np.array(labelled.target)
X = labelled.drop(['target', 'id'], axis=1).to_numpy()
X_ = unlabelled.drop(['target', 'id'], axis=1).to_numpy()
del labelled, unlabelled  # intermediate frames are no longer needed

# Hold out 33% of the labelled rows for validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
'''
Model 1: score 0.68 SVM + using a balanced dataset  
Model 2: score 0.66 RF + using the full dataset 
Model 3: score ?? NN using full dataset 
Model 4: score 0.69 
'''
# Selector for the classifier trained in this cell:
#   'm1' -> SVM, 'm2' -> random forest, 'm3' -> neural network,
#   any other value (including the current 'm5') -> logistic regression.
# NOTE(review): "Model 4" above presumably refers to the logistic-regression
# branch - confirm.
# Each branch produces three arrays: `y_val` (predictions on X_test),
# `y_train_val` (predictions on X_train) and `test_preds` (predictions on X_).
model = 'm5'

if model == 'm1':
    clf = svm.SVC(kernel='rbf', gamma=0.1, C=0.3)
    clf.fit(X_train, y_train)
    # `predict` returns class labels for this branch.
    y_val = clf.predict(X_test)
    y_train_val = clf.predict(X_train)
    test_preds = clf.predict(X_)
elif model == 'm2':
    rf = RandomForestClassifier(n_estimators=500, max_depth=30, class_weight={1: 5}, random_state=42)
    rf.fit(X_train, y_train)
    # `predict` returns class labels for this branch.
    y_val = rf.predict(X_test)
    y_train_val = rf.predict(X_train)
    test_preds = rf.predict(X_)
elif model == 'm3':
    # The network gets its own name so the `model` selector is not overwritten.
    nn = keras.Sequential([
        keras.layers.Flatten(input_shape=(X_train.shape[1],)),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(16, activation=tf.nn.relu),
        keras.layers.Dense(1, activation=tf.nn.sigmoid),
    ])

    nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # NOTE(review): batch_size=1 performs one update per training row for 50
    # epochs; this is likely very slow on the full data - confirm it is intended.
    nn.fit(X_train, y_train, epochs=50, batch_size=1)
    # The single output unit yields one column per row; flatten to 1-D so the
    # outputs have the same shape as in the other branches.
    y_val = nn.predict(X_test).ravel()
    y_train_val = nn.predict(X_train).ravel()
    test_preds = nn.predict(X_).ravel()
else:
    lr_cv = LogisticRegressionCV(Cs=7, solver="lbfgs", tol=0.0001, max_iter=3000, cv=3)
    lr_cv.fit(X_train, y_train)
    # Column 1 of `predict_proba` is kept: the probability of the positive class.
    y_val = lr_cv.predict_proba(X_test)[:, 1]
    y_train_val = lr_cv.predict_proba(X_train)[:, 1]
    test_preds = lr_cv.predict_proba(X_)[:, 1]

In [None]:
'''Evaluation'''
# Predictions may be class labels or probabilities depending on the model,
# possibly shaped as a column; flatten them to 1-D first.
train_scores = np.ravel(y_train_val)
val_scores = np.ravel(y_val)

# Confusion matrices need class labels. Thresholding at 0.5 leaves 0/1 labels
# unchanged and turns probabilities into labels; a plain `astype(int)` would
# truncate every probability below 1.0 to class 0.
print(confusion_matrix(y_train, (train_scores >= 0.5).astype(int)))
print(confusion_matrix(y_test, (val_scores >= 0.5).astype(int)))

# AUC is computed from the ranking of the scores, so the raw (un-truncated)
# predictions are passed.
print("Overall AUC={}".format(metrics.roc_auc_score(y_test, val_scores)))

In [None]:
# Submission: write one row per test id with its predicted target.
print("Saving submission file")
test_ids = test['id'].values
submission_columns = {'id': test_ids, 'target': test_preds}
submission = pd.DataFrame(submission_columns)
submission.to_csv("submission.csv", index=False)
# Kept as the last expression so the first rows of the submission are shown.
submission.head()