In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    # directories without files print nothing at all
    if full_paths:
        print('\n'.join(full_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import all libraries
import warnings
warnings.simplefilter(action='ignore')

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import metrics
from sklearn import tree

In [None]:
def create_fold(data , n_splits = 6 , random_state = None):
    """Shuffle ``data`` and label every row with a stratified fold number.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a ``target`` column, which is used for stratification.
    n_splits : int, default 6
        Number of stratified folds to create.
    random_state : int or None, default None
        Seed for the shuffle; pass an int to get the same folds on every run.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``data`` (index reset) with an extra ``kfold``
        column holding the validation fold of each row. The frame passed in
        by the caller is left untouched.
    """
    # shuffle first: sample() returns a new frame, so adding the column
    # afterwards never writes into the caller's DataFrame
    data = data.sample(frac = 1 , random_state = random_state).reset_index(drop = True)
    data['kfold'] = -1

    # stratified so every fold keeps the class balance of the target
    kf = model_selection.StratifiedKFold(n_splits = n_splits)

    # only the validation indices of each split are needed to label the rows
    for fold , (_ , val_idx) in enumerate(kf.split(X = data , y = data.target.values)):
        data.loc[val_idx , 'kfold'] = fold

    return data

In [None]:
def run(data , fold):
    """Fit one-hot + logistic regression on all folds except ``fold`` and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``kfold`` and ``target`` columns (as produced by
        ``create_fold``). Every column other than ``id``, ``kfold`` and
        ``target`` is used as a categorical feature. The frame is not modified.
    fold : int
        Value of ``kfold`` to hold out for validation.

    Returns
    -------
    float
        ROC AUC on the held-out fold (also printed).
    """
    # except for target , id and kfold all are features
    features = [ i for i in data.columns if i not in ('kfold' , 'id' , 'target') ]

    # Every feature is treated as categorical: cast to string and label
    # missing values "NONE". The missing-value mask is taken from the raw
    # column because astype(str) turns NaN into the string "nan", after which
    # fillna() has nothing left to fill. Work on a copy so the caller's frame
    # is not rewritten on every fold.
    encoded = data[features].copy()
    for col in features:
        encoded[col] = data[col].astype(str).where(data[col].notna() , "NONE")

    is_val = (data.kfold == fold).values
    y = data.target.values

    # Fit the encoder on train + validation rows so that a category seen only
    # in the validation fold still has a column at transform time.
    # Sparse output is the encoder's default; the `sparse` keyword is left out
    # because newer scikit-learn releases renamed it to `sparse_output`.
    one_hot = preprocessing.OneHotEncoder()
    one_hot.fit(encoded)

    X_train = one_hot.transform(encoded[~is_val])
    X_val = one_hot.transform(encoded[is_val])

    # Logistic
    model = linear_model.LogisticRegression(solver = 'liblinear')
    model.fit(X_train , y[~is_val])
    # column 1 holds the predicted probability of the positive class
    y_pred = model.predict_proba(X_val)[:,1]

    # we'll use AUC score cause data is skewed
    auc = metrics.roc_auc_score(y[is_val] , y_pred)

    print(f"AUC score : {auc}")
    return auc

In [None]:
def data_visualize(data):
    """Draw a bar chart of the number of rows in each ``target`` class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``target`` column.
    """
    plt.figure(figsize = (5,5))
    # Pass the column as `x`: recent seaborn releases take the first
    # positional argument as `data`, not as the variable to count.
    sns.countplot(x = data.target)
    plt.xlabel('target' ,fontsize = 20)
    plt.ylabel('count' , fontsize = 20)

In [None]:
# if __name__ == '__main__':
#     df = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')
    
#     data_visualize(df)
    
#     # data is skewed so we should go for stratified k-fold
#     data = create_fold(df)
    
#     for fold_ in range(6):
#         run(data , fold_)
        
    # df.kfold.value_counts()
    # let's see target distribution in each folds , it's almost same
    # for fold in range(6):
    #     print(df[df.kfold == fold].target.value_counts())

In [None]:
# now let's use some tree based algo. and select one which is better to evaluate test set
def run_decision(data , fold):
    """Fit a decision tree on all folds except ``fold`` and score it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``kfold`` and ``target`` columns (as produced by
        ``create_fold``). Every column other than ``id``, ``kfold`` and
        ``target`` is used as a categorical feature. The frame is not modified.
    fold : int
        Value of ``kfold`` to hold out for validation.

    Returns
    -------
    float
        ROC AUC on the held-out fold (also printed).
    """
    features = [i for i in data.columns if i not in ('target' , 'kfold' , 'id')]

    # Label-encode every feature into a copy so repeated calls (one per fold)
    # always start from the caller's original values. The missing-value mask
    # comes from the raw column: astype(str) alone would turn NaN into "nan"
    # and leave fillna() with nothing to fill.
    encoded = data[features].copy()
    for col in features:
        as_text = data[col].astype(str).where(data[col].notna() , "NONE")
        encoded[col] = preprocessing.LabelEncoder().fit_transform(as_text)

    is_val = (data.kfold == fold).values
    y = data.target.values

    model = tree.DecisionTreeClassifier()
    model.fit(encoded[~is_val] , y[~is_val])

    # ROC AUC ranks rows by score, so feed it the positive-class probability
    # (column 1) rather than hard 0/1 labels; this also matches how run()
    # scores the logistic model, keeping the two numbers comparable.
    pred = model.predict_proba(encoded[is_val])[:,1]

    AUC = metrics.roc_auc_score(y[is_val] , pred)
    print(f"AUC : {AUC}")
    return AUC

In [None]:
# if __name__ == '__main__':
#     df = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')
    
#     data_visualize(df)
    
#     # data is skewed so we should go for stratified k-fold
#     data = create_fold(df)
    
#     for fold_ in range(6):
#         run_decision(data , fold_)

In [None]:
# X = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/sample_submission.csv')
# X.head()

In [None]:
# so after implementing 2 algorithms we can see that logistic regression scores a higher AUC than the decision tree
# Fit the final logistic-regression model on the full training set and score the test set.
data_test = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/test.csv')
data_train = pd.read_csv('/kaggle/input/cat-in-the-dat-ii/train.csv')

features = [i for i in data_train.columns if i not in ('id' , 'target')]

# Treat every feature as categorical text and label missing values 'NONE'.
# The mask is computed on the raw column (the right-hand side is evaluated
# before the assignment): astype(str) alone turns NaN into the string "nan",
# so a fillna() chained after it would never fill anything.
for col in features:
    data_train[col] = data_train[col].astype(str).where(data_train[col].notna() , 'NONE')
    data_test[col] = data_test[col].astype(str).where(data_test[col].notna() , 'NONE')

# Sparse output is the encoder's default; the `sparse` keyword is left out
# because newer scikit-learn releases renamed it to `sparse_output`.
one_hot = preprocessing.OneHotEncoder()

# fit on train + test so categories that only occur in the test set get a column
full_data = pd.concat( [data_train[features] , data_test[features]] , axis = 0)

one_hot.fit(full_data[features])

X_train = one_hot.transform(data_train[features])
X_test = one_hot.transform(data_test[features])

model = linear_model.LogisticRegression(solver = 'liblinear')
model.fit(X_train , data_train.target)
# column 1 holds the predicted probability of the positive class
answer = model.predict_proba(X_test)[:,1]

In [None]:
def true_negative(y_true , y_pred):
    """Count the positions where both the actual and the predicted label are 0."""
    return sum(
        1
        for actual , predicted in zip(y_true , y_pred)
        if actual == 0 and predicted == 0
    )

def false_positive(y_true , y_pred):
    """Count the positions where the actual label is 0 but the predicted label is 1."""
    hits = [
        1
        for actual , predicted in zip(y_true , y_pred)
        if actual == 0 and predicted == 1
    ]
    return len(hits)

def tpr(y_true , y_pred):
    """Return the true positive rate, computed as scikit-learn's recall score."""
    recall = metrics.recall_score(y_true , y_pred)
    return recall

def fpr(y_true , y_pred):
    """Return the false positive rate: FP / (FP + TN).

    Raises ZeroDivisionError when there are no false positives and no true
    negatives to divide by.
    """
    false_pos = false_positive(y_true , y_pred)
    negatives = false_pos + true_negative(y_true , y_pred)
    return false_pos / negatives

In [None]:

# Positive-class probabilities the fitted model assigns to the training rows.
y_train = model.predict_proba(X_train)[:,1]
threshold = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.99, 1.0]

# One (FPR, TPR) point per cut-off: rows scoring at or above the cut-off are labelled 1.
fpr_l , tpr_l = [] , []
for th in threshold:
    temp = [int(score >= th) for score in y_train]
    fpr_l.append(fpr(data_train.target , temp))
    tpr_l.append(tpr(data_train.target , temp))

# y_train = y_train >= 0.5
# print(metrics.roc_auc_score(data_train.target , y_train))

# Draw the resulting curve with the area under it shaded.
fig , ax = plt.subplots(figsize = (10,10))
ax.fill_between(fpr_l , tpr_l , alpha = 0.4)
ax.plot(fpr_l , tpr_l)
ax.set_xlim(0 , 1.0)
ax.set_ylim(0 , 1.0)
ax.set_xlabel('FPR' , fontsize=15)
ax.set_ylabel('TPR' , fontsize=15)
plt.show()

In [None]:
# y_train = model.predict_proba(X_train)[:,1]

# y_train = y_train >= 0.4

# print(metrics.accuracy_score(data_train.target , y_train))

In [None]:
# print(threshold)
# print(fpr_l)
# print(tpr_l)

In [None]:
# Hard 0/1 labels at a 0.2 cut-off, kept for inspection only.
ans = answer >= 0.2
# print(ans.astype(int))

# Submit the predicted probabilities rather than the thresholded labels:
# ROC AUC (the metric used throughout this notebook) depends on how rows are
# ranked, and collapsing the scores to 0/1 throws that ranking away.
# NOTE(review): assumes the competition is scored by ROC AUC on probabilities - confirm.
my_sub = pd.DataFrame({
    'id' : data_test.id,
    'target' : answer
})

In [None]:
# Write the submission frame to the working directory, without the index column.
my_sub.to_csv(path_or_buf = 'my_submission.csv' , index = False)