In [None]:
# Copied and edited from:
#https://www.kaggle.com/davidcoxon/first-look-at-october-data
#https://www.kaggle.com/mohammadkashifunique/tsp-single-xgboost-model

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc, RocCurveDisplay


## Get Data

In [None]:
%%time
# Read the competition train and test CSVs with identical settings;
# the 'id' column becomes the index of each frame.
train_1, test_1 = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../input/tabular-playground-series-oct-2021/train.csv',
        '../input/tabular-playground-series-oct-2021/test.csv',
    )
)
print('import data')

## Some Overview

In [None]:
# Report the (rows, columns) shape of both frames.
train_shape, test_shape = train_1.shape, test_1.shape
print(f'train data shape:{train_shape},test data shape:{test_shape}')

In [None]:
# List the distinct column dtypes present in each frame.
train_dtypes = train_1.dtypes.unique()
test_dtypes = test_1.dtypes.unique()
print(f'train data type:{train_dtypes},test data type{test_dtypes}')

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their values.

    Integer columns are tried against int8 -> int16 -> int32 and floating
    columns against float16 -> float32. A column is only converted when the
    candidate dtype is strictly smaller than the current one and the column's
    min and max both lie within the candidate's representable range (bounds
    inclusive). Columns of any other dtype (text/object, bool, datetime,
    pandas extension dtypes, ...) are left untouched, which also makes the
    function safe to call repeatedly on the same frame.

    Note that float16 has limited precision; values are rounded when a float
    column is narrowed to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When True, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after down-casting.
    """
    # from bytes to MB
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # Candidate dtypes, narrowest first, with the numpy helper that reports their limits.
    int_candidates = (np.int8, np.int16, np.int32)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_dtype = df[col].dtype

        # pandas extension dtypes (category, nullable ints, string, ...) are not
        # numpy dtypes and have no iinfo/finfo limits; skip them.
        if not isinstance(col_dtype, np.dtype):
            continue

        # dtype.kind: 'i'/'u' = signed/unsigned integer, 'f' = floating point.
        if col_dtype.kind in 'iu':
            candidates, limits_of = int_candidates, np.iinfo
        elif col_dtype.kind == 'f':
            candidates, limits_of = float_candidates, np.finfo
        else:
            continue

        # check the column's lower bound and upper bound
        col_min = df[col].min()
        col_max = df[col].max()

        for candidate in candidates:
            # Candidates are ordered by size: once one is not smaller than the
            # current dtype, no later one can save space either.
            if np.dtype(candidate).itemsize >= col_dtype.itemsize:
                break
            limits = limits_of(candidate)
            # Comparisons with NaN/inf are False, so such columns stay as they are.
            if col_min >= limits.min and col_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Express the saving as a real percentage; guard against a zero-size frame.
        reduction_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f'Memory usage decreased to {end_mem:.2f} MB ({reduction_pct:.2f}% reduction)')
    return df

In [None]:
%%time

# Shrink both frames (train first, then test) and rebind the names to the results.
train_1, test_1 = (
    reduce_mem_usage(frame, verbose=True) for frame in (train_1, test_1)
)

# Show which dtypes remain after down-casting.
train_dtypes_after = train_1.dtypes.unique()
test_dtypes_after = test_1.dtypes.unique()
print(f'train dtypes are:{train_dtypes_after},test dtypes are:{test_dtypes_after}')

In [None]:
# Preview the first five training rows.
train_1.head(n=5)

In [None]:
# Preview the first five test rows.
test_1.head(n=5)

In [None]:
# Count missing cells in each frame and report them as a percentage of ALL
# cells (rows x columns). Dividing by the row count alone, without the factor
# of 100, does not give a percentage.
train_missing = train_1.isnull().values.sum()
test_missing = test_1.isnull().values.sum()

# DataFrame.size is rows * columns; guard against an empty frame.
train_missing_pct = 100 * train_missing / train_1.size if train_1.size else 0.0
test_missing_pct = 100 * test_missing / test_1.size if test_1.size else 0.0

print(f'train missing data: {train_missing}({train_missing_pct:.2f}%)')
print(f'test missing data: {test_missing}({test_missing_pct:.2f}%)')

In [None]:


# Split the training columns by dtype: int8 columns are treated as
# categorical, every other column as continuous. Column order is preserved.
int8_flags = {name: train_1[name].dtypes == 'int8' for name in train_1.columns}
categorical_features = [name for name, is_int8 in int8_flags.items() if is_int8]
continuous_features = [name for name, is_int8 in int8_flags.items() if not is_int8]

# Bar chart comparing how many columns fall in each group.
feature_counts = [len(categorical_features), len(continuous_features)]
sns.barplot(x=[0, 1], y=feature_counts)
plt.xticks([0, 1], ('categorical', 'continuous'))
plt.show()

## Distribution

In [None]:
# Drop the target column from the categorical feature list.
# Remove it by name instead of popping the last element: this does not rely
# on 'target' being the last int8 column, and re-running the cell can no
# longer silently remove a real feature.
if 'target' in categorical_features:
    categorical_features.remove('target')

In [None]:

# Select the categorical columns from both datasets.
train_outlier = train_1[categorical_features]
test_outlier = test_1[categorical_features]

fig = plt.figure(figsize=(25,50))
sns.set_style("darkgrid")

# Grid of four panels per row.
n_rows = len(train_outlier.columns)//4 + 1
for panel, col_name in enumerate(train_outlier.columns, start=1):
    ax = fig.add_subplot(n_rows, 4, panel)
    # Train (red) and test (blue) histograms share the same axes.
    sns.histplot(data=train_outlier.iloc[:, panel - 1], color='r', bins=2, ax=ax)
    sns.histplot(data=test_outlier.iloc[:, panel - 1], color='b', bins=2, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(col_name)
plt.show()
    

In [None]:

# Pairwise correlation between the categorical training features.
sns.set_style('white')
corr = train_1[categorical_features].corr()

# Upper triangle (diagonal included) is masked: True positions are not drawn.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(15, 15))
plt.title('Correlation matrix for categorigal features of Training data')
sns.heatmap(
    corr,
    mask=mask,
    cmap='coolwarm',
    annot=False,
    linewidths=0.5,
    square=True,
    cbar_kws={"shrink": 0.40},
)
plt.show()

In [None]:
# Select the continuous columns from both datasets.
train_outlier = train_1[continuous_features]
test_outlier = test_1[continuous_features]

fig = plt.figure(figsize=(40,140))
sns.set_style("darkgrid")

# Grid of four panels per row.
n_rows = len(train_outlier.columns)//4 + 1
for panel, col_name in enumerate(train_outlier.columns, start=1):
    ax = fig.add_subplot(n_rows, 4, panel)
    # Train (red) and test (blue) histograms share the same axes.
    sns.histplot(data=train_outlier.iloc[:, panel - 1], color='r', bins=20, ax=ax)
    sns.histplot(data=test_outlier.iloc[:, panel - 1], color='b', bins=20, ax=ax)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(col_name)
plt.show()
    

In [None]:
# Pairwise correlation between the continuous training features.
corrs = train_1[continuous_features].corr()

sns.set_style('white')
# Upper triangle (diagonal included) is masked: True positions are not drawn.
mask = np.triu(np.ones_like(corrs, dtype = bool))
plt.figure(figsize = (15, 15))
# The title names the continuous features, which is what this cell plots.
plt.title('Correlation matrix for continuous features of Training data')
sns.heatmap(corrs,cmap='coolwarm', mask = mask,annot=False, linewidths =0.5,square=True,cbar_kws={"shrink": 0.40})
plt.show()

#### There are too many features to read in a single heatmap.  
#### So plot only the features that have a relatively high correlation with at least one other feature.

In [None]:
# Keep only continuous features whose absolute correlation with at least one
# OTHER continuous feature exceeds the threshold, then plot their correlations.
CORR_THRESHOLD = 0.03

corrs = train_1[continuous_features].corr().abs()

# Look only strictly above the diagonal (k=1): this skips each feature's
# self-correlation and visits every unordered pair exactly once.
upper_pairs = np.triu(np.ones(corrs.shape, dtype=bool), k=1)
high_corr = np.where((corrs.to_numpy() > CORR_THRESHOLD) & upper_pairs)

# union1d returns the sorted unique column positions, so the selected columns
# keep their original order and the plot is identical on every run
# (a plain set() of names has no stable order across kernel restarts).
high_corr_col = list(corrs.columns[np.union1d(high_corr[0], high_corr[1])])

if not high_corr_col:
    # Nothing to draw; an empty correlation matrix would make the heatmap fail.
    print(f'No pair of continuous features has |corr| > {CORR_THRESHOLD}')
else:
    sig_corrs = train_1[high_corr_col].corr()

    sns.set_style('white')
    # Upper triangle (diagonal included) is masked: True positions are not drawn.
    mask = np.triu(np.ones_like(sig_corrs, dtype = bool))
    plt.figure(figsize = (15, 15))
    plt.title('Correlation matrix for highly correlated continuous features of Training data')
    sns.heatmap(sig_corrs,cmap='coolwarm', mask = mask,annot=False, linewidths =0.5,square=True,cbar_kws={"shrink": 0.40})
    plt.show()

In [None]:
# Separate the label from the predictors; the test features get their own copy.
y = train_1['target']
X = train_1.drop('target', axis=1)
X_test = test_1.copy()

# Unbind the original frame names now that X, y and X_test exist.
del train_1, test_1

In [None]:
# Confirm the shapes of the modelling frames.
train_size, test_size = X.shape, X_test.shape
print(f'train size :{train_size}, test size {test_size}')

## Add some Features

In [None]:
# Names of the engineered per-row statistics.
STAT_COLUMNS = ['mean', 'std', 'max']


def add_row_stats(frame):
    """Add per-row 'mean', 'std' and 'max' columns to ``frame`` in place.

    All three statistics are computed from the original feature columns only.
    Assigning 'mean' first and then calling ``frame.std(axis=1)`` would fold
    the new 'mean' column into the std (and 'mean' and 'std' into the max).
    Columns already named like a statistic are excluded, so re-running the
    cell gives the same result instead of compounding the features.
    """
    base = frame[[col for col in frame.columns if col not in STAT_COLUMNS]]
    # Evaluate everything before the first assignment touches the frame.
    row_mean = base.mean(axis=1)
    row_std = base.std(axis=1)
    row_max = base.max(axis=1)
    frame['mean'] = row_mean
    frame['std'] = row_std
    frame['max'] = row_max


add_row_stats(X)
add_row_stats(X_test)

In [None]:
%%time
# Stratified 5-fold cross-validation of a logistic regression model.
# Each fold reports its validation ROC AUC, draws its ROC curve and
# contributes one set of test-set probabilities for later averaging.
k_fold = StratifiedKFold(n_splits=5, shuffle=True,random_state=42)

scores = []            # validation AUC of every fold
test_predictions = []  # positive-class probabilities on X_test, one array per fold


for fold ,(train_index,valid_index) in enumerate(k_fold.split(X,y)):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    model = LogisticRegression(solver='liblinear',max_iter=600)

    model.fit(X_train,y_train)
    # Column 1 of predict_proba is the probability of the positive class (binary target).
    valid_pred = model.predict_proba(X_valid)[:,1]
    fpr, tpr,_ = roc_curve(y_valid, valid_pred)
    score = auc(fpr,tpr)
    scores.append(score)
    # Not named `display`: that would shadow IPython's display() helper
    # in every cell executed after this one.
    roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=score)
    roc_display.plot()


    print(f'\n Fold:{fold + 1},Score:{score}')
    print('-'*50)

    test_preds = model.predict_proba(X_test)[:,1]
    test_predictions.append(test_preds)
print(f'Overall Valid Score:{np.mean(scores)}')

## Save and Check

In [None]:
# Average the per-fold test probabilities: one column per fold, mean across columns.
fold_matrix = np.column_stack(test_predictions)
predic = fold_matrix.mean(axis=1)

# Fill the sample submission with the averaged predictions and save it.
ss = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv')
ss = ss.assign(target=predic)
ss.to_csv('submission.csv', index=False)
ss.head()