In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time
#Plotting Tools
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline

import time
import random
import datetime
import warnings
import gc
warnings.filterwarnings("ignore")

# Tools
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score ,accuracy_score ,mean_absolute_error,mean_squared_error
from sklearn.model_selection import StratifiedKFold , KFold ,train_test_split
from sklearn.ensemble import VotingClassifier

#Models
import optuna
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


# Import Data 

In [None]:
%%time
filename = '/kaggle/input/tabular-playground-series-oct-2021/train.csv'
# Count the data rows inside a `with` block so the file handle is closed again.
with open(filename) as train_file:
    n = sum(1 for line in train_file) - 1 #number of records in file (excludes header)
s = 500000 #desired sample size
# Line 0 is the header, so only lines 1..n are candidates for skipping.
# max(..., 0) keeps every row when the file holds fewer than `s` records
# (random.sample raises ValueError on a negative sample size).
n_skip = max(n - s, 0)
# A dedicated, seeded generator makes the sampled subset reproducible across runs
# without touching the global `random` state.
skip = sorted(random.Random(228).sample(range(1,n+1),n_skip))

In [None]:
%%time
# Read the training file while skipping the line numbers sampled into `skip`.
# `filename` is reused because `skip` was computed from that exact file.
train_df = pd.read_csv(filename ,skiprows=skip)
# Show the first rows as the cell output.
train_df.head()

In [None]:
%%time
# Load the test file; unlike the training file, no rows are skipped here.
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-oct-2021/test.csv')
# Show the first rows as the cell output.
test_df.head()

In [None]:
%%time
# Candidate integer dtypes, narrowest first.
_UNSIGNED_DTYPES = (np.uint8, np.uint16, np.uint32, np.uint64)
_SIGNED_DTYPES = (np.int8, np.int16, np.int32, np.int64)


def _is_integer_valued(series):
    """Return True when every value of `series` is a finite whole number."""
    if pd.api.types.is_bool_dtype(series.dtype) or pd.api.types.is_integer_dtype(series.dtype):
        return True
    values = series.to_numpy(dtype=np.float64)
    # Checked value by value: summing the fractional parts would let +0.5 and -0.5
    # cancel out and misclassify a float column as integer.
    return bool(np.isfinite(values).all() and (values == np.trunc(values)).all())


def _smallest_int_dtype(min_value, max_value):
    """Return the narrowest integer dtype that holds [min_value, max_value], or None."""
    candidates = _UNSIGNED_DTYPES if min_value >= 0 else _SIGNED_DTYPES
    for candidate in candidates:
        limits = np.iinfo(candidate)
        if limits.min <= min_value and max_value <= limits.max:
            return candidate
    return None


def memory_reduce(dataframe):
    """
    Use this function to reduce size of dataframe in memory
    before use = make sure you handled all null values in dataframe

    Boolean, integer and float columns are downcast; every other column is left
    untouched. Whole-number columns get the narrowest signed/unsigned integer
    dtype that holds their min and max, remaining float columns become float32.
    The conversion is written back into the dataframe that was passed in.

    If the dataframe contains null values a message is printed and the
    dataframe is returned unchanged.

    input = [dataframe]
    output= [dataframe] (the same object that was passed in)
    """
    start_time = datetime.datetime.now()

    if dataframe.isnull().to_numpy().any():
        print("DataFrame could not have null Values ,Please Fill Null values before use this function")
        # Nothing was converted, so there is no before/after report to print.
        return dataframe

    init_momry_size = dataframe.memory_usage(deep =True).sum() /1024 ** 2

    for column in dataframe.columns:
        series = dataframe[column]
        is_float = pd.api.types.is_float_dtype(series.dtype)
        is_whole_kind = (pd.api.types.is_integer_dtype(series.dtype)
                         or pd.api.types.is_bool_dtype(series.dtype))
        # Only numeric/boolean columns can be downcast; an empty column has no min/max.
        if not (is_float or is_whole_kind) or series.empty:
            continue

        target = None
        if _is_integer_valued(series):
            # int() gives exact Python integers, so the range comparison cannot be
            # distorted by float rounding.
            target = _smallest_int_dtype(int(series.min()), int(series.max()))
        if target is None and is_float:
            # Fractional values, or whole numbers too large for any integer dtype.
            target = np.float32
        if target is not None:
            dataframe[column] = series.astype(target)

    final_memory_size = dataframe.memory_usage(deep =True).sum() /1024 ** 2

    end_time = datetime.datetime.now()

    print(f'Memory Size of data before applying function to DateFrame: {init_momry_size} MB')
    print(f'memory size changed after applying function to DataFrame : {final_memory_size} MB')
    # This is the remaining size expressed as a percentage of the initial size.
    print(f'memory size has been optimized {round((final_memory_size / init_momry_size) * 100 ,2) } %')
    print("\n")

    print('**************************************************')
    print(f'Task started at: {start_time}')
    print(f'Task ended at:   {end_time}')
    print(f'Task total time: {end_time - start_time}')
    print('\n')

    return dataframe

In [None]:
%%time
# Downcast the columns of the training frame with memory_reduce.
train_df  = memory_reduce(train_df)

In [None]:
%%time
# Downcast the columns of the test frame with memory_reduce.
test_df =memory_reduce(test_df)

# Analyze Data

In [None]:
%%time
# Display the (rows, columns) shape of the training and test frames.
train_df.shape , test_df.shape

In [None]:
%%time
# Count the missing values in each column of the training frame.
train_df.isnull().sum()

In [None]:
%%time
# Print the pandas summary of the training frame.
train_df.info()

In [None]:
%%time
# Descriptive statistics, transposed so that each column of train_df becomes a row.
train_df.describe().T

# Data Preprocessing

In [None]:
%%time
# Separate the label column from the predictors; `id` is dropped from the predictors as well.
Y = train_df.target
X = train_df.drop(['id','target'],axis=1)
# Show the first rows of the predictor frame as the cell output.
X.head()

In [None]:
%%time
# Show the first values of the target series.
Y.head()

In [None]:
# Keep the test ids aside; they are needed again when the submission file is written.
id_test = test_df.id
id_test.head()

In [None]:
%%time
# Test predictors: every column of the test frame except `id`.
X_test = test_df.drop("id",axis=1)
X_test.head()

In [None]:
%%time
# Remove the names of the raw frames; X, Y, id_test and X_test were derived from them above.
del train_df,test_df

In [None]:
# Run a garbage-collection pass right after the raw frames were deleted.
gc.collect()

In [None]:
%%time
# Store the predictor column names as a plain list, then print that list.
features = X.columns.to_list()
print(features,end='')
print("\n")

In [None]:
%%time
def get_categorical_columns(dataframe,verbose=True):
    """
    Use this function to return the categorical columns of a dataframe.

    A column counts as categorical when it has an integer dtype, signed or
    unsigned, of any width.

    input : dataframe
            verbose - when True (default) print how many columns were found
                      and their names; when False print nothing
    output: list of column names, in dataframe column order
    """
    categorical_columns = [
        column
        for column, dtype in dataframe.dtypes.items()
        if pd.api.types.is_integer_dtype(dtype)
    ]
    if verbose:
        print(f' There are {len(categorical_columns)} categorical columns in Dataframe')
        print(categorical_columns,end='')
        print("\n")
    return categorical_columns

In [None]:
%%time
# Names of the predictor columns that get_categorical_columns classifies as categorical.
categorical_columns = get_categorical_columns(X)

In [None]:
%%time

def get_continuous_columns(dataframe,verbose=True):
    """
    Use this function to return the continuous columns of a dataframe.

    A column counts as continuous when it has a floating-point dtype.

    input : dataframe
            verbose - when True (default) print how many columns were found
                      and their names; when False print nothing
    output: list of column names, in dataframe column order
    """
    continuos_column = [
        column
        for column, dtype in dataframe.dtypes.items()
        if pd.api.types.is_float_dtype(dtype)
    ]
    if verbose:
        print(f'There are {len(continuos_column)} continous  columns  in Datframe')
        print(continuos_column,end="")
        print("\n")
    return continuos_column

In [None]:
%%time 
# Names of the predictor columns that get_continuous_columns classifies as continuous.
continous_columns = get_continuous_columns(X)

In [None]:
%%time
# Run the memory_reduce downcast on the predictor frame.
X = memory_reduce(X)

In [None]:
%%time
# Run the memory_reduce downcast on the test predictor frame.
X_test= memory_reduce(X_test)

# Exploratory Data Analysis

This section comes from this notebook:
https://www.kaggle.com/davidcoxon/first-look-at-october-data

In [None]:
%%time
# Histogram of the target values using 10 bins.
# NOTE(review): `palette` is passed without a `hue`; the bar colour presumably comes from `color` - confirm.
sns.histplot(data=Y,palette="viridis",bins=10 , color ="green")

In [None]:
%%time
# Share of rows per target value: the count of each value divided by the number of rows.
target_distrbution  = pd.DataFrame(Y.value_counts() / len(Y))
# Transposed so the target values are laid out as columns.
target_distrbution.T

In [None]:
%%time
# Bar chart of the number of rows per target value.
# The series is passed as `x=` explicitly: seaborn deprecated positional data
# vectors, and newer releases treat the first positional argument as `data`.
sns.countplot(x=Y ,palette="Set3")

In [None]:
%%time
# Bar chart comparing how many columns were classified as categorical versus continuous.
plt.bar([1,2], [len(categorical_columns) ,len(continous_columns)])
# Replace the numeric tick positions with readable labels.
plt.xticks([1,2] , ['categorical' , 'continous'])
plt.show()

Distribution of categorical columns

In [None]:
%time
start_time  =datetime.datetime.now()
sns.set_style("whitegrid")
nrows = (len(categorical_columns) // 6) + 1
size_y = nrows * 4
fig ,axes =  plt.subplots(nrows=nrows,ncols=6,figsize=( 30, size_y ))

counter = 0
for i in range (1, nrows+1):
    for j in range (1,7):
        if counter >= len(categorical_columns):
            break
        else:
            subchart = sns.histplot(data =X[categorical_columns] , x = str(X[categorical_columns].columns[counter]),
                                    ax= axes[(i-1),(j-1)] ,color = "green",label = "Train" ,bins=2)
            subchart = sns.histplot(data =X_test[categorical_columns]  , x = str(X_test[categorical_columns].columns[counter]),
                                    ax = axes[(i-1),(j-1)] ,color = "red",label = "test" ,bins=2)
            counter += 1
    print(f'row {i} out of {nrows} has been plotted') #print this line to monitor progress of plottin you can comment this line
end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Plotting Started at : {start_time}')
print(f'Plotting ended at : {end_time}')
print(f'total plotting time is : {end_time - start_time}')

Distribution of continuous columns

In [None]:
%time
start_time  =datetime.datetime.now()
sns.set_style("whitegrid")
nrows = (len(continous_columns) // 6) + 1
size_y = nrows * 4
fig ,axes =  plt.subplots(nrows=nrows,ncols=6,figsize=( 30, size_y ))

counter = 0
for i in range (1, nrows+1):
    for j in range (1,7):
        if counter >= len(continous_columns):
            break
        else:
            subchart = sns.histplot(data =X[continous_columns] , x = str(X[continous_columns].columns[counter]),ax= axes[(i-1),(j-1)] ,color = "blue",label = "Train",bins=2)
            subchart = sns.histplot(data =X_test[continous_columns]  , x = str(X_test[continous_columns].columns[counter]),ax = axes[(i-1),(j-1)] ,color = "red",label = "test",bins=2)
            counter += 1
    print(f'row {i} out of {nrows} has been plotted')
end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Plotting Started at : {start_time}')
print(f'Plotting  ended at :  {end_time}')
print(f'total plotting time is : {end_time - start_time}')

High Correlation Features

In [None]:
%time
matrix = np.triu(X[categorical_columns].corr())
plt.figure(figsize=(20 ,10))
sns.heatmap(X[categorical_columns].corr() , annot= False , cmap="coolwarm" , mask=matrix , linecolor="white" ,cbar=True ,linewidths=0.1)
plt.title('Categorical Corrlation Matrix')
plt.show()

In [None]:
%time
start_time = datetime.datetime.now()

corr_df = X[continous_columns].corr().abs()
high_corr = np.where(corr_df >0.02)
high_corr = [(corr_df.columns[x] , corr_df.columns[y]) for x,y  in zip (*high_corr) if x !=y and x >y]

end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')

In [None]:
%time
start_time = datetime.datetime.now()

high_corr_features=[]
for x in high_corr:
    for item in x:
        if item not in high_corr_features:
            high_corr_features.append(item)
matrix = np.triu(X[high_corr_features].corr())
plt.figure(figsize=(20 ,10))
sns.heatmap(X[high_corr_features].corr() , annot= False , cmap="coolwarm" , mask=matrix , linecolor="white" ,cbar=True ,linewidths=0.1)
plt.title('High corraltion  for continous columns matrix')
plt.show()

end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')

In [None]:
%time
start_time = datetime.datetime.now()

corr_df = X.corr().abs()
high_corr = np.where(corr_df >0.02)
high_corr = [(corr_df.columns[x] , corr_df.columns[y]) for x,y  in zip (*high_corr) if x !=y and x >y]

end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')

In [None]:
%time
start_time = datetime.datetime.now()

high_corr_features=[]
for x in high_corr:
    for item in x:
        if item not in high_corr_features:
            high_corr_features.append(item)
matrix = np.triu(X[high_corr_features].corr())
plt.figure(figsize=(20 ,10))
sns.heatmap(X[high_corr_features].corr() , annot= False , cmap="coolwarm" , mask=matrix , linecolor="white" ,cbar=True ,linewidths=0.1)
plt.title('High corraltion  for  all features matrix')
plt.show()

end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')

# Prediction Preparation

In [None]:
%%time
# Run a garbage-collection pass before the feature-engineering cells.
gc.collect()

In [None]:
%%time
start_time = datetime.datetime.now()

engineered_features = ['n_missing' , 'std']
# Compute the row statistics from the original columns only. Excluding the
# engineered names keeps this cell idempotent: on a re-run `features` already
# contains them and they must not feed into `std`.
base_features = [column for column in features if column not in engineered_features]

# Number of missing values in each row.
X["n_missing"] = X[base_features].isna().sum(axis=1)
X_test["n_missing"] = X_test[base_features].isna().sum(axis=1)

# Standard deviation of each row across the original columns.
X["std"] = X[base_features].std(axis=1)
X_test["std"] = X_test[base_features].std(axis=1)

# Extend the list in place as before, but never append a name twice.
features += [name for name in engineered_features if name not in features]

end_time = datetime.datetime.now()

print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')
print('\n')

In [None]:
%%time

start_time = datetime.datetime.now()

# Mean imputation: a missing value is replaced by the mean of its column.
simple_imputer = SimpleImputer(strategy="mean")

# The imputer is fitted on the training frame only and then applied to the test frame,
# so both frames are filled with the statistics learned from the training data.
X[features] = simple_imputer.fit_transform(X[features])
X_test[features] = simple_imputer.transform(X_test[features])


end_time = datetime.datetime.now()

# Report when the step started and ended and how long it took.
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')
print('\n')

In [None]:
# Run a garbage-collection pass after imputation.
gc.collect()

In [None]:
%%time
start_time = datetime.datetime.now()

# Standardise every column listed in `features`. The scaler is fitted on the
# training frame only and the same fitted scaler is then applied to the test frame.
standard_scaler = StandardScaler()
X[features] = standard_scaler.fit_transform(X[features])
X_test[features] = standard_scaler.transform(X_test[features])

end_time = datetime.datetime.now()
# Report when the step started and ended and how long it took.
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')
print('\n')

# Optuna 

In [None]:
%%time
def objective(trial , data=X, target= Y):
    """
    Optuna objective: sample one XGBoost configuration from `trial` and return
    its mean validation AUC over a 4-fold stratified split.

    input : trial  - optuna trial used to sample the hyper-parameters
            data   - predictor dataframe; defaults to the `X` that exists when
                     this cell is executed (default arguments are bound once)
            target - label series aligned with `data`; defaults to `Y`
    output: float, mean of the per-fold validation AUC scores

    NOTE(review): tree_method "gpu_hist" presumably needs a GPU runtime - confirm.
    """
    params ={"max_depth" :trial.suggest_int("max_depth" ,2,8) ,
          "learning_rate" : trial.suggest_float("learning_rate" , 0.005 , 0.2),
          "n_estimators" : trial.suggest_int("n_estimators" , 1000 ,5000),
          "min_child_weight" : trial.suggest_int("min_child_weight" , 1,500),
          "gamma" : trial.suggest_float("gamma" ,0.0001 , 1.0 , log = True),
          "alpha": trial.suggest_float("alpha" , 0.0001 , 10 ,log = True),
          "lambda": trial.suggest_float("lambda" ,0.0001, 10.0 , log = True),
          "colsample_bytree": trial.suggest_float("colsample_bytree" , 0.1 , 0.8),
          "subsample": trial.suggest_float("subsample" , 0.1,0.9),
          "tree_method" : "gpu_hist",
          "booster" : "gbtree",
           "random_state": 228 ,
           "use_label_encoder" : False,
           "eval_metric" : "auc"
          }
    model = XGBClassifier(**params)
    scores = []
    K = StratifiedKFold(n_splits=4,random_state=228 , shuffle=True)
    # Split the arguments that were passed in rather than the notebook globals,
    # so a caller-supplied `data`/`target` is actually used.
    for i ,(train_idx , val_idx) in enumerate(K.split(data,target)):
        X_train ,X_val = data.iloc[train_idx],data.iloc[val_idx]
        Y_train ,Y_val = target.iloc[train_idx],target.iloc[val_idx]
        # The validation fold doubles as the early-stopping evaluation set.
        model.fit(X_train ,Y_train ,eval_set = [(X_val,Y_val)] ,early_stopping_rounds =300 ,verbose = False)

        train_prediction = model.predict_proba(X_train)[:,1]
        train_score = roc_auc_score(Y_train ,train_prediction)

        validate_prediction = model.predict_proba(X_val)[:,1]
        validate_score = roc_auc_score(Y_val , validate_prediction)
        scores.append((train_score , validate_score))

        print(f"Fold {i+1} | AUC : {validate_score} ")

    scores = pd.DataFrame(scores ,columns=["train Score" , "Validation Score"])
    # Only the validation score drives the optimisation.
    return scores["Validation Score"].mean()

In [None]:
%%time
start_time = datetime.datetime.now()


# TPESampler is Optuna's default sampler; passing it explicitly with a seed makes
# the sequence of suggested hyper-parameters repeatable. 228 is the seed already
# used for the cross-validation splits and the model.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=228))
study.optimize(objective ,n_trials= 50)

print('\n')
print('*************************************************')
print("Numbers of finished trials : " , len(study.trials))
print("Best Trials : ", study.best_trial.params)
print("Best Values : " , study.best_value)

end_time = datetime.datetime.now()
print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')
print('\n')

# XGBoost

In [None]:
%%time
# Hyper-parameters of the best trial found by the Optuna study; displayed as the cell output.
xgb_params = study.best_trial.params
xgb_params

In [None]:
# Run a garbage-collection pass before the final training loop.
gc.collect()

In [None]:
%%time 
start_time = datetime.datetime.now()

folds = StratifiedKFold(n_splits=5,random_state=228,shuffle=True)
# Accumulates the test-set probability averaged over the folds.
predictions = np.zeros(len(X_test))
# Folds are numbered from 1, matching the "Fold i" lines printed by the Optuna objective.
for fold,(train_idx,validate_idx) in enumerate(folds.split(X,Y), start=1):
    X_train,X_validate = X.iloc[train_idx] ,X.iloc[validate_idx]
    Y_train,Y_validate = Y.iloc[train_idx] ,Y.iloc[validate_idx]
    # Tuned parameters from the study plus the fixed settings used during the search.
    xgb_model = XGBClassifier(**xgb_params ,tree_method= "gpu_hist",booster = "gbtree" ,random_state = 228,use_label_encoder = False ,eval_metric = "auc")
    # The held-out fold is the early-stopping evaluation set.
    xgb_model.fit(X_train,Y_train,eval_set = [(X_validate,Y_validate)],verbose =False,early_stopping_rounds =300)
    # Score the held-out fold so the quality of each fitted model is visible.
    validate_score = roc_auc_score(Y_validate, xgb_model.predict_proba(X_validate)[:,1])
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += xgb_model.predict_proba(X_test)[:,1] /folds.n_splits
    print(f'fold {fold} completed | validation AUC : {validate_score}')

end_time = datetime.datetime.now()

print('\n')
print('**************************************************')
print(f'Task started at: {start_time}')
print(f'Task ended at:   {end_time}')
print(f'Task total time: {end_time - start_time}')
print('\n')

In [None]:
# Run a garbage-collection pass after the training loop.
gc.collect()

In [None]:
%%time

# Pair every test id with its fold-averaged predicted probability.
submit = pd.DataFrame({"id":id_test , "target": predictions})
# Write the submission to the Kaggle working directory without the index column.
submit.to_csv("/kaggle/working/xgb_submit.csv",index=False)