# In [None]:
import pandas            as pd
import numpy             as np
import seaborn           as sns
import matplotlib.pyplot as plt
import seaborn           as sns
import heapq

from sklearn.model_selection   import train_test_split
from sklearn.model_selection   import cross_val_score
from sklearn.model_selection   import GridSearchCV
from sklearn.decomposition     import IncrementalPCA
from sklearn.preprocessing     import LabelEncoder
from sklearn.preprocessing     import RobustScaler,MinMaxScaler,KBinsDiscretizer
from sklearn.pipeline          import Pipeline
from sklearn.linear_model      import LogisticRegression
from sklearn.pipeline          import Pipeline
from xgboost                   import XGBRegressor
from sklearn.metrics           import roc_auc_score, make_scorer

import  os 



def downcast_dtypes(df):
    """Shrink the numeric columns of *df* in place to save memory.

    ``float64`` columns become ``float32``. ``int64``/``int32`` columns are
    cast to the narrowest of ``int16``/``int32`` that can represent every
    value in the column; a column whose values fit neither is left unchanged.
    (Casting everything to ``int16`` unconditionally would silently wrap
    values outside [-32768, 32767].)

    Args:
        df: DataFrame to modify. Columns are replaced on this same object.

    Returns:
        The same ``df`` object, for call chaining.
    """
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols = [c for c in df if df[c].dtype in ["int64", "int32"]]

    for c in float_cols:
        df[c] = df[c].astype(np.float32)

    for c in int_cols:
        values = df[c]
        if values.empty:
            # No values to range-check; the narrowest type is trivially safe.
            df[c] = values.astype(np.int16)
            continue
        lowest, highest = values.min(), values.max()
        # Try the candidate types from narrowest to widest.
        for target in (np.int16, np.int32):
            limits = np.iinfo(target)
            if limits.min <= lowest and highest <= limits.max:
                df[c] = values.astype(target)
                break
    return df

def grid_search_cv(X,y,test,param_grid):
    """Grid-search an XGBRegressor with 5-fold CV and predict on *test*.

    Candidates are ranked by ``roc_auc_score`` computed on the regressor's
    ``predict`` output.

    Args:
        X: Training feature matrix.
        y: Training target.
        test: Feature matrix to predict on, laid out like ``X``.
        param_grid: Hyper-parameter grid passed to ``GridSearchCV``.

    Returns:
        Tuple ``(score, y_test_pred)``: the best mean cross-validated score
        and the best estimator's predictions for ``test``.
    """
    # The default scorer mode already scores `predict` output; the
    # `needs_proba` keyword is deprecated in recent scikit-learn releases.
    auc = make_scorer(roc_auc_score)
    print("Parameter grid:\n{}".format(param_grid))
    grid_search = GridSearchCV(
        XGBRegressor(tree_method='gpu_hist'),
        param_grid,
        cv=5,
        verbose=20,
        n_jobs=4,
        scoring=auc,
    )
    # With the default refit=True, GridSearchCV refits the best parameter
    # combination on all of (X, y) and exposes it as `best_estimator_`,
    # so no second explicit fit is needed.
    grid_search.fit(X, y)

    y_test_pred = grid_search.best_estimator_.predict(test)

    score = grid_search.best_score_
    return score, y_test_pred


# Load the competition train/test tables from the Kaggle input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-sep-2021/train.csv",
        "/kaggle/input/tabular-playground-series-sep-2021/test.csv",
    )
)
# Preview of the first rows; only rendered when it is the last expression
# of an interactive cell.
train.head()

# Shrink numeric dtypes to reduce the memory footprint of both frames.
train, test = downcast_dtypes(train), downcast_dtypes(test)


# Report how much of the training set is missing, per column and overall.
missing_data_count = train.isnull().sum()
print("Missing Data in Training Set:")
print(missing_data_count)
# np.prod replaces the deprecated np.product alias (removed in NumPy 2.0).
total_data = np.prod(train.shape)
total_missing = missing_data_count.sum()
percent_missing = (total_missing/total_data)*100
print("\nPercentage of Missing Data:")
print(percent_missing)


# Visualise where the gaps are (one heatmap cell per DataFrame cell).
sns.heatmap(train.isnull())
plt.title("Missing Training Data\n")
plt.show()

# Impute only the columns that actually contain missing values:
# object columns get their most frequent value, all others their mean.
for col in train.columns[missing_data_count.to_numpy().nonzero()[0]]:
    if train[col].dtype == 'object':
        train[col] = train[col].fillna(train[col].mode()[0])
    else:
        train[col] = train[col].fillna(train[col].mean())



# Report and visualise the missing values of the test set.
missing_data_count = test.isnull().sum()
print("Missing Data in Test Set:")
print(missing_data_count)
sns.heatmap(test.isnull())
plt.title("Missing Test Data\n")
plt.show()

# Fill every column that has gaps: most frequent value for object columns,
# column mean for everything else (both computed on the test set itself).
for col in missing_data_count[missing_data_count != 0].index:
    column = test[col]
    fill_value = column.mode()[0] if column.dtype == 'object' else column.mean()
    test[col] = column.fillna(fill_value)


# Re-run the missing-value report on both frames to confirm that the
# imputation above left no gaps behind.
for frame, report_header, plot_title in (
    (
        train,
        "Missing Data in Training Set After the Data Engineering:",
        "Missing Training Data After the Data Engineering:\n",
    ),
    (
        test,
        "Missing Test Data After the Data Engineering:",
        "Missing Test Data After the Data Engineering:\n",
    ),
):
    missing_data_count = frame.isnull().sum()
    print(report_header)
    print(missing_data_count)
    sns.heatmap(frame.isnull())
    plt.title(plot_title)
    plt.show()


# Partition the training columns by dtype: 'object' columns on one side,
# everything else on the other.
object_cols = [name for name in train.columns if train[name].dtype == 'object']
number_cols = [name for name in train.columns if train[name].dtype != 'object']

print("Object Columns")
print(object_cols)
print("Numerical Columns")
print(number_cols)

# Dimensions of both frames, as a quick sanity check.
for frame_shape in (train.shape, test.shape):
    print(frame_shape)
    
# Separate the target from the features. X aliases `train`, so the columns
# are dropped from `train` itself (no second copy of the data is made).
y = train['claim']
X = train
# 'id' is a row identifier (it is only used to label the submission rows),
# so it is removed from the features together with the target. It has to be
# removed from `test` as well so both matrices keep the same columns.
X.drop(columns=['claim', 'id'], errors='ignore', inplace=True)
test.drop(columns=['id'], errors='ignore', inplace=True)

# Integer-encode the object columns that are still features. The encoder is
# fitted on the training values only.
# NOTE(review): a category that appears only in `test` makes `transform`
# raise — confirm this cannot happen for this data.
label_encoder = LabelEncoder()
for col in object_cols:
    if col not in X.columns:
        # Column was dropped above (target / identifier); nothing to encode.
        continue
    label_encoder.fit(X[col])
    X[col] = label_encoder.transform(X[col])
    test[col] = label_encoder.transform(test[col])


# Share of the total variance the retained principal components must explain.
explained_variance_target = 0.95

# Scale first so the variance curve below describes the same feature space
# that the final scaler -> PCA pipeline works in.
X_scaled = MinMaxScaler().fit_transform(X)

# Exploratory pass: fit a PCA with all components, feeding the data in
# batches, to obtain the cumulative explained-variance curve.
pca = IncrementalPCA()
n_batches = 1000
for X_batch in np.array_split(X_scaled, n_batches):
    pca.partial_fit(X_batch)
del X_scaled  # only needed for the exploratory fit
print("test4")  # progress marker
cumsum = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1,len(cumsum)+1),cumsum)
plt.xlabel("Dimensions") 
plt.ylabel("Explained Variance") 
plt.show()

# IncrementalPCA only accepts an integer (or None) for n_components; unlike
# PCA it has no "fraction of variance" mode, so a float such as 0.95 is
# rejected. Derive the component count from the curve instead: the smallest
# number of components whose cumulative ratio reaches the target.
n_components = min(
    int(np.searchsorted(cumsum, explained_variance_target)) + 1,
    len(cumsum),
)

pipe = Pipeline([
    ("scaler", MinMaxScaler()),
    ("pca", IncrementalPCA(n_components=n_components, batch_size=1000)),
])
# Fit on the training features only, then apply the same fitted transform
# to both the training and the test features.
pipe.fit(X)
X = pipe.transform(X)
test = pipe.transform(test)




# Hyper-parameter combinations explored by the grid search.
param_grid = {
    'n_estimators': [500, 1000, 2000],
    'learning_rate': [0.01, 0.05, 0.1],
}
print("Parameter grid:\n{}".format(param_grid))

# Run the search and get the best CV score plus the test-set predictions.
score_y, y_pred = grid_search_cv(X, y, test, param_grid)
print("Score is:", score_y)

# `test` now holds the transformed features, so the row ids are re-read
# from the raw CSV to label the submission rows.
test_set = pd.read_csv("/kaggle/input/tabular-playground-series-sep-2021/test.csv")
output = test_set[['id']].assign(claim=y_pred)
output.to_csv('my_submission_code9.csv', index=False)
