<div>
    <img src="https://i.imgur.com/QrL4fIn.png">
</div>

# Introduction 📗

> The Tabular Playground Series uses synthetic data generated with CTGAN from a real dataset, which deals with predicting the amount of an insurance claim.

In [None]:
from IPython.core.display import display, HTML, Javascript

def nb():
    """Load the custom notebook stylesheet and wrap it in a ``<style>`` tag.

    Returns:
        An IPython ``HTML`` object holding the CSS read from
        ``../input/intermediate-notebooks-data/custom_green.css``.
    """
    # The context manager closes the file handle even if read() raises;
    # the previous bare open(...).read() left it to the garbage collector.
    with open("../input/intermediate-notebooks-data/custom_green.css", "r") as css_file:
        styles = css_file.read()
    return HTML("<style>" + styles + "</style>")
# Last expression of the cell, so the notebook displays the returned HTML.
nb()

# 1. Importing libraries 📚

In [None]:
!pip install pycomp

import time
from random import randint

import cudf
import cuml
import matplotlib.pyplot as plt
import networkx as nx
# np is used by the correlation cells; import it explicitly instead of
# relying on whatever the star import below happens to export.
import numpy as np
import seaborn as sns
import shap
import xgboost

from pycomp.viz.insights import *
from cuml.linear_model import Ridge
from cuml.svm import SVR
from cuml.neighbors import KNeighborsClassifier
from cuml.metrics import accuracy_score,roc_auc_score as ras
from cuml.preprocessing.model_selection import train_test_split
from cuml.preprocessing.TargetEncoder import TargetEncoder

In [None]:
!nvidia-smi

In [None]:
!nvcc --version

In [None]:
def custom_palette(custom_colors):
    """Activate ``custom_colors`` as the seaborn palette and preview it.

    Args:
        custom_colors: Colors accepted by ``sns.color_palette``
            (lists of hex strings in this notebook).
    """
    # Build the palette once and reuse it for both calls.
    palette = sns.color_palette(custom_colors)
    # The result of set_palette is not used, so nothing is bound to it.
    sns.set_palette(palette)
    sns.palplot(palette, size=0.8)
    # Zero-size ticks and labels leave only the color swatches visible.
    plt.tick_params(axis='both', labelsize=0, length=0)

In [None]:
# Two five-color hex palettes reused by the plots in this notebook.
pink = ["#861388","#E15A97","#EEABC4","#C799A6","#4B2840"]
blue = ["#C6EBBE","#A9DBB8","#5887FF","#55C1FF","#E9D2F4"]

# Preview each palette in turn; custom_palette also activates the palette
# it is given, so the last one previewed (blue) is the one left active.
for palette in (pink, blue):
    custom_palette(palette)

In [None]:
# Read the training set into a cuDF DataFrame indexed by the `id` column.
TRAIN_CSV = '../input/tabular-playground-series-mar-2021/train.csv'
train_df = cudf.read_csv(TRAIN_CSV, index_col='id')
train_df.head()

In [None]:
# Read the competition test set into a cuDF DataFrame indexed by `id`.
TEST_CSV = '../input/tabular-playground-series-mar-2021/test.csv'
test_df = cudf.read_csv(TEST_CSV, index_col='id')
test_df.head()

# 2. EDA 📊

In [None]:
# pandas copies of the cuDF frames for the EDA cells below.
train_p = train_df.to_pandas()
# Fix: this used to convert train_df a second time, so test_p was just
# another copy of the training data.
test_p = test_df.to_pandas()

# Training column names bucketed by dtype: int64, float64, everything else.
dt_i = []
dt_fl = []
dt_o = []

for col in train_p.columns:
    col_dtype = train_p[col].dtype
    if col_dtype == 'int64':
        dt_i.append(col)
    elif col_dtype == 'float64':
        dt_fl.append(col)
    else:
        dt_o.append(col)

In [None]:
# Names of the int64 columns of the training frame.
print(dt_i)

In [None]:
# Names of the float64 columns of the training frame.
print(dt_fl)

In [None]:
# Names of the remaining (neither int64 nor float64) columns.
print(dt_o)

In [None]:
# Distribution of the `target` values, drawn as a donut chart.
# plot_donut_chart is presumably provided by the pycomp star import.
donut_colors = [blue[2], blue[1]]
plot_donut_chart(
    df=train_p,
    col='target',
    title='Target Value Distribution',
    colors=donut_colors,
)

In [None]:
def plot_cont(values,title,c):
    """Draw a kernel density estimate of ``values`` on an 18x8 figure.

    Args:
        values: Data handed straight to ``sns.kdeplot``.
        title: Text shown as the plot title.
        c: Index into the module-level ``pink`` palette selecting the
            line color.
    """
    plt.figure(figsize=(18, 8))
    line_color = pink[c]
    sns.kdeplot(values, color=line_color)
    plt.title(title, fontsize=15)
    plt.show()

for i in dt_fl:
    # Random palette index so consecutive plots get varied colors.
    c = randint(0, len(pink) - 1)
    # Fix: estimate the density from the feature's observed values. The old
    # code passed a column of value_counts().reset_index(), i.e. per-value
    # frequencies or the deduplicated values (depending on the pandas
    # version), so the KDE did not describe the feature's distribution.
    values = train_p[i].values
    plot_cont(values, i, c)

In [None]:
def plot_cat(index,values,title):
    """Bar-plot category labels against their values on an 18x8 figure.

    When more than 10 values are supplied, only the first 50 are drawn and
    the ``"spring"`` colormap is used; otherwise the module-level ``pink``
    palette colors the bars.

    Args:
        index: Category labels for the x axis.
        values: Bar heights, aligned with ``index``.
        title: Text shown as the plot title.
    """
    many_categories = len(values) > 10
    bar_palette = "spring" if many_categories else pink
    if many_categories:
        # Cap the number of bars (presumably to keep the x axis legible).
        index, values = index[:50], values[:50]

    plt.figure(figsize=(18, 8))
    sns.barplot(x=index, y=values, palette=bar_palette)
    plt.title(title, fontsize=15)
    plt.show()
    
for i in dt_o:
    # Count once and read labels/frequencies directly from the Series.
    # The column names produced by value_counts().reset_index() differ
    # between pandas versions, so looking up "index" is not portable.
    counts = train_p[i].value_counts()
    index = counts.index.values
    values = counts.values
    plot_cat(index, values, i)

Correlation

In [None]:
plt.figure(figsize=(30,35))
# Select the numeric/bool columns explicitly: recent pandas raises on object
# columns in DataFrame.corr() instead of silently dropping them, and
# select_dtypes works on old and new versions alike.
corr = train_p.select_dtypes(include=['number', 'bool']).corr()
# True marks cells to hide: the upper triangle including the diagonal, so
# each pair of features appears only once.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=mask, cmap='YlGn', vmax=.3, center=0,
            square=True, linewidths=.5, annot=True)
plt.show()

In [None]:
# Feature names, in the same order as the rows/columns of `corr`.
indices = corr.index.values
# from_numpy_matrix was removed in NetworkX 3.0 (and np.asmatrix is
# discouraged); from_numpy_array builds the same weighted graph from a plain
# 2-D ndarray, storing each non-zero entry as the edge's "weight".
cor_matrix = corr.to_numpy()
G = nx.from_numpy_array(cor_matrix)
# Nodes are created as integer positions; rename them to the feature names.
G = nx.relabel_nodes(G, lambda x: indices[x])
G.edges(data=True)

In [None]:
def corr_network(G, corr_direction, min_correlation):
    """Draw a circular-layout network of the edges that pass a threshold.

    Args:
        G: NetworkX graph whose edges carry a ``"weight"`` attribute
            (correlation values in this notebook). It is not modified;
            pruning happens on a copy.
        corr_direction: ``"positive"`` keeps edges with weight >= 0 and
            >= ``min_correlation``; any other value keeps edges with
            weight < 0 and <= ``min_correlation``.
        min_correlation: Threshold applied as described above.

    NOTE(review): self-loops in ``G`` (e.g. from the diagonal of a
    correlation matrix) are treated like any other edge — confirm whether
    they should be dropped before drawing.
    """
    H = G.copy()

    # Walk G while pruning the copy, so the edge view being iterated is
    # never mutated.
    for s1, s2, attrs in G.edges(data=True):
        w = attrs["weight"]
        if corr_direction == "positive":
            if w < 0 or w < min_correlation:
                H.remove_edge(s1, s2)
        else:
            if w >= 0 or w > min_correlation:
                H.remove_edge(s1, s2)

    # Fix: the previous zip(*...items()) unpacking raised ValueError when no
    # edge survived the threshold; reading the dict directly also handles
    # the empty case.
    edge_weights = nx.get_edge_attributes(H, 'weight')
    edges = tuple(edge_weights)
    # (1 + |w|)^2 drives both edge width and edge color below.
    weights = tuple((1 + abs(x)) ** 2 for x in edge_weights.values())

    d = dict(nx.degree(H))
    nodelist = list(d)
    # Node size is the cube of the node's degree in the pruned graph.
    node_sizes = tuple(x ** 3 for x in d.values())

    positions = nx.circular_layout(H)

    plt.figure(figsize=(15, 15))

    nx.draw_networkx_nodes(H, positions, node_color='#d100d1', nodelist=nodelist,
                           node_size=node_sizes, alpha=0.8)

    nx.draw_networkx_labels(H, positions, font_size=8)

    if corr_direction == "positive":
        edge_colour = plt.cm.cool
    else:
        edge_colour = plt.cm.Wistia

    # min()/max() need at least one weight, so edges are drawn only when
    # some remain; nodes and labels are still rendered otherwise.
    if edges:
        nx.draw_networkx_edges(H, positions, edgelist=edges, style='solid',
                               width=weights, edge_color=weights, edge_cmap=edge_colour,
                               edge_vmin=min(weights), edge_vmax=max(weights))
    plt.axis('off')
    plt.show()

In [None]:
# Network of the positive correlations of at least 0.5.
corr_network(G, corr_direction="positive",min_correlation = 0.5)

In [None]:
# Network of the negative correlations of -0.3 or lower.
corr_network(G, corr_direction="negative",min_correlation = -0.3)

# 3. Target Encoding 🎯

In [None]:
%%time
# Target-encoder settings, kept as named constants so they are easy to tune.
SMOOTH = 0.001
SPLIT = 'interleaved'
FOLDS = 5

# One cuML TargetEncoder instance, reused by the encoding cell.
# NOTE(review): n_folds/split_method presumably control the fold scheme used
# by fit_transform — confirm against the cuML documentation.
encoder = TargetEncoder(n_folds=FOLDS, smooth=SMOOTH, split_method=SPLIT)

In [None]:
%%time

# Separate the label from the features.
y = train_df["target"]
X = train_df.drop(["target"],axis=1)

# Hold out 30% of the rows for validation.
# NOTE(review): stratify=y is passed together with shuffle=False — confirm
# how cuML's train_test_split treats that combination.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False, stratify=y
)

# Only object-dtype columns are target-encoded.
categorical_cols = [name for name in train_df.columns
                    if train_df[name].dtype=='object']

for col in categorical_cols:
    # The shared encoder is re-fitted per column on the training split, and
    # that fit is applied right away to the hold-out split and to the
    # competition test set before moving on to the next column.
    X_train[col] = encoder.fit_transform(X_train[col], y_train)
    X_test[col] = encoder.transform(X_test[col])
    test_df[col] = encoder.transform(test_df[col])

# 4. Model Training 🛠️

In [None]:
def training(model, X_train, y_train, X_test, y_test, model_name):
    """Fit ``model``, score it on the hold-out split and print a report.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Hold-out features.
        y_test: Hold-out labels.
        model_name: Label used in the printed report.

    Prints the ROC AUC of ``model.predict(X_test)`` against ``y_test`` and
    the time spent in ``fit``; returns nothing.

    NOTE(review): the score is computed from ``predict`` output; for
    classifiers that presumably means hard labels rather than probabilities
    — confirm whether ``predict_proba`` should be used instead.
    """
    # perf_counter is a monotonic clock meant for measuring intervals.
    t1 = time.perf_counter()
    model.fit(X_train, y_train)
    # Fix: stop the clock right after fit() so the value reported as
    # "Training time" no longer includes prediction and scoring.
    training_time = time.perf_counter() - t1

    predicts = model.predict(X_test)
    roc = ras(y_test, predicts)

    print("\t\t\t--- Model:", model_name,"---")
    print("ROC: ", roc,"\t\t\t","Training time:",training_time,"\n")

In [None]:
# Three cuML estimators compared on the same train/hold-out split.
ridge = Ridge(fit_intercept = True, normalize = False,solver = "eig")
svr = SVR(kernel='rbf', gamma='scale', C=1, epsilon=0.3)
knc =  KNeighborsClassifier(n_neighbors=3)

# Parallel lists: estimator and the label printed in its report.
m = [ridge,svr,knc]
mn = ["Ridge","SVR","K Neighbors Classifier"]

for estimator, label in zip(m, mn):
    training(model=estimator, X_train=X_train, y_train=y_train,
             X_test=X_test, y_test=y_test, model_name=label)

In [None]:
# Wrap the splits in XGBoost DMatrix objects; dtest is built from the
# competition test set and carries no labels.
dtrain = xgboost.DMatrix(X_train,y_train)
dval   = xgboost.DMatrix(X_test, y_test)
dtest  = xgboost.DMatrix(test_df)

# Booster configuration handed to xgboost.train in the next cell:
# logistic objective, AUC as the evaluation metric, GPU histogram tree method.
params1 = { 'objective': 'binary:logistic',
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'eval_metric': 'auc',
    'random_state': 42,
    'max_depth': 15,
    'learning_rate': 0.03,
    'min_child_weight': 20,
    'gamma': 0.1,
    'alpha': 0.2,
    'lambda': 9,
    'colsample_bytree': 0.2,
    'subsample': 0.8}

# (DMatrix, name) pairs and the round count, both passed to xgboost.train.
evallist = [(dval, 'validation'), (dtrain, 'train')]
num_round=50

In [None]:
%%time

# Train the booster with the parameters, round count and evaluation list
# prepared in the previous cell.
xgb = xgboost.train(params1, dtrain,num_round,evallist)

# Score the hold-out split; `ras` is cuML's roc_auc_score.
predicts = xgb.predict(xgboost.DMatrix(X_test))
roc = ras(y_test, predicts)
roc

In [None]:
# Booster predictions for the competition test set.
preds1 = xgb.predict(xgboost.DMatrix(test_df))

# Fix: select "id" with single brackets so `ids` is a 1-D array. The old
# [["id"]].values produced an (n, 1) 2-D array, which is not a suitable
# shape for a single DataFrame column when the submission is assembled.
ids = cudf.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')["id"].values

# 5. Feature Importance 📎

In [None]:
def feature_imp(model, features=("cat16", "cat15", "cat1", "cat14")):
    """Plot SHAP explanations of ``model`` on the module-level ``X_test``.

    Draws a bar summary plot, a default summary plot, and one dependence
    plot per entry of ``features``.

    Args:
        model: Tree model accepted by ``shap.TreeExplainer``.
        features: Column names to draw dependence plots for. The default
            reproduces the four plots this function always produced.
    """
    explainer = shap.TreeExplainer(model)
    # Convert once and reuse: the old code called to_pandas() seven times,
    # building a new pandas frame for every plot.
    X_test_pd = X_test.to_pandas()
    shap_values = explainer.shap_values(X_test_pd)

    shap.summary_plot(shap_values, X_test_pd, plot_type="bar", color='#7bf1a8')
    shap.summary_plot(shap_values, X_test_pd)

    for feature in features:
        shap.dependence_plot(feature, shap_values, X_test_pd)

In [None]:
%%time
# SHAP plots for the trained XGBoost booster.
feature_imp(xgb)

# 6. Creating the submission file 📋

In [None]:
# Assemble the submission: the sample-submission ids paired with the
# booster's test-set predictions, indexed by id.
df_sub = {'id': ids, 'target': preds1}
df_predictions = cudf.DataFrame(df_sub).set_index(['id'])
df_predictions.head(10)

In [None]:
# Write the submission frame to the Kaggle working directory.
df_predictions.to_csv('/kaggle/working/Predictions.csv')

References 📜

* [NetworkX documentation](https://networkx.org/documentation/stable/tutorial.html)
* [Visualising stocks correlations with Networkx](https://towardsdatascience.com/visualising-stocks-correlations-with-networkx-88f2ee25362e)

Inspiration 💡
- [Custom Jupyter Notebook Theme with plain CSS](https://medium.com/@formigone/my-first-custom-theme-for-jupyter-notebook-a9c1e69efdfe) 🎨

Illustration tools ⚡
- [Canva](https://www.canva.com/en_gb/) 🖌️
- [Storyset](https://storyset.com/) 🖼️

<div>
    <img src="https://i.imgur.com/pl3FhXV.png">
 </div>