In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from scipy.special import erfinv

import matplotlib.pyplot as plt
import plotly.express as px

import cuml
from cuml.neighbors import KNeighborsRegressor as KNR

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Number of cross-validation splits; the same value is used as the divisor
# when the per-fold test predictions are averaged.
folds = 10

In [None]:
# Read the raw competition files.
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")

# Positions 1..14 are used as model inputs in both files and position 15 of
# train is the regression target (position 0 is skipped - presumably a row id).
feature_positions = slice(1, 15)
X, Y = train.iloc[:, feature_positions], train.iloc[:, 15]
test = test.iloc[:, feature_positions]

Let's take a look at the data:

In [None]:
# One scatter panel (feature vs. target) per column of X, laid out on a 7x2 grid.
fig, p = plt.subplots(7, 2, figsize=(20,40))

for position, feature in enumerate(X.columns):
    # Fill the grid row by row: two panels per row.
    r, c = divmod(position, 2)
    panel = p[r, c]

    panel.scatter(X[feature], Y,
                  lw=2,
                  color="#440154FF",
                  alpha=0.4,
                  edgecolors='#FDE725FF')
    panel.set_xlabel(feature)

    # Only the left column carries the y-label, only the top row a title.
    if c == 0:
        panel.set_ylabel('target')
    if r == 0:
        panel.set_title('Scatter Plot')

plt.show()

In [None]:
# Overlay the train and test distribution of every feature on a 7x2 grid so
# that shifts between the two data sets are visible at a glance.
fig, p = plt.subplots(7, 2, figsize=(20,40))

n_bins = 50

for position, feature in enumerate(X.columns):
    # Fill the grid row by row: two panels per row.
    r, c = divmod(position, 2)
    panel = p[r, c]

    # NOTE: the keyword must be the lowercase `edgecolor`; Matplotlib no longer
    # accepts mixed-case property names such as `edgeColor`.
    panel.hist(X[feature],
               n_bins,
               density=True,
               histtype='bar',
               color="#440154FF",
               label='train',
               linestyle='dashed',
               edgecolor='white')

    panel.hist(test[feature],
               n_bins,
               density=True,
               histtype='bar',
               color="#FDE725FF",
               label='test',
               alpha=0.6)

    panel.legend(loc='upper right')
    panel.set_xlabel(feature)

    # Only the left column carries the y-label, only the top row a title.
    if c == 0:
        panel.set_ylabel('frequency')
    if r == 0:
        panel.set_title('Histogram')

plt.show()

**Normalization for PCA using Rank Gauss**

In [None]:
def rg(df, e, start, end):
    """Apply a rank-Gauss transform in place to a positional range of columns.

    For each selected column the values are replaced by their rank, the ranks
    are rescaled with ``(rank / max_rank - 0.5) * 2``, clipped to
    ``[-1 + e, 1 - e]`` and mapped through ``erfinv`` times ``sqrt(2)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; the selected columns are overwritten.
    e : float
        Clipping margin that keeps the rescaled ranks away from -1 and 1.
    start, end : int
        Positional slice bounds (``df.columns[start:end]``) of the columns to
        transform.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    lower_bound = -1 + e
    upper_bound = 1 - e
    for column in df.columns[start:end]:
        ranks = df[column].rank()
        scaled = 2 * (ranks / ranks.max() - 0.5)
        bounded = scaled.clip(lower=lower_bound, upper=upper_bound)
        df[column] = erfinv(bounded) * 2 ** 0.5
    return df

In [None]:
def norm(df, start, end):
    """Standardise a positional range of columns in place.

    Each column in ``df.columns[start:end]`` is replaced by
    ``(value - mean) / std`` where ``std`` is the population standard
    deviation (``ddof=0``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform; the selected columns are overwritten.
    start, end : int
        Positional slice bounds of the columns to standardise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in df.columns[start:end]:
        values = df[column]
        centre = values.mean()
        spread = values.std(ddof=0)
        df[column] = (values - centre) / spread
    return df

In [None]:
# Stack train and test so the rank-Gauss transform sees both distributions.
# The "part" flag (1 = train, 0 = test) is added on copies via `assign`, so X
# and test themselves are left untouched and this cell can be re-run safely.
# `ignore_index=True` gives the stacked frame a fresh 0..n-1 index.
Xx = pd.concat([X.assign(part=1), test.assign(part=0)], ignore_index=True)

# Transform the 14 feature columns (positions 0..13); "part" stays as is.
Xx = rg(df = Xx, e = 0.00001, start = 0, end = 14)

**This leads to a perfect normal distribution**

In [None]:
# Same train/test histogram grid as before, now on the rank-Gauss transformed
# features held in Xx (every column except the trailing "part" flag).
fig, p = plt.subplots(7, 2, figsize=(20,40))

n_bins = 50

is_train = Xx["part"] == 1
is_test = Xx["part"] == 0

for position, feature in enumerate(Xx.columns[:-1]):
    # Fill the grid row by row: two panels per row.
    r, c = divmod(position, 2)
    panel = p[r, c]

    # NOTE: the keyword must be the lowercase `edgecolor`; Matplotlib no longer
    # accepts mixed-case property names such as `edgeColor`.
    panel.hist(Xx.loc[is_train, feature],
               n_bins,
               density=True,
               histtype='bar',
               color="#440154FF",
               label='train',
               linestyle='dashed',
               edgecolor='white')

    panel.hist(Xx.loc[is_test, feature],
               n_bins,
               density=True,
               histtype='bar',
               color="#FDE725FF",
               label='test',
               alpha=0.6)

    panel.legend(loc='upper right')
    panel.set_xlabel(feature)

    # Only the left column carries the y-label, only the top row a title.
    if c == 0:
        panel.set_ylabel('frequency')
    if r == 0:
        panel.set_title('Histogram')

plt.show()

**PCA**

In [None]:
# Share of total variance the retained components must explain.
# (The spelling `treshold` is kept because the plotting cell reads this name.)
treshold = 0.975

# Fit a full PCA on the 14 transformed features and accumulate the explained
# variance ratios component by component.
pca = PCA().fit(Xx.iloc[:, 0:14])
cumPCA = pca.explained_variance_ratio_.cumsum()

# Zero-based index of the first component at which the cumulative ratio
# reaches the threshold.
n_comps = np.flatnonzero(cumPCA >= treshold).min()

In [None]:
# One label / bar position per principal component. The count is taken from
# cumPCA so labels and bars always line up instead of relying on a fixed 14.
cols = ["PC" + str(i) for i in range(1, len(cumPCA) + 1)]
xPos = list(range(len(cumPCA)))

fig, ax = plt.subplots(figsize=(16, 9))
bars = ax.bar(xPos, cumPCA, color="#440154FF")
# Highlight the first component at which the threshold is reached.
bars[n_comps].set_color('#FDE725FF')
ax.axhline(y=treshold, linewidth=2, color="#1F968BFF", alpha=0.8, ls='dashed')
ax.set_xticks(xPos)
ax.set_xticklabels(cols)
ax.set_ylabel('cumulative explained variance ratio')
plt.show()

In [None]:
# n_comps is a zero-based index, so the number of components to keep is one more.
n_keep = n_comps + 1

pca = PCA(n_components=n_keep)
PCs = pca.fit_transform(Xx.iloc[:, 0:14])

# Wrap the component scores in a frame with a 0..n-1 row index and PC1..PCk
# column labels.
row_labels = list(range(len(PCs)))
pcaXx = pd.DataFrame(PCs, index=row_labels, columns=cols[:n_keep])

In [None]:
# Split the component scores back into train and test rows using the "part"
# flag of Xx (1 = train, 0 = test). All columns of pcaXx are kept: the frame
# already holds exactly the components selected by the variance threshold, so
# no fixed column count is needed here.
X = pcaXx.loc[Xx["part"] == 1, :]
test = pcaXx.loc[Xx["part"] == 0, :]

Let's take a look at the three most important principal components.

In [None]:
# Build the plotting frame: component scores plus target, restricted to rows
# with target > 4.5.
viz = pd.concat([X,Y], axis = 1)
viz = viz.loc[viz["target"] > 4.5,:]

# Keep a random ~20% of the rows to keep the 3-D figure light. A seeded
# generator makes the subsample (and therefore the figure) reproducible.
rng = np.random.default_rng(123)
z = rng.uniform(low=0.0, high=1.0, size=len(viz))
viz = viz.loc[z > 0.8,:]

fig = px.scatter_3d(viz,
                    x='PC1',
                    y='PC2',
                    z='PC3',
                    color='target',
                    # Show only the target value in the hover tooltip.
                    hover_data={'PC1': False,
                                'PC2': False,
                                'PC3': False,
                                'target': True
                         },
                 opacity=1,
                 color_continuous_scale=px.colors.sequential.Viridis,
                 title="PCA")

fig.update_traces(marker=dict(size=4,
                              line=dict(width=1,
                                        color='grey')),
                  selector=dict(mode='markers'))
fig.show()

**KNN**

In [None]:
# Build the cross-validation splitter.
# * The class is reached through the module so that rebinding the name `KFold`
#   to the splitter instance (the name the CV loop calls `.split` on) does not
#   make this cell fail when it is executed a second time.
# * `random_state` is not passed: with shuffle=False it has no effect, and
#   current scikit-learn raises a ValueError for that combination.
from sklearn import model_selection

KFold = model_selection.KFold(n_splits=folds, shuffle=False)

In [None]:
%%time

# Per-fold validation MSE and the running fold-averaged test prediction.
MSE = []
Pred = None

for fit_rows, holdout_rows in KFold.split(X):
    X_train, y_train = X.iloc[fit_rows, :], Y.iloc[fit_rows]
    X_test, y_test = X.iloc[holdout_rows, :], Y.iloc[holdout_rows]

    model = KNR(n_neighbors=200)
    model.fit(X_train, y_train)

    # Score the held-out part of this fold.
    PredCV = model.predict(X_test)
    MSE.append(mean_squared_error(y_test, PredCV))

    # Every fold contributes 1/folds of its test prediction, so the sum over
    # all folds is their mean.
    fold_share = model.predict(test) / folds
    Pred = fold_share if Pred is None else Pred + fold_share
    

In [None]:
# Turn the per-fold MSE values into RMSE before reporting them.
fold_rmse = np.sqrt(MSE)
print("RMSE by fold is: ", fold_rmse)

In [None]:
# Fill the sample submission's "target" column with the fold-averaged
# predictions and write the result next to the notebook.
SamSub = pd.read_csv(
    "../input/tabular-playground-series-jan-2021/sample_submission.csv"
).assign(target=Pred)

SamSub.to_csv("submission.csv", index=False)