In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then echo the paths one per line.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Purpose: Importing Libraries
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import zscore, chi2_contingency
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, KFold, cross_validate, learning_curve

In [None]:
# Purpose: load the competition's train and test CSV files from the Kaggle input directory
training = "/kaggle/input//tabular-playground-series-feb-2021/train.csv"
testing = "/kaggle/input//tabular-playground-series-feb-2021/test.csv"

# Read both files in the same order as they are declared above.
training_df, testing_df = (pd.read_csv(csv_path) for csv_path in (training, testing))

In [None]:
# QC: show the first ten rows of the training frame as a quick sanity check of the load
training_df.head(n=10)

In [None]:
# Summary statistics of the training frame, as produced by DataFrame.describe()
training_df.describe()

In [None]:
# Correlation matrix (DataFrame.corr default method) drawn as an annotated heatmap.
#
# Only the numeric columns are correlated. The frame also contains object-dtype columns
# (they are selected with select_dtypes("object") further down in this notebook), and
# pandas >= 2.0 raises on them instead of silently dropping them. Selecting the numeric
# columns explicitly gives the same matrix on every pandas version.
fig, ax = plt.subplots(figsize = (16,9))
corrMatrix = training_df.select_dtypes(include=[np.number]).corr()
# Draw on the axes created above rather than relying on the implicit "current axes".
scatter = sn.heatmap(corrMatrix, annot = True, ax = ax)
scatter

Findings (Pearson correlation coefficients, r):
1. cont0 has a +ve corr with cont5(0.58) cont8(0.58) cont9(0.52)
2. cont5 has a +ve corr with cont8(0.61) cont9(0.62) cont11(0.51) cont12(0.63)
3. cont8 has a +ve corr with cont9(0.56) cont12(0.53)
4. cont9 has a +ve corr with cont11(0.52) cont12(0.54)
5. cont10 has a +ve corr with cont11(0.56)

6. cont2 has a -ve corr with cont0 cont3 cont5 cont6 cont7 cont8 cont9 cont10 cont11 cont12 cont13

In [None]:
# Purpose: histogram (50 bins) of the training frame's columns to inspect their distributions
fig, ax = plt.subplots(figsize=(15, 20))
training_df.hist(bins=50, ax=ax)

In [None]:
# Feature-only copy of the training frame: the "id" and "target" columns are dropped
training_wo_id_target = training_df.drop(["id", "target"], axis=1)

In [None]:
# Purpose: box plot of the feature-only frame to eyeball how outliers are distributed
fig, ax = plt.subplots(figsize=(15, 5))
training_wo_id_target.boxplot(ax=ax)

**Outlier Detection using Z Score!** 


A negative z-score means an observation is below the mean, while a positive one means it is above it. The further a z-score is from 0, the further the observation is from the mean, measured in standard deviations.

One way to identify outliers is to determine which points have a z-score that is far from 0; the cells below flag values with a z-score above 3 or below -3.

In [None]:
# Z-score every numeric feature: scipy.stats.zscore is applied column by column
numeric_cols = training_wo_id_target.select_dtypes(include="number").columns
scores = training_wo_id_target.loc[:, numeric_cols].apply(zscore)

In [None]:
# High-side outliers: for each column, count the rows whose z-score is above 3 (True)
# against those that are not (False)
score_max = pd.DataFrame(scores.gt(3))
for col in score_max.columns:
    print(score_max[col].value_counts())

In [None]:
# Low-side outliers: for each column, count the rows whose z-score is below -3 (True)
# against those that are not (False)
score_min = pd.DataFrame(scores.lt(-3))
for col in score_min.columns:
    print(score_min[col].value_counts())

In [None]:
# Frequency table of every level for cat0 .. cat9, with a separator line after each column
columns = [f'cat{i}' for i in range(10)]
for column in columns:
    print(column, training_df[column].value_counts(), "======", sep="\n")

In [None]:
# Split the training frame: x holds every column except 'target' and 'id', y holds 'target'
x = training_df.drop(columns=['target', 'id'])
y = training_df['target']

In [None]:
# Labels of the object-dtype columns; these are the ones sent through the ordinal encoder
training_df_cat = training_df.select_dtypes(include=["object"]).columns

In [None]:
# Display the object-dtype column labels selected in the previous cell
training_df_cat

In [None]:
# Ordinal-encode the object-dtype columns; every other column is passed through unchanged.
#
# Categories that were not present when the encoder was fitted (for example a rare level
# that only occurs in a held-out cross-validation fold or in the test file) are mapped to
# -1 instead of raising, so one unseen level cannot fail a whole fold.
# Requires scikit-learn >= 0.24 for handle_unknown='use_encoded_value'.
ct = ColumnTransformer(
    transformers=[
        ('oe', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), training_df_cat),
    ],
    remainder='passthrough',
)

In [None]:
# Model pipeline: ordinal encoding -> recursive feature elimination -> XGBoost regressor.
# Both XGBoost estimators share the same tree method, seed and parallelism settings.
xgb_shared = dict(tree_method='gpu_hist', random_state=11, n_jobs=-1)

# RFE ranks features with an untuned XGBoost model and asks to keep 20 of them.
feature_selector = RFE(estimator=xgb.XGBRegressor(**xgb_shared), n_features_to_select=20)

# Final regressor fitted on the columns that RFE retains.
final_regressor = xgb.XGBRegressor(max_depth=4, n_estimators=200, reg_lambda=100, **xgb_shared)

pipeline = Pipeline(steps=[
    ['ord_encoder', ct],
    ['rfe', feature_selector],
    ['regressor', final_regressor],
])

In [None]:
# Fit the whole pipeline (encoder, RFE, regressor) on the full feature frame and target
pipeline.fit(X=x, y=y)

In [None]:
# 5-fold cross-validation of the pipeline on the original features.
# Scores are negated RMSE; train-fold scores are returned alongside the test-fold scores.
cv = cross_validate(
    pipeline,
    x,
    y,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Display the cross-validation results computed in the previous cell
cv

In [None]:
# Keep only the float64 columns of x (the continuous features), then display them
x_cont = x.select_dtypes(include=["float64"])
x_cont

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Project the continuous features onto their first two principal components.
# NOTE(review): the "_breast" in these names does not relate to anything else in this
# notebook; presumably carried over from another example. Consider renaming consistently
# across the cells that use them.
pca_breast = PCA(n_components=2)
principalComponents_breast = pca_breast.fit_transform(X=x_cont)

In [None]:
# Wrap the two principal components in a labelled DataFrame.
# The index is copied from x_cont so each row keeps the label of the sample it was computed
# from. The components are later attached to the categorical columns with an index-based
# join, which would pair the wrong rows (or produce NaNs) if the frame were left with a
# fresh 0..n-1 index while the training frame used any other index.
principal_breast_Df = pd.DataFrame(
    data=principalComponents_breast,
    index=x_cont.index,
    columns=['pc1', 'pc2'],
)

In [None]:
# Reduced feature set: the categorical columns plus the two principal components,
# matched on the row index (left join, keeping every training row)
new = training_df.loc[:, training_df_cat].join(principal_breast_Df, how="left")

In [None]:
# Display the reduced feature frame built in the previous cell
new

In [None]:
# 5-fold cross-validation of the same pipeline on the reduced feature set (`new`).
# Scores are negated RMSE; train-fold scores are returned alongside the test-fold scores.
# Note that this rebinds `cv`, replacing the results of the earlier run.
cv = cross_validate(
    pipeline,
    new,
    y,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)

In [None]:
# Display the cross-validation results for the reduced feature set
cv