In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#import packages needed

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import lightgbm as lgb

import math

In [None]:
#read the competition's training, test and sample-submission CSV files (in that order)

DATA_DIR = "../input/tabular-playground-series-feb-2021"
train, test, submission = (
    pd.read_csv(f"{DATA_DIR}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

In [None]:
#take a look at the first 5 rows of the training data (DataFrame.head defaults to 5 rows)
train.head()

In [None]:
#summary statistics of the training data (by default describe() covers the numeric columns
#of a mixed-dtype frame: count, mean, std, min, quartiles and max)
train.describe()

In [None]:
#view the first 5 rows of the test data
test.head()

In [None]:
#check the shapes, as (rows, columns) tuples, of the training and test data
train.shape, test.shape

In [None]:
#split the training column names into categorical and continuous features:
#a name containing 'cat' is treated as categorical, one containing 'con' as continuous

cat_cols = list(filter(lambda name: 'cat' in name, train.columns))
cont_cols = list(filter(lambda name: 'con' in name, train.columns))

#report how many columns fell into each group

print(f"Number of categorical columns: {len(cat_cols)}")
print(f"Number of continous columns: {len(cont_cols)}")

In [None]:
#graph distributions of continuous variables
#each panel is the kernel density estimate of one continuous feature; the legend
#shows that feature's skewness (Series.skew) formatted to two decimals

num_cols = 4
#enough rows to give every continuous feature its own panel (ceiling division, at least one row)
num_rows = max(1, -(-len(cont_cols) // num_cols))
#squeeze=False keeps `axes` two-dimensional even when the grid has a single row
f, axes = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(12, 3 * num_rows), squeeze=False)
f.suptitle('Distribution of Features', fontsize=16)

panels = axes.ravel()
for panel, column in zip(panels, cont_cols):
    #fill=True is the current spelling of the deprecated shade=True
    sns.kdeplot(train[column], color="m", fill=True, label="%.2f"%(train[column].skew()), ax=panel)
    panel.legend(loc="best")

#drop the trailing panels that no feature was drawn on, however many there are
for panel in panels[len(cont_cols):]:
    f.delaxes(panel)

plt.tight_layout()
plt.show()

In [None]:
#create a correlation matrix:
#absolute pairwise correlations between the continuous features

corr = train[cont_cols].corr().abs()
#True cells are hidden by the heatmap: np.triu keeps the upper triangle including the diagonal,
#so only the strictly-lower triangle is drawn.
#The builtin `bool` is used because the `np.bool` alias was deprecated and later removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype=bool))

fig, ax = plt.subplots(figsize=(14, 14))

# plot heatmap on the axes created above
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap='coolwarm',
            cbar_kws={"shrink": .8}, vmin=0, vmax=1, ax=ax)
# yticks
plt.yticks(rotation=0)
plt.show()

In [None]:
# Build the model inputs: features without the row identifier, plus the training label

y_train = train['target']
X_train = train.drop(columns=['id', 'target'])
X_test = test.drop(columns=['id'])

In [None]:
# Label-encode every categorical feature into integer codes.
for feature in cat_cols:
    le = LabelEncoder()
    # Fit on the values of BOTH splits: an encoder fitted on train alone raises in
    # transform() if the test set holds a category the training set never showed.
    le.fit(pd.concat([train[feature], test[feature]], ignore_index=True))
    # Transform from the untouched raw frames (not from X_train / X_test) so the cell
    # gives the same result when it is re-run after the columns were already encoded.
    X_train[feature] = le.transform(train[feature])
    X_test[feature] = le.transform(test[feature])

In [None]:
#Hold out 30% of the labelled rows for validation; the fixed random_state makes the split repeatable

split_options = dict(test_size=0.3, random_state=42)
x_tr, x_val, y_tr, y_val = train_test_split(X_train, y_train, **split_options)

In [None]:
#create the baseline XGBoost regressor: only the seed and the tree-construction method are set
# NOTE(review): 'gpu_hist' presumably needs a GPU-enabled session, and newer XGBoost releases
# appear to deprecate it in favour of tree_method='hist' with device='cuda' — confirm against
# the installed version.

xgb_params = {'random_state': 42, 'tree_method': 'gpu_hist'}
xgmodel = XGBRegressor(**xgb_params)

#fit on the training part of the split (the bare call stays last so the cell still shows its result)
xgmodel.fit(x_tr, y_tr)

In [None]:
# Score the fitted XGBoost model on the held-out validation rows
predictions_val = xgmodel.predict(x_val)

# RMSE is the square root of the mean squared error
mse_val = mean_squared_error(y_val, predictions_val)
score_rmse_val = math.sqrt(mse_val)
print(f'Base XGBoost RMSE - Validation: {score_rmse_val}')

In [None]:
# Use the fitted XGBoost model's predict method on the encoded test features
predictions = xgmodel.predict(X_test)

# bare expression: shows the predictions as the cell output
predictions


In [None]:
# Display the sample-submission frame as loaded from sample_submission.csv
submission

In [None]:
# Store the model's test predictions in the submission frame's 'target' column,
# write the frame to CSV without the index, then preview the first rows
output_path = 'prediction_markstent-PlaygroundFeb1.csv'
submission['target'] = predictions
submission.to_csv(output_path, index=False)
submission.head()