In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# For Training the Model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split# Sampling the datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


In [2]:
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Read the competition's train and test CSVs in one pass; the first path
# becomes `train_datasets`, the second `test_datasets`.
train_datasets, test_datasets = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-feb-2021/train.csv",
        "/kaggle/input/tabular-playground-series-feb-2021/test.csv",
    )
)

In [3]:
# Show the first five training rows as a quick sanity check of the load.
train_datasets.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


In [4]:
# Split the feature columns by name prefix: "cont*" are the continuous
# features and "cat*" the categorical ones.
conti_columns = list(
    filter(lambda name: name.startswith("cont"), train_datasets.columns)
)
print(conti_columns, len(conti_columns), sep="\n")

cat_columns = list(
    filter(lambda name: name.startswith("cat"), train_datasets.columns)
)
print(cat_columns)

['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
14
['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']


In [5]:
# Replace every categorical column with integer codes.
#
# The encoder is fitted once per column on the values from train AND test
# combined, then applied to each frame with `transform`. Fitting the two
# frames independently would assign codes from each frame's own sorted set of
# categories, so a category missing from one frame would shift the codes of
# the others and the model would see inconsistent features at prediction
# time. Fitting on the union also avoids `transform` raising on a category
# that only appears in the test file.
labelEncodeing = LabelEncoder()
for individual_column in cat_columns:
    labelEncodeing.fit(
        pd.concat(
            [train_datasets[individual_column], test_datasets[individual_column]],
            ignore_index=True,
        )
    )
    train_datasets[individual_column] = labelEncodeing.transform(train_datasets[individual_column])
    test_datasets[individual_column] = labelEncodeing.transform(test_datasets[individual_column])

In [6]:
# Standardise the continuous features.
#
# The scaler learns its per-column mean and standard deviation from the
# training frame only; the test frame is then transformed with those same
# statistics. Re-fitting on the test frame would put the two frames on
# different scales, so the model would be applied to features that do not
# match what it was trained on.
scaling = StandardScaler()
train_datasets[conti_columns] = scaling.fit_transform(train_datasets[conti_columns])
test_datasets[conti_columns] = scaling.transform(test_datasets[conti_columns])

In [7]:
# Feature matrix: the categorical columns followed by the continuous ones.
X = train_datasets[[*cat_columns, *conti_columns]]
# Regression target.
y = train_datasets.loc[:, "target"]

In [9]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Fit a gradient-boosted regressor on the training split.
# `random_state` is pinned (to the same seed used for the split) so that
# repeated runs build the same trees; without it, ties between equally good
# splits may be broken differently from run to run.
appraoch_model = GradientBoostingRegressor(random_state=42)
appraoch_model.fit(x_train, y_train)

In [11]:
# Predict the held-out rows.
predictions_result = appraoch_model.predict(x_test)

# Score the predictions against the held-out targets: MAE, MSE and R^2,
# computed in that order.
final_result_mae, final_result_mse, final_result_r2 = (
    metric(y_test, predictions_result)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Label the value so the cell output can be read without looking at the code.
print(f"Mean absolute error (MAE): {final_result_mae:.5f}")

In [None]:
# Label the value so the cell output can be read without looking at the code.
print(f"Mean squared error (MSE): {final_result_mse:.5f}")

In [None]:
# Label the value so the cell output can be read without looking at the code.
print(f"R^2 score: {final_result_r2:.5f}")

In [None]:
# Scatter of held-out targets (x) against the model's predictions (y).
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(x=y_test, y=predictions_result, alpha=0.6)
scatter_axes.set(
    xlabel="Actual",
    ylabel="Predicted",
    title="Actual vs Predicted ",
)
scatter_axes.grid(True)
plt.show()


# Histogram (with KDE overlay) of the residuals, i.e. actual minus predicted,
# on the held-out split.
plt.figure(figsize=(10, 6))
sns.histplot(y_test - predictions_result, kde=True, bins=30)
plt.xlabel("Residual")
plt.ylabel("Frequency/ Range")
# This figure shows residuals, so it must not reuse the scatter plot's
# "Actual vs Predicted" caption.
plt.title("Residual Distribution")
plt.grid(True)
plt.show()
