
# XGB Regressor


In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

import matplotlib.pyplot as plt       

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


from xgboost import XGBRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression

        
# Directory holding the competition CSV files; the loading cell joins
# individual file names onto this path.
input_path = Path(
    '/kaggle/input/tabular-playground-series-jan-2021/'
)

# Read in the data files

In [None]:
# Load the three competition files, using each file's 'id' column as the
# index, and preview the first five rows of every frame.
train = pd.read_csv(input_path.joinpath('train.csv'), index_col='id')
print('Train df')
display(train.head(5))

print('Test df')
test = pd.read_csv(input_path.joinpath('test.csv'), index_col='id')
display(test.head(5))

print('Sample submission')
submission = pd.read_csv(
    input_path.joinpath('sample_submission.csv'), index_col='id'
)
display(submission.head(5))

In [None]:
%matplotlib inline
train.hist(bins=100,figsize=(20,15))

In [None]:
# Line plot of every column of the training frame against its index.
train.plot(kind='line', figsize=(12, 6))
plt.show()

In [None]:
# Box-and-whisker plot for every column of the training frame.
box_figsize = (12, 6)
train.boxplot(figsize=box_figsize)
plt.show()

In [None]:
# Treat rows whose target is below 4.4 as outliers and remove them by index label.
outlier_ids = train.index[train.target < 4.4]
train = train.drop(index=outlier_ids)

In [None]:
# Pairwise correlation of all columns, then each column's correlation with
# the target, listed from highest to lowest.
corr_matrix = train.corr()
corr_matrix.loc[:, 'target'].sort_values(ascending=False)


Plots for the attributes with the strongest correlation

In [None]:

from pandas.plotting import scatter_matrix

# Pairwise scatter plots (with per-column distributions on the diagonal)
# for a hand-picked subset of feature columns.
attributes = ['cont7', 'cont2', 'cont3', 'cont11', 'cont12']
scatter_matrix(train.loc[:, attributes], figsize=(12, 8))

## Pull out the target, and make a validation split

In [None]:
# Separate the label column from the features.
# NOTE: `pop` removes 'target' from `train` in place, so re-running this cell
# without first reloading `train` raises KeyError.
target = train.pop('target')

# Hold out 20% of the rows for validation. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train, target, train_size=0.80, random_state=42
)

In [None]:
# Short aliases for the full feature frame and the full target series.
X, y = train, target

In [None]:
# Line plot of every feature column against the index.
X.plot(kind='line')
plt.show()

In [None]:
# Line plot of the target values against the index.
y.plot(kind='line')
plt.show()

# Choosing a model

Build a model with the default parameters.

In [None]:
# Baseline: XGBoost regressor with the library's default hyperparameters.
xgbreg = XGBRegressor()
xgbreg.fit(X_train, y_train)
y_pred = xgbreg.predict(X_test)

# The printed value is the ROOT mean squared error on the validation split.
# It is computed with np.sqrt because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print('XGBRegressor default parameters - mean squared error: ', score)

Choosing parameters

In [None]:

# XGBoost regressor with explicitly chosen hyperparameters.
xgbreg_mod = XGBRegressor(n_estimators=100, learning_rate=0.1, n_jobs=4)
xgbreg_mod.fit(X_train, y_train)

# Predict with the model fitted in this cell (xgbreg_mod). Using the default
# `xgbreg` model here would just re-report the baseline score.
y_pred = xgbreg_mod.predict(X_test)

# Root mean squared error on the validation split; np.sqrt is used because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred))
print('Mean squared error: ', score)

In [None]:
# SGDRegressor: linear model fitted by stochastic gradient descent, with no
# regularisation penalty. random_state is fixed so the fit is repeatable.
sgd_reg = SGDRegressor(max_iter=100, penalty=None, eta0=0.05, random_state=42)

# Pass the target as a NumPy array; Series.ravel() is deprecated in pandas 2.2.
sgd_reg.fit(X_train, y_train.to_numpy())
y_pred_sgd = sgd_reg.predict(X_test)

# Root mean squared error on the validation split; np.sqrt is used because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred_sgd))
print('SGDRegressor Mean squared error: ', score)

In [None]:
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures

# Expand the features with all degree-2 terms; include_bias=False omits the
# constant column.
poly_features = PolynomialFeatures(degree=2, include_bias=False)

# Fit the transformer on the training split only, then reuse the fitted
# transformer for the validation split (transform, not fit_transform).
X_poly = poly_features.fit_transform(X_train)
X_poly_test = poly_features.transform(X_test)

print(X_train.head())
print(X_poly)


In [None]:

# Ordinary least-squares regression on the polynomial feature matrix.
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y_train)

y_pred_lin_reg = lin_reg.predict(X_poly_test)

# Root mean squared error on the validation split; np.sqrt is used because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred_lin_reg))
print('LinearRegression with poly - mean squared error: ', score)

In [None]:
# SGD-fitted linear model on the polynomial feature matrix (this rebinds
# `sgd_reg`). random_state is fixed so the fit is repeatable.
sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)

# Pass the target as a NumPy array; Series.ravel() is deprecated in pandas 2.2.
sgd_reg.fit(X_poly, y_train.to_numpy())
y_pred_sgd = sgd_reg.predict(X_poly_test)

# Root mean squared error on the validation split; np.sqrt is used because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred_sgd))
print('SGDRegressor with polynominal transform mean squared error: ', score)

In [None]:
# XGBoost regressor trained on the polynomial feature matrix.
xgb_poly = XGBRegressor(learning_rate=0.1, n_estimators=100)
xgb_poly.fit(X=X_poly, y=y_train)

In [None]:
# Score the XGBoost model trained on polynomial features. The prediction
# must come from xgb_poly; calling lin_reg here would re-report the linear
# regression score under the XGBoost name.
y_pred_xgb_poly = xgb_poly.predict(X_poly_test)

# Root mean squared error on the validation split; np.sqrt is used because
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, y_pred_xgb_poly))
print('Mean squared error: ', score)

Best model

In [None]:
# Select the XGBoost model trained on polynomial features as the model used
# for the final validation score and the submission predictions.
my_model=xgb_poly

In [None]:
# Validation score of the selected model on the polynomial features.
my_y_pred = my_model.predict(X_poly_test)

# Root mean squared error; np.sqrt is used because `squared=False` was
# deprecated in scikit-learn 1.4 and removed in 1.6.
score = np.sqrt(mean_squared_error(y_test, my_y_pred))
print(score)

## Make submissions

In [None]:
# Apply the polynomial expansion already fitted on the training split to the
# competition test set (transform only; the transformer is not refitted).
poly_test = poly_features.transform(test)

# Fill the sample-submission frame with the selected model's predictions and
# write it out as CSV.
# NOTE(review): assumes `test` and `submission` rows are in the same id order - confirm.
submission['target'] = my_model.predict(poly_test)
submission.to_csv('poly_xgb_regressor.csv')