In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print them one per line in walk order.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt, seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold,\
GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Load the Yeh concrete dataset CSV from the Kaggle input directory
df = pd.read_csv('/kaggle/input/yeh-concret-data/Concrete_Data_Yeh.csv')
# Preview the first rows of the frame
df.head()

In [None]:
# Summary of the columns: dtypes and non-null counts
df.info()

In [None]:
# Pairwise relationships between the columns of df (seaborn pairplot)
sns.pairplot(df)

In [None]:
# Box plots laid out on a 2x4 grid of subplots.
# NOTE(review): x='csMPa' appears to move the target column to the index so
# that only the remaining columns get a panel — confirm against pandas docs.
box_options = dict(kind='box', subplots=True, layout=(2, 4), figsize=(20, 10))
df.plot(x='csMPa', **box_options)
plt.show()

In [None]:
# Drop rows whose age falls outside the 1.5 * IQR fences
Q1, Q3 = df.age.quantile(0.25), df.age.quantile(0.75)
IQR = Q3 - Q1
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# between() is inclusive on both ends, matching >= lower and <= upper
age_within_fences = df.age.between(lower, upper)
df = df[age_within_fences]
df.info()

In [None]:
# Box plot of the age column after the outlier filter
df['age'].plot(kind='box')

In [None]:
# Separate the target (csMPa) from the features, then keep 70% of the rows for training
y = df['csMPa']
X = df.drop(columns='csMPa')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=100
)

In [None]:
# Cumulative explained variance as a function of the number of principal components.
# The features are min-max scaled first so the curve describes the same input
# that the modelling pipelines (scaler -> PCA -> regressor) hand to their PCA step;
# on raw features the large-magnitude columns would dominate the variance.
pca_comps = [2,3,4,5,6,7]

X_train_scaled = MinMaxScaler().fit_transform(X_train)

# A single full fit is enough: the running sum over the first k ratios is the
# total explained variance ratio of a k-component PCA, so no refit per k is needed.
pca = PCA().fit(X_train_scaled)
cumulative_evr = np.cumsum(pca.explained_variance_ratio_)
evr = [cumulative_evr[k - 1] for k in pca_comps]

plt.plot(pca_comps, evr, marker='o')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance ratio')
plt.grid(axis='y', alpha=0.7)
plt.show()

In [None]:
# Ridge regression on min-max scaled, PCA-reduced features
ridge_steps = [
    ('scaler', MinMaxScaler()),
    ('pca', IncrementalPCA()),
    ('ridge', Ridge()),
]
ridge = Pipeline(ridge_steps)

# Shuffled 5-fold splitter, reused by the later model-selection cells
folds = KFold(n_splits=5, shuffle=True, random_state=56)

# Exhaustive grid over the regularisation strength and the number of components
hyp = {
    'ridge__alpha': [1e-8, 1e-6, 1e-4, 0.01, 0.1, 0.15, 0.18, 0.2, 0.24, 0.3, 0.6, 0.8],
    'pca__n_components': list(range(2, 9)),
}

grid_ridge = GridSearchCV(
    estimator=ridge,
    param_grid=hyp,
    cv=folds,
    n_jobs=-1,
    scoring='r2',
    verbose=1,
    return_train_score=True,
)
grid_ridge.fit(X_train, y_train)

grid_ridge.best_score_

In [None]:
# Keep the best pipeline found by the ridge grid search and display it
ridge_model = grid_ridge.best_estimator_
ridge_model

In [None]:
# Baseline XGBoost pipeline (no tuning yet), scored by cross-validated R^2
xgb_steps = [
    ('scaler', MinMaxScaler()),
    ('pca', IncrementalPCA()),
    ('xgb', XGBRegressor(random_state=50)),
]
xgb = Pipeline(xgb_steps)

cv_score = cross_val_score(xgb, X_train, y_train, scoring='r2', cv=folds)
cv_score.mean()

In [None]:
# Hyperparameter tuning of the XGBoost pipeline with RandomizedSearchCV
xgb_search_space = {
    'pca__n_components': list(range(2, 9)),
    'xgb__n_estimators': [100, 200, 300, 400],
    'xgb__learning_rate': [0.05, 0.08, 0.1, 0.2, 0.4, 0.6, 0.8],
    'xgb__gamma': [1e-6, 1e-4, 0.01, 0.1, 0.3, 0.5, 0.8],
    'xgb__reg_lambda': [1e-9, 1e-6, 0.001, 0.01, 0.1, 0.5, 0.7],
}
hyp = [xgb_search_space]

rnd = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=hyp,
    n_iter=50,
    cv=folds,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
    random_state=10,
)
rnd.fit(X_train, y_train)

In [None]:
rnd.best_score_, rnd.best_params_

In [None]:
# Finer GridSearchCV around the region suggested by the RandomizedSearchCV results.
# NOTE(review): n_components=8 and n_estimators=400 are hard-coded here, presumably
# copied from rnd.best_params_ of an earlier run — confirm they still match.
refined_steps = [
    ('scaler', MinMaxScaler()),
    ('pca', IncrementalPCA(n_components=8)),
    ('xgb', XGBRegressor(n_estimators=400, random_state=50)),
]
xgb = Pipeline(refined_steps)

grid_hyp = {
    'xgb__reg_lambda': [0.09, 0.10, 0.11],
    'xgb__learning_rate': [0.07, 0.08, 0.09, 0.1],
    'xgb__gamma': [1e-6, 1e-5, 1e-7],
}

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=grid_hyp,
    cv=folds,
    scoring='r2',
    n_jobs=-1,
    verbose=1,
)
grid_xgb.fit(X_train, y_train)

grid_xgb.best_score_

In [None]:
# Keep the best pipeline found by the XGBoost grid search and display it
xgb_model = grid_xgb.best_estimator_
xgb_model

In [None]:
# Stack the tuned ridge and XGBoost pipelines under a linear-regression meta-learner
lr = LinearRegression()
models = [
    ('ridge', ridge_model),
    ('xgb', xgb_model),
]
stk = StackingRegressor(final_estimator=lr, estimators=models)

# Cross-validated R^2 of the stacked model on the training split
cv_score = cross_val_score(stk, X_train, y_train, scoring='r2', cv=folds)
cv_score.mean()

### The Stacking Regressor, which combines the two previous models, performs better.

In [None]:
# Fit the stacked model on the training split, then predict the held-out split
y_pred = stk.fit(X_train, y_train).predict(X_test)

In [None]:
# Score the held-out predictions: R^2 and root-mean-squared error
r2 = r2_score(y_test, y_pred)
RMSE = mean_squared_error(y_test, y_pred) ** 0.5
print(f'Validation R-squared score = {round(r2,2)}')
print(f'RMSE = {round(RMSE,2)}')

In [None]:
# Actual vs. predicted values with the signed error, ordered by original row index
pred = (
    pd.DataFrame({'actual': y_test, 'pred': y_pred})
    .assign(error=lambda frame: frame['actual'] - frame['pred'])
    .sort_index()
)
pred.head()

In [None]:
# Actual and predicted values drawn against the row index
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(pred['actual'], label='actual')
ax.plot(pred['pred'], label='predicted')
ax.set_xlabel('index', fontsize=20)
ax.set_ylabel('csMPa', fontsize=20)
ax.legend(fontsize=20)
plt.show()

In [None]:
# Prediction errors per row, with a horizontal reference line at zero
fig, ax = plt.subplots(figsize=(20, 8))
ax.scatter(pred.index, pred['error'], color='red')
ax.axhline(0)
ax.set_xlabel('index', fontsize=20)
ax.set_ylabel('errors', fontsize=20)
plt.show()