# Import Libraries Here

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from numpy import nan
from numpy import absolute

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.regressionplots import *
from yellowbrick.regressor import CooksDistance

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the concrete compressive-strength data, report its dimensions and preview it.
df = pd.read_csv("/kaggle/input/gccsv/compresive_strength_concrete.csv")
print(f'Dataset Shape:{df.shape}')
df.head()

In [None]:
# Replace the original headers with short identifiers.
# The assignment is positional, so the frame must hold exactly nine columns;
# the target (CC_Strength) is kept as the last column.
# NOTE(review): assumes the CSV column order matches this list -- confirm against the file.
df.columns = [
    "Cement", "BlastFurnaceSlag", "FlyAsh", "Water", "Superplasticizer",
    "CoarseAggregate", "FineAggregate", "Age", "CC_Strength",
]

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would emit a stray "None".
df.info()
print()
# Missing-value count per column
print(df.isnull().sum())

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Scatter-plot matrix of every pair of columns.
sns.pairplot(data=df)

In [None]:
# Distribution of every column on a 3x3 grid (one panel per column).
# The style is set before the axes are created so that it applies to them.
sns.set_style("darkgrid")
fig, ax = plt.subplots(3, 3, figsize=(17, 12), constrained_layout=True)
ax = ax.flatten()
for num, col in enumerate(df.columns):
    # sns.distplot is deprecated; histplot with a KDE overlay is its replacement.
    # stat="density" keeps the y-axis on a density scale, as distplot drew it.
    sns.histplot(df[col], kde=True, stat="density", ax=ax[num])
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.figure(figsize=(12, 9))
sns.heatmap(corr_matrix, annot=True, fmt='.2f')

In [None]:
# Rank column pairs by correlation, strongest positive first.
# The correlation matrix is symmetric, so only its strict upper triangle is kept:
# that removes the trivial self-correlations on the diagonal and prevents each
# pair from appearing twice, once as (A, B) and once as (B, A).
corr = df.corr()
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
correlation_unstakced = (
    corr.where(upper_triangle)   # everything outside the upper triangle becomes NaN
        .unstack()
        .dropna()
        .sort_values(ascending=False)
        .round(2)
)
correlation_unstakced.head(10)

Concrete Strength has high positive correlation with Cement (0.50), Superplasticizer (0.37), and Age (0.33)

Concrete Strength has a negative correlation with Water (-0.29)

In [None]:
# Strength against cement content; colour encodes Age and marker size encodes Water.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    data=df,
    x='Cement',
    y='CC_Strength',
    hue='Age',
    size='Water',
    sizes=(30, 250),
    ax=ax,
)

# Outlier Detection

In [None]:
# Features are every column except the last; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not imported again here. No test_size is passed, so the
# scikit-learn default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit an ordinary least squares model, with an intercept column added, on the training split.
X_train_const = sm.add_constant(X_train)
model = sm.OLS(y_train, X_train_const).fit()
model_summary = model.summary()
print(model_summary)

### Determine influential points which have high residuals and high leverage


In [None]:
# Influence diagnostics for the fitted OLS model.
influence = model.get_influence()
influence_summary = influence.summary_frame()
print(influence_summary)

# Externally studentized residuals
studentized_residuals = influence.resid_studentized_external
# Cook's distance: statsmodels returns (distances, p-values)
(cooks, p) = influence.cooks_distance
# DFFITS: statsmodels returns (values, recommended threshold). The second item
# is not a p-value, so it gets its own name instead of being stored in `p`.
(dffits, dffits_threshold) = influence.dffits
# The diagonal of the hat matrix gives the leverage of each observation
leverage = influence.hat_matrix_diag

print('\n')
print('Leverage vs. Studentized Residuals')

# Plot the studentized residuals so the data match the title and axis label.
# x and y are passed by keyword: current seaborn releases no longer accept
# them as positional arguments.
sns.regplot(x=leverage, y=studentized_residuals, fit_reg=False)

plt.title('Leverage vs. Studentized Residuals')
plt.xlabel('Leverage')
plt.ylabel('Studentized Residuals')
plt.show()

In [None]:
# Display the influence diagnostics table (rendered as a rich DataFrame by the notebook).
influence_summary

In [None]:
# Put the training target next to its influence diagnostics, aligned on the row index.
y_df = pd.DataFrame(y_train, columns=['CC_Strength'])
concat_y_df = pd.concat([y_df, influence_summary], axis='columns')
concat_y_df

In [None]:
# Identify outliers manually: rows whose absolute studentized residual exceeds 3
studentized_residual = concat_y_df['student_resid']
concat_y_df.loc[studentized_residual.abs() > 3, 'CC_Strength']

In [None]:
features = df.columns[:-1].to_list()

# Number of observations the OLS model was fitted on. The influence table has
# one row per fitted observation, so n is read from it rather than from the
# full dataset, which also contains the held-out test rows.
n = influence_summary.shape[0]

# Number of predictors (the intercept is not counted)
k = len(features)

# Leverage cutoff: 2 * (k + 1) / n, i.e. twice the average hat value
cutoff_leverage = ((2*k)+2)/n

In [None]:
# Target values of the rows whose leverage (hat-matrix diagonal) exceeds the cutoff
leverage = concat_y_df['hat_diag']
high_leverage_mask = leverage.abs() > cutoff_leverage
print(concat_y_df.loc[high_leverage_mask, 'CC_Strength'])

In [None]:
# Rows flagged by each rule: |studentized residual| > 3, and leverage above the cutoff
resid_mask = np.abs(concat_y_df['student_resid']) > 3
leverage_mask = np.abs(leverage) > cutoff_leverage
outliers_student_resid = concat_y_df[resid_mask]
high_leverage = concat_y_df[leverage_mask]
outliers_student_resid.shape, high_leverage.shape

In [None]:
# Influence plot of the fitted model, using Cook's distance as the criterion.
# Observations with high leverage or large residuals are labelled to show potential influence points.
fig, ax = plt.subplots(figsize=(15, 8))
fig = sm.graphics.influence_plot(model, criterion="cooks", alpha=0.5, ax=ax)

In [None]:
# Visualize outliers using Cook's distance.
# Data points that are higher than the red dotted line are considered to be outliers.
# The visualizer gets its own name: `outliers_cooks` is the name a later cell
# gives to the Series of Cook's-distance outliers, and sharing it would leave
# that name bound to a different kind of object depending on execution order.
cooks_visualizer = CooksDistance()
cooks_visualizer.fit(X, y)
cooks_visualizer.show()

In [None]:
# Cook's distance rule: flag observations whose distance exceeds three times the mean distance
cooks_d = concat_y_df['cooks_d']
cutoff_cooks = cooks_d.mean() * 3
outliers_cooks = cooks_d[cooks_d.abs() > cutoff_cooks]

In [None]:
# DFFITS rule: flag observations with |DFFITS| > 2 * sqrt(p / n), where
#   p = number of estimated parameters: the k predictors plus the intercept
#       that add_constant put into the model
#   n = number of observations the model was fitted on (concat_y_df holds one
#       diagnostics row per fitted observation)
n_params = k + 1
n_fitted = len(concat_y_df)
cutoff_dffits = 2 * np.sqrt(n_params / n_fitted)
outliers_dffits = concat_y_df[concat_y_df['dffits'].abs() > cutoff_dffits]

In [None]:
# Row labels flagged by each detection method, as plain Python lists
index_student_resid = outliers_student_resid.index.tolist()
index_cooks = outliers_cooks.index.tolist()
index_dffits = outliers_dffits.index.tolist()

In [None]:
# Refit OLS after dropping the rows flagged by each method and record AIC / BIC.
# NOTE(review): each method drops a different set of rows, so the three fits are
# made on different data; treat the AIC/BIC ranking as a rough guide only.
index_list = [index_student_resid, index_cooks, index_dffits]
model_names = ['Studentized Residuals', 'Cooks Distance', 'DFFITS']
features = df.columns[:-1].to_list()
AIC = []
BIC = []
for index, name in zip(index_list, model_names):
    # Loop-local names (suffix _i) keep this comparison from overwriting the
    # notebook-level X / y / X_train / y_train / model objects, which the
    # diagnostics cells above rely on when they are re-run.
    X_i = df[features].drop(index).values
    y_i = df['CC_Strength'].drop(index).values
    X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
        X_i, y_i, test_size=0.2, random_state=42
    )
    ols_i = sm.OLS(y_train_i, sm.add_constant(X_train_i)).fit()
    AIC.append(ols_i.aic)
    BIC.append(ols_i.bic)

In [None]:
# AIC / BIC of the three outlier-removal strategies, lowest first.
# A low AIC and BIC means less information is lost, so the lower the values the better the model.
compare_model = pd.DataFrame({'Model Name': model_names, 'AIC': AIC, 'BIC': BIC})
compare_model.sort_values(by=['AIC', 'BIC'], ascending=True)

In [None]:
# DFFITS gives the lowest AIC and BIC, so the rows it flagged are the ones removed.
df_dffits_removed = df.drop(index=index_dffits)
df_dffits_removed.shape

In [None]:
# Box plot of every column after the DFFITS-flagged rows have been dropped.
df_dffits_removed.boxplot()

# Compare the number of outliers before and after using DFFITS

In [None]:
# Number of outliers per column in the original dataset.
# A value is counted when its z-score lies more than 3 standard deviations from
# the mean in either direction (|z| > 3), so unusually low values count as well.
outlier_num_list = []
for col in df.columns:
    z_score = (df[col] - df[col].mean()) / df[col].std()
    outlier_num_list.append(int((z_score.abs() > 3).sum()))
outliers_df = pd.DataFrame(df.columns, columns = ['Features'])
outliers_df['outliers_num'] = outlier_num_list
outliers_df.sort_values(by = 'outliers_num', ascending = False)

In [None]:
# Number of outliers per column after removing the DFFITS-flagged rows.
# A value is counted when its z-score lies more than 3 standard deviations from
# the mean in either direction (|z| > 3), so unusually low values count as well.
outlier_num_list = []
for col in df_dffits_removed.columns:
    z_score = (df_dffits_removed[col] - df_dffits_removed[col].mean()) / df_dffits_removed[col].std()
    outlier_num_list.append(int((z_score.abs() > 3).sum()))
outliers_df = pd.DataFrame(df_dffits_removed.columns, columns = ['Features'])
outliers_df['outliers_num'] = outlier_num_list
outliers_df.sort_values(by = 'outliers_num', ascending = False)

In [None]:
# Some outliers remain, so in a copy of the data every value outside the
# 1.5 * IQR fences of its column is replaced with that column's median.
df_median = df_dffits_removed.copy()

for col in df_median.columns:
    q1 = df_median[col].quantile(0.25)
    q3 = df_median[col].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    outside_fences = (df_median[col] < lower_fence) | (df_median[col] > upper_fence)
    df_median.loc[outside_fences, col] = df_median[col].median()
  

In [None]:
# Number of outliers per column after the median replacement.
# A value is counted when its z-score lies more than 3 standard deviations from
# the mean in either direction (|z| > 3), so unusually low values count as well.
outlier_num_list = []
for col in df_median.columns:
    z_score = (df_median[col] - df_median[col].mean()) / df_median[col].std()
    outlier_num_list.append(int((z_score.abs() > 3).sum()))
outliers_df = pd.DataFrame(df_median.columns, columns = ['Features'])
outliers_df['outliers_num'] = outlier_num_list
outliers_df = outliers_df.sort_values(by = 'outliers_num', ascending = False)
outliers_df

In [None]:
# One box plot per column of the median-replaced data, laid out on a 3x3 grid.
fig, ax = plt.subplots(nrows=3, ncols=3, squeeze=True, figsize=(15, 15))
ax = ax.flatten()
for panel, col in zip(ax, df_median.columns):
    sns.boxplot(data=df_median, x=col, ax=panel)
    

# Start building a model

In [None]:
# Model on the data with the DFFITS-flagged rows removed: the features are all
# columns but the last, the target is the last column.
X = df_dffits_removed.iloc[:, :-1]
y = df_dffits_removed.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle=True, random_state=42)

# Min-max scaling: the scaler is fitted on the training split only and then
# applied to the test split, so the test data do not influence the scaling.
sc = StandardScaler()  # instantiated but not used in this cell
minmax = MinMaxScaler()
X_train = minmax.fit_transform(X_train)
X_test = minmax.transform(X_test)

# Every estimator with a random component gets a fixed seed so the comparison
# below gives the same numbers after a kernel restart.
SEED = 42

lin_reg = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elastic_net = ElasticNet()
sgd_reg = SGDRegressor(random_state=SEED)
rand_reg = RandomForestRegressor(random_state=SEED)
tree_reg = DecisionTreeRegressor(random_state=SEED)
gb_boost = GradientBoostingRegressor(random_state=SEED)
ada_boost = AdaBoostRegressor(random_state=SEED)
knn_reg = KNeighborsRegressor()
svm = SVR(kernel='linear')
xgb_reg = XGBRegressor(random_state=SEED)

regressor_list = [lin_reg, lasso, ridge, elastic_net, sgd_reg, rand_reg,
                  tree_reg, gb_boost, ada_boost, knn_reg, svm, xgb_reg]

In [None]:
# Fit every candidate on the scaled training data and score it on the test split.
# The predictions are kept as well so they can be plotted afterwards.
rmse, mse, mae, r2, y_predicted = [], [], [], [], []
for reg in regressor_list:
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    y_predicted.append(y_pred)
    rmse.append(np.sqrt(test_mse))
    mse.append(test_mse)
    mae.append(mean_absolute_error(y_test, y_pred))
    r2.append(r2_score(y_test, y_pred))

In [None]:
# True vs. predicted values for every regressor, one panel per model (4x3 grid).
fig, ax = plt.subplots(4, 3, sharex = True, sharey = True, figsize = (15,13))
models = ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net', 'SGD Regressor',
         'RandomForest Regressor', 'DecisionTree Regressor', 'GradientBoost Regression', 
          'AdaBoost Regressor', 'KNN Regressor', 'SVM', 'XGBoost Regressor']
y_pred_models = y_predicted
ax = ax.flatten()
# End points of the dashed "perfect prediction" diagonal; they do not change
# between panels, so they are computed once.
diagonal = [y_test.min(), y_test.max()]
# The loop variable is named model_name rather than `model`, so it does not
# replace the fitted statsmodels results object of that name with a string.
for num, (pred, model_name) in enumerate(zip(y_pred_models, models)):
    ax[num].scatter(pred, y_test, s=20)
    ax[num].plot(diagonal, diagonal, 'k--', lw=2)
    ax[num].set_title(model_name, fontsize = 14)

fig.supxlabel('Predicted Values', fontsize = 14)
fig.supylabel('True Values', fontsize = 14)
plt.suptitle("True Values vs Predicted Values", fontsize = 14)
# Leave room at the top for the suptitle
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

### Compare different models according to RMSE, MSE, MAE, and R2 Score

In [None]:
# One row per fitted regressor, best (lowest RMSE) first.
# The Model column holds each estimator's class name rather than the estimator
# object itself, whose multi-line repr would make the table unreadable.
compare_regressor = pd.DataFrame({
    'Model': [type(reg).__name__ for reg in regressor_list],
    'rmse': rmse,
    'mse': mse,
    'mae': mae,
    'r2': r2,
})
compare_regressor.sort_values(by = 'rmse', ascending = True)

#### It shows that the XGBoost Regressor has the lowest RMSE score, making it the best model
#### Use RandomizedSearchCV to tune its hyperparameters

In [None]:
# Randomized hyperparameter search for XGBoost.
# Both the estimator and the search are seeded: the search samples 25 random
# parameter combinations, so without a seed every run would try a different
# set of candidates and could report a different best model.
xgb_reg = XGBRegressor(random_state=42)

params = {'max_depth': [3, 5, 6, 10, 15, 20],
         'learning_rate': [0.01, 0.1, 0.2, 0.3],
         'subsample': np.arange(0.5, 1.0, 0.1),
        'colsample_bytree': np.arange(0.4, 1.0, 0.1),
         'n_estimators': [100, 200, 300, 400, 500]}
rand_search = RandomizedSearchCV(estimator = xgb_reg, 
                                param_distributions = params,
                                scoring='neg_mean_squared_error',
                                n_iter = 25,
                                verbose = 1,
                                return_train_score = True,
                                random_state = 42)
search = rand_search.fit(X_train, y_train)
search

In [None]:
# Display the estimator built from the best parameter combination found by the search.
search.best_estimator_

In [None]:
# Score the tuned XGBoost model on the held-out test split.
# The search was created without a `refit` argument, and RandomizedSearchCV
# refits the best estimator on the whole training set by default, so
# best_estimator_ is used as is instead of being fitted a second time.
best_xgb = search.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

# MSE is computed once and reused for the RMSE
test_mse = mean_squared_error(y_test, y_pred_xgb)

print('XGBoost Regressor')
print('rmse: {:.2f}, mse: {:.2f}, mae: {:.2f}, R2 score: {:.2f}'.format(np.sqrt(test_mse),
                                                        test_mse,
                                                        mean_absolute_error(y_test, y_pred_xgb),
                                                        r2_score(y_test, y_pred_xgb)))