In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the concrete dataset from the Kaggle input mount and preview its first rows.
csv_path = '/kaggle/input/yeh-concret-data/Concrete_Data_Yeh.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
plt.figure(figsize=(8,7))
sns.heatmap(data.corr(),annot=True)
plt.show()

In [None]:
data.info()

In [None]:
data.isnull().sum()

#### The data has no null values.

In [None]:
# Pairwise relationship plots across all columns of the frame.
sns.pairplot(data)
plt.show()

In [None]:
# The age column looks as if it holds categories rather than continuous values.

In [None]:
# Number of distinct values in the age column.
data['age'].nunique()
# We'll proceed by treating it as a numeric column only.

In [None]:
# checking the distribution of the data
for i in data.iloc[:,:-1].columns:
    sns.kdeplot(data[i])
    plt.show()

In [None]:
data.columns

In [None]:
for i in data.iloc[:,:-1].columns:
    print(i,end=' :')
    print(data[i].nunique())

In [None]:
# Checking for outliers in the data: one box plot per column (last column excluded).
# The series is passed explicitly as ``x``. Older seaborn releases read the first
# positional argument as ``x`` while 0.12+ reads it as ``data``, so the keyword
# makes the plot orientation independent of the installed version.
for i in data.iloc[:,:-1].columns:
    sns.boxplot(x=data[i])
    plt.show()

In [None]:
data1 = data.copy(deep=True)

In [None]:
data1.isnull().sum()

# Outlier Treatment
for i in data1.iloc[:,:-1].columns:
    q1 = data1[i].quantile(0.25)
    q3 = data1[i].quantile(0.75)
    iqr = q3-q1
    ub = q3+(1.5*iqr)
    lb = q1-(1.5*iqr)
    data1[i]=data1[~((data1[i] < lb) | (data1[i] > ub))]

data1.isnull().sum()

for i in data1.columns:
    data1[i].fillna(method='ffill',inplace=True)

data1.isnull().sum()

for i in data1.iloc[:,:-1].columns:
    sns.boxplot(data1[i])
    plt.show()

for i in data1.iloc[:,:-1].columns:
    sns.kdeplot(data1[i])
    plt.show()

In [None]:
# Per-column skewness of data1 after the outlier treatment and fill.
data1.skew()

## The remaining skewness can be corrected using the PowerTransformer

In [None]:
from sklearn.preprocessing import PowerTransformer

In [None]:
pt = PowerTransformer()
X = pt.fit_transform(data1.iloc[:,:-1])
X = pd.DataFrame(X,columns=data1.iloc[:,:-1].columns)
X.head()

In [None]:
X.skew()

In [None]:
Y = data[['csMPa']]

In [None]:
# Final Data prepared for models is
# Input Variables X and Target column Y.

In [None]:
import statsmodels.api as sm

In [None]:
# Ordinary least squares of the target on the transformed features plus an intercept.
inp, out = X, Y
c = sm.add_constant(inp)  # design matrix with the constant column added
model = sm.OLS(endog=out, exog=c).fit()
model.summary()

In [None]:
import scipy.stats as stats

In [None]:
stats.probplot(model.resid,plot=plt)

In [None]:
# Checking Normality of the residue
sns.distplot(model.resid)
plt.show()

# Data can be seen somewhat normal.

In [None]:
model.resid.skew()   # Skewness of residue is also under control.

In [None]:
# Checking Multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Variance inflation factor of every input column, shown from largest to smallest.
feature_matrix = inp.values
vif = pd.DataFrame({
    'VIF': [
        variance_inflation_factor(feature_matrix, col_idx)
        for col_idx in range(feature_matrix.shape[1])
    ],
    'Features': inp.columns,
})
vif.sort_values('VIF', ascending=False)

In [None]:
# Multicollinearity is also under control.

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Splitting the dataset
xtrain, xtest, ytrain, ytest = train_test_split(X,Y,test_size=0.25,random_state=0)

In [None]:
LR = LinearRegression()
LR.fit(xtrain,ytrain)

In [None]:
LR.score(xtrain,ytrain)

In [None]:
LR.score(xtest,ytest)

In [None]:
ytrain_pred = LR.predict(xtrain)
ytest_pred = LR.predict(xtest)

r2_train = r2_score(ytrain,ytrain_pred)
r2_test = r2_score(ytest,ytest_pred)

rmse_train = np.sqrt(mean_squared_error(ytrain,ytrain_pred))
rmse_test = np.sqrt(mean_squared_error(ytest,ytest_pred))

print('R2 Train: ',r2_train,end='    ')
print('R2 Test: ', r2_test)
print('RMSE Train: ',rmse_train,end='    ')
print('RMSE Test: ',rmse_test)

In [None]:
kf = KFold(shuffle=True,n_splits=5,random_state=0)
score1 = cross_val_score(LR,xtrain,ytrain,cv=kf,scoring='r2')
LR_be = np.mean(1-score1)
LR_ve = np.std(score1,ddof=1)
print('Bias Error: ',LR_be)
print('Variance Error: ',LR_ve)

In [None]:
Tab = pd.DataFrame()

In [None]:
Tab['LR'] = [LR_be,LR_ve]

In [None]:
Tab

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
KNR = KNeighborsRegressor()
KNR.fit(xtrain,ytrain)

In [None]:
KNR.score(xtrain,ytrain)

In [None]:
KNR.score(xtest,ytest)

In [None]:
ytrain_pred = KNR.predict(xtrain)
ytest_pred = KNR.predict(xtest)

r2_train = r2_score(ytrain,ytrain_pred)
r2_test = r2_score(ytest,ytest_pred)

rmse_train = np.sqrt(mean_squared_error(ytrain,ytrain_pred))
rmse_test = np.sqrt(mean_squared_error(ytest,ytest_pred))

print('R2 Train: ',r2_train,end='    ')
print('R2 Test: ', r2_test)
print('RMSE Train: ',rmse_train,end='    ')
print('RMSE Test: ',rmse_test)

In [None]:
# The model is highly Over fitting

In [None]:
kf = KFold(shuffle=True,n_splits=5,random_state=0)
score2 = cross_val_score(KNR,xtrain,ytrain,cv=kf,scoring='r2')
KNR_be = np.mean(1-score2)
KNR_ve = np.std(score2,ddof=1)
print('Bias Error: ',KNR_be)
print('Variance Error: ',KNR_ve)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
RF = RandomForestRegressor()
RF.fit(xtrain,ytrain)

In [None]:
RF.score(xtrain,ytrain)

In [None]:
RF.score(xtest,ytest)

In [None]:
ytrain_pred = RF.predict(xtrain)
ytest_pred = RF.predict(xtest)

r2_train = r2_score(ytrain,ytrain_pred)
r2_test = r2_score(ytest,ytest_pred)

rmse_train = np.sqrt(mean_squared_error(ytrain,ytrain_pred))
rmse_test = np.sqrt(mean_squared_error(ytest,ytest_pred))

print('R2 Train: ',r2_train,end='    ')
print('R2 Test: ', r2_test)
print('RMSE Train: ',rmse_train,end='    ')
print('RMSE Test: ',rmse_test)

In [None]:
kf = KFold(shuffle=True,n_splits=5,random_state=0)
score3 = cross_val_score(RF,xtrain,ytrain,cv=kf,scoring='r2')
RF_be = np.mean(1-score3)
RF_ve = np.std(score3,ddof=1)
print('Bias Error: ',RF_be)
print('Variance Error: ',RF_ve)

In [None]:
from xgboost import XGBRegressor

In [None]:
XGB = XGBRegressor(random_state=0)
XGB.fit(xtrain,ytrain)

In [None]:
XGB.score(xtrain,ytrain)

In [None]:
XGB.score(xtest,ytest)

In [None]:
ytrain_pred = XGB.predict(xtrain)
ytest_pred = XGB.predict(xtest)

r2_train = r2_score(ytrain,ytrain_pred)
r2_test = r2_score(ytest,ytest_pred)

rmse_train = np.sqrt(mean_squared_error(ytrain,ytrain_pred))
rmse_test = np.sqrt(mean_squared_error(ytest,ytest_pred))

print('R2 Train: ',r2_train,end='    ')
print('R2 Test: ', r2_test)
print('RMSE Train: ',rmse_train,end='    ')
print('RMSE Test: ',rmse_test)

In [None]:
# Applying Regularization

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [None]:
# Searching for the best parameter.
rid = Ridge()
param = {'alpha':[0.0001,0.00,0.01,0.1,0.5,1,2,5,10]}
GS = GridSearchCV(rid,param,cv=5, scoring='neg_mean_squared_error')
model1 = GS.fit(xtrain,ytrain)

In [None]:
model1.best_params_

In [None]:
rid = Ridge(alpha=5)
rid.fit(xtrain,ytrain)

In [None]:
ytrain_pred = rid.predict(xtrain)
ytest_pred = rid.predict(xtest)

r2_train = r2_score(ytrain,ytrain_pred)
r2_test = r2_score(ytest,ytest_pred)

rmse_train = np.sqrt(mean_squared_error(ytrain,ytrain_pred))
rmse_test = np.sqrt(mean_squared_error(ytest,ytest_pred))

print('R2 Train: ',r2_train,end='    ')
print('R2 Test: ', r2_test)
print('RMSE Train: ',rmse_train,end='    ')
print('RMSE Test: ',rmse_test)

In [None]:
# No improvement at all.