# Packages 

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
sns.set(color_codes = True)
import matplotlib.pyplot as plt
%matplotlib inline 
import warnings 
warnings.filterwarnings("ignore")

# EDA || Deliverable 1

In [None]:
data = pd.read_csv("concrete.csv")

In [None]:
print(data.head(10))
data.info()
data.isna().sum() 

# Observation and inference:

# All numerical variables
# No missing values in the data
# Zeros are cases where the quantity of that ingredient is actually zero

In [None]:
# 5 point summary statistics  
data.describe().T

In [None]:
# observations:
"""
cement > no outlier as max is under 75th percentle+ 1.5 IQR
slag > high variance and seems outliers 
Ash >  high variance
water > seems outliers  
superplastic > high variance and seems outliers 
coarseagg > seems like a normal distribution
fineagg > normal like distribution but outliers 
Age > high variance and outliers 
Strength > normal distribution 
"""

In [None]:
# checking above observations 
# outliers in data 
data.plot(kind="box", figsize =[15,8])

In [None]:
# Except cement, ash and coarseagg, all variables seem to have outliers, as observed initially.
# Count the outliers of every column with the 1.5 * IQR whisker rule.
q1 = data.quantile(0.25)
q3 = data.quantile(0.75)
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
# Comparing a DataFrame with a Series aligns the Series on the column labels,
# so each column is tested against its own fences.
is_outlier = (data < lower_fence) | (data > upper_fence)
out = is_outlier.sum(axis=0)
out_df = out.to_frame("No of outliers")
out_df["Percentage Outliers"] = round(out_df["No of outliers"] * 100 / len(data), 2)
# This total used to be computed and thrown away (only the last expression of a
# cell is displayed). It is the sum of the per-column percentages (author's run:
# 8.64), not the share of rows that contain an outlier.
print("Sum of per-column outlier percentages:", out_df["Percentage Outliers"].sum())
out_df


### Univariate Analysis

In [None]:
#Density distribution of variables 
plt.subplots(figsize = [18,8])
ax= sns.kdeplot(data= data)

In [None]:
from scipy.stats import zscore

# Standardise every column so all distributions share a comparable x scale.
col_names = data.columns
scaled_data = data.apply(zscore)

# One panel per variable: histogram + KDE + rug, with mean (green) and median (red) lines.
# histplot/rugplot replace the deprecated sns.distplot(..., rug=True); the axes
# array keeps its own name instead of being shadowed by the loop variable.
f, axes = plt.subplots(nrows=3, ncols=3, figsize=(22, 12))
for column, axis in zip(scaled_data.columns, axes.flatten()):
    values = scaled_data[column]
    sns.histplot(x=values, kde=True, stat="density", ax=axis, label=column)
    sns.rugplot(x=values, ax=axis)
    axis.axvline(x=values.mean(), color="green")
    axis.axvline(x=values.median(), color="red")
    axis.text(x=values.mean(), y=0.5, s="Mean-Green \n Median-Red")
    

In [None]:
# observations :

"""
cement > near normal distribution, mean ahead of median to right side,right skewed. 

Slag> multi modal distribution, mean is ahead of median to right side,right skewed  .

Ash > multimode distribution, highly right skeweed  and no outliers as we already oserved. 

Water > not a normal distribution, seems left skewed. 

Superplastic> multi gaussian distribution with one high peaked and one low,outliers and skewness. 

coarseagg > not a normal distribution , mean and median coincide could be slightly skewed. 

fineagg > near normal distiribution,mean and median almost equal.

Age > Outliers present in the variable and distribution is right skewed. 

Strength>  seems to be normally distributed 
"""
# Inference : 
"""
slag, Ash and superplastic seems multi gausians, will perform clustering for further analysis,suspect 2-6 clusters 

there is skewness and outlier in the predictor variables.
"""

In [None]:
# checking the skewness based on above observation 
data.skew()
# Age is highly right skewed.
# slag and superplastic are right skewed. 
# cement, Ash and strenght are slightly right skewed 
# fine and coarseagg are slightly left skewed 
# expected water to be left skewed however it is slightly right skewed. 

In [None]:
# Pair plot of the standardised variables, used to eyeball dependency / correlation
# among the predictors (KDE on the diagonal).
from scipy.stats import zscore

standardised = data.apply(zscore)
scaled_data = pd.DataFrame(data=standardised, columns=list(data.columns))
sns.pairplot(scaled_data, diag_kind="kde")

In [None]:
# observations: 
"""
cement is distributed in groups across age.
cement seems to be in an independent/non linear relationship wiht : fineagg, coarseagg, water,superplastic, ash, slag  
cement seems to have strong correlation with strength.

slag is distributed in groups across age.
Slag seems to be divided into groups for each variable , one group is linear and the other one is a cloud
slag vs strength > is a could, could be a weak prodictor 

Ash seems to be divided in groups for each variable, one group is linear where as other is a cloud
Ash vs Strength > 2 groups for strength one is lienar and other is a cloud

coarseagg and fineagg vs strength is a cloud, could be weak predictors 

water seems to be in an range bound cloud disribution with all variables 
water vs Strength > seems to be range bound (spread across a particular range) 

superplastic distribution is divided into groups, one group is constant values across all variable while the other one is a cloud
superplastic vs strength > superplastic seems to be positively correlated with strength 

All variables are distributed in age groups 
Age vs strength > strength is maximum for a particular age range 

# Target Variable 

Strength is more for a lower age range.
slag and water coarse and fineagg are forming a cloud with strength so does 
Age and ash is dirtibuted in groups for different strength 
"""

In [None]:
# Following the observations above: correlation matrix of the raw data,
# drawn as an annotated heatmap.
corr = data.corr()
fig_corr, ax_corr = plt.subplots(figsize=(12, 7))
sns.heatmap(corr, annot=True, cmap="YlOrRd")

In [None]:
# Rank the pairwise correlations. Only the strict upper triangle of the matrix is
# kept: this removes the diagonal (self-correlation) without relying on an exact
# float comparison with 1, and it removes the mirrored duplicate of every pair so
# head/tail show 10 distinct pairs.
upper_only = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr_sorted = (
    corr.where(upper_only)
    .stack()
    .dropna()
    .sort_values(kind="quicksort", ascending=False)
)
print(corr_sorted.head(10))
print(corr_sorted.tail(10))

# Strong Negative correlation between superplastic and water
# strong positive correlation between strength and cement

In [None]:
"""
up till now we see strength is correlated with cement, cement seems to be a strog predictor.  
No evidence yet but Age seems to be strong factor in concrete strength.
superplastic and water are negatively correlated. 
slag, Ash and superplastic are multi gaussians.

there is skewness and outlier in the predictor variables.

Outlier Treatment > will test imputing outliers with mean , median and (10,90||25,75) percentile combination.
"""

### Multi variate analysis 

In [None]:
# strength vs Age, water and cement
# 2-D view: colour encodes water, marker size encodes age.
fig_2d, ax_2d = plt.subplots(figsize=[20, 12])
sns.scatterplot(
    x=data.cement,
    y=data.strength,
    hue=data.water,
    size=data.age,
    sizes=(100, 500),
    palette="rocket_r",
)
# 3-D view: water on the z axis, colour encodes age.
fig = plt.figure(figsize=[20, 12])
ax = plt.axes(projection="3d")
ax.scatter3D(xs=data.cement, ys=data.strength, zs=data.water, c=data.age, cmap="rocket_r")
ax.set(xlabel="cement", ylabel="strength", zlabel='water')
ax.view_init(elev=15, azim=120)

# Samples with lower water quantity have higher strength. 
# All high water samples have a restricted strength to  50
# All high strength samples are of lower age than 80  
# majority of the data lies in mean+-2 std i.e strength between 3 to 67 Mpa
# as the water is increasing the strength is decreasing and maximum stregth is from samples having less water 
# very few samples with high strength, and all these smaples are below  age 80 
# all samples above age 160 days are below strength 60. 
# cement quantity for maximum strength lies in the range 200 -500 

In [None]:
# strength vs superplastic and Age
# 2-D view: both colour and marker size encode age.
fig_2d, ax_2d = plt.subplots(figsize=(20, 12))
sns.scatterplot(
    x=data.superplastic,
    y=data.strength,
    hue=data.age,
    size=data.age,
    sizes=(100, 500),
    palette="rocket_r",
);
# 3-D view: age on the z axis and as the colour.
fig = plt.figure(figsize=[20, 12])
ax = plt.axes(projection="3d")
ax.scatter3D(xs=data.superplastic, ys=data.strength, zs=data.age, c=data.age, cmap="rocket_r")
ax.set(xlabel="Superplastic", ylabel="strength", zlabel='Age')
ax.view_init(elev=15, azim=120)

# for one group superplastic is contanst for different age and strength rises linearly 
# for other group higher strength is for age less than 80 and superplastic in range 5-15
# superplastic seems not to be a good predictor

In [None]:
# strength vs slag and age
# 2-D view: both colour and marker size encode age.
fig_2d, ax_2d = plt.subplots(figsize=(20, 12))
sns.scatterplot(
    x=data.slag,
    y=data.strength,
    hue=data.age,
    size=data.age,
    sizes=(100, 500),
    palette="rocket_r",
);

# 3-D view: age on the z axis and as the colour.
fig = plt.figure(figsize=[20, 12])
ax = plt.axes(projection="3d")
ax.scatter3D(xs=data.slag, ys=data.strength, zs=data.age, c=data.age, cmap="rocket_r")
ax.set(xlabel="slag", ylabel="strength", zlabel='Age')
ax.view_init(elev=15, azim=120)

# As slag is multiguassian, one group is constant and in the other group 
# higher strength samples have less age, i.e. below 80 and higher age samples have strength restricted to 60
# does not seems to be strong predictor 

In [None]:
# strength vs Age and Ash
# 3-D scatter: ash on the z axis, colour encodes age.
fig = plt.figure(figsize=[20, 12])
ax = plt.axes(projection="3d")
ax.scatter3D(xs=data.age, ys=data.strength, zs=data.ash, c=data.age, cmap="rocket_r")
ax.set(xlabel="Age", ylabel="strength", zlabel='Ash')
ax.view_init(elev=15, azim=120)

# Data is divided in groups: 
# higher strength samples have less age, i.e. below 80 and higher age samples have strength restricted to 60
# does not seems to be strong predictor 

In [None]:
# strength vs Age and water
# 3-D scatter: water on the z axis, colour encodes age.
# The figure height was 110 inches, which looks like a typo for 10: the next
# 3-D plot in this notebook uses (15, 10).
fig = plt.figure(figsize=[15, 10])
ax = plt.axes(projection="3d")
ax.scatter3D(data.age, data.strength, data.water, c=data.age, cmap="rocket_r")
ax.set_xlabel("Age")
ax.set_ylabel("Strength")
ax.set_zlabel("Water")
ax.view_init(15, 100)


In [None]:
# strength vs superplastic, age and water
# 3-D scatter: superplastic on the z axis, colour encodes age.
fig = plt.figure(figsize=(15, 10))
ax = plt.axes(projection="3d")
ax.scatter3D(data.water, data.strength, data.superplastic, c=data.age, cmap="PuBu_r")
ax.set_xlabel('water')
ax.set_ylabel('strength')
ax.set_zlabel("superplastic")
ax.view_init(10, 120)
# `plt.show` without parentheses only referenced the function; it must be called.
plt.show()

### Outlier Treatment method testing

In [None]:
# making a copy of the data to test the best method for outlier treatment 
data2 = data.copy()

In [None]:
# outlier treatment / imputation
# choosing Age first as it has the most outliers

# Whisker limits (1.5 * IQR rule), computed once from the untouched column.
age_q1 = data2.age.quantile(0.25)
age_q3 = data2.age.quantile(0.75)
iqr = age_q3 - age_q1
upp_whs = age_q3 + 1.5*iqr
low_whs = age_q1 - 1.5*iqr

# Float array of the column: mean/median fill values are stored exactly rather
# than being truncated if age is an integer column, and data2 is never modified
# because every treatment writes to its own copy.
age_values = data2.age.to_numpy(dtype=float)
upp_whs_dp = np.where(age_values > upp_whs)
low_whs_dp = np.where(age_values < low_whs)


def _impute_age_outliers(upper_fill, lower_fill):
    """Return a copy of the age values with upper/lower outliers replaced by the given fills."""
    treated = age_values.copy()
    treated[upp_whs_dp] = upper_fill
    treated[low_whs_dp] = lower_fill
    return treated


# Fill values are always taken from the raw column, before anything is replaced.
# Iteration 1 : imputing upper / lower outliers with the 75th / 25th percentile
Age_1 = _impute_age_outliers(age_q3, age_q1)
# Iteration 2 : imputing with mean
Age_2 = _impute_age_outliers(age_values.mean(), age_values.mean())
# Iteration 3 : imputing with median
Age_3 = _impute_age_outliers(np.median(age_values), np.median(age_values))

# Distribution plot of the variable: one row per treatment, columns are KDE / box / histogram.
# Green line = mean, red line = median, orange line = zero.
# Row titles replace the former `legend="..."` argument and the bare plt.legend()
# call, which had no labelled artists to show.
f, axes = plt.subplots(nrows=4, ncols=3, figsize=[17, 14])
treatments = [
    ("Raw data", data.age),
    ("Quantile (75, 25) imputing", Age_1),
    ("Mean imputing", Age_2),
    ("Median imputing", Age_3),
]
for (kde_ax, box_ax, hist_ax), (title, values) in zip(axes, treatments):
    sns.kdeplot(values, ax=kde_ax)
    sns.boxplot(values, ax=box_ax)
    sns.histplot(values, ax=hist_ax)
    for marked_ax in (kde_ax, hist_ax):
        marked_ax.axvline(x=np.mean(values), color="green")
        marked_ax.axvline(x=np.median(values), color="red")
        marked_ax.axvline(x=0, color="orange")
    kde_ax.set_title(title)

plt.tight_layout()

# Inferences

# Iteration 1 : tried replacing upper outliers with the 75th percentile and lower outliers with the 25th percentile; multiple modes are generated, however
# outliers are treated by this particular method for Age.
# Iteration 2 : replacing with the mean value increased the outliers; it required a second iteration and
# is creating more outliers and peaking the distribution.
# Iteration 3 : similar behaviour to that of iteration 2

#Below Plots 
# row 1: raw data distribution 
# row 2: Distribution after Imputing outlier values with quantile values (75 and 25) for upper and lower outliers respectively
# row 3: Distribution after imputing outlier values with mean value (2 iterations performed to eliminate the outlier)
# row 4: Distribution after imputing outlier with median value 


In [None]:
# superplastic

# Whisker limits (1.5 * IQR rule) from the untouched column; both iterations use them.
raw_superplastic = data2.superplastic
sp_q1 = raw_superplastic.quantile(0.25)
sp_q3 = raw_superplastic.quantile(0.75)
iqr = sp_q3 - sp_q1
upp_wsh = sp_q3 + 1.5*iqr
low_wsh = sp_q1 - 1.5*iqr

# Iteration 1: imputing with mean 9.2 , median 9.4
print("standard deviation raw", raw_superplastic.std())
print("upper whisker value superplastic", upp_wsh)
print("lower whisker value superplastic", low_wsh)
superplastic = np.array(raw_superplastic, dtype=float)  # independent copy, data2 stays untouched
upp_wsh_dp = np.where(superplastic > upp_wsh)
print("upper whisker data points", superplastic[upp_wsh_dp].shape)
low_wsh_dp = np.where(superplastic < low_wsh)
print("lower whisker data points", superplastic[low_wsh_dp].shape)

# imputation
mean_fill = superplastic.mean()  # taken before any value is replaced
superplastic[upp_wsh_dp] = mean_fill
# Was indexed with `low_whs_dp`, a leftover from the age cell, so the wrong rows
# (or none) were imputed.
superplastic[low_wsh_dp] = mean_fill
# ddof=1 matches pandas' Series.std used for the raw value, so the figures are comparable.
print("standard deviation after iteration 1 ", superplastic.std(ddof=1))


# Iteration 2: imputing with 75th percentile and 25th percentile
print("standard deviation raw", raw_superplastic.std())
print("upper whisker value superplastic_quant", upp_wsh)
print("lower whisker value superplastic_quant", low_wsh)
superplastic_quant = np.array(raw_superplastic, dtype=float)
upp_wsh_dp = np.where(superplastic_quant > upp_wsh)
print("upper whisker data points", superplastic_quant[upp_wsh_dp].shape)
low_wsh_dp = np.where(superplastic_quant < low_wsh)
print("lower whisker data points", superplastic_quant[low_wsh_dp].shape)

# imputation
print("75th percentile superplastic", sp_q3)
print("25th percentile superplastic", sp_q1)
superplastic_quant[upp_wsh_dp] = sp_q3
superplastic_quant[low_wsh_dp] = sp_q1
print("standard deviation after Iteration 2 ", superplastic_quant.std(ddof=1))

# One row per variant: KDE (red = mean, green = median) next to a box plot.
# Captions are set on the row's own box-plot axes; plt.xlabel only labelled the
# last axes created, so the first caption was overwritten by the second.
f, axes = plt.subplots(nrows=3, ncols=2, figsize=[17, 15])
panels = [
    (data.superplastic, None),
    (superplastic, "Superplalstic after treating outliers iteration 1"),
    (superplastic_quant, "Superplalstic after treating outliers iteration 2 "),
]
for (kde_ax, box_ax), (values, caption) in zip(axes, panels):
    sns.kdeplot(values, ax=kde_ax)
    kde_ax.axvline(x=np.mean(values), color="red")
    kde_ax.axvline(x=np.median(values), color="green")
    sns.boxplot(values, ax=box_ax)
    if caption is not None:
        box_ax.set_xlabel(caption)
plt.show()


# outliers are removed, by both the methods 
# Standard deviation is reduced when imputed, however its less reduced with quantile imputing  

In [None]:
# water outlier treatment
water_q1 = data.water.quantile(0.25)
water_q3 = data.water.quantile(0.75)
water_iqr = water_q3 - water_q1
upper_whisker = water_q3 + 1.5*water_iqr
lower_whisker = water_q1 - 1.5*water_iqr
outlier_up = data.water.loc[data.water > upper_whisker]
outlier_low = data.water.loc[data.water < lower_whisker]
print("upper whisker value", upper_whisker)
print("lower whisker value", lower_whisker)
print("Upper outliers water", outlier_up)
print("lower outliers water", outlier_low)
print("std raw distribution", data.water.std())

# Every iteration works on its own copy of the column. Binding `data2.water`
# directly made water_1 / water_2 / water_3 one and the same object, so each
# iteration overwrote the previous one (and data2) and the three "after" plots
# were identical. Fill values are computed before any value is replaced.

# Iteration 1: imputing with 75th and 25th quantile
water_1 = data2.water.copy()
upper_fill, lower_fill = water_1.quantile(0.75), water_1.quantile(0.25)
water_1.loc[outlier_up.index] = upper_fill
water_1.loc[outlier_low.index] = lower_fill
print("std iteration 1", water_1.std())

# Iteration 2: imputing with mean
water_2 = data2.water.copy()
mean_fill = water_2.mean()
water_2.loc[outlier_up.index] = mean_fill
water_2.loc[outlier_low.index] = mean_fill
print("std iteration 2", water_2.std())

# Iteration 3: imputing with median
water_3 = data2.water.copy()
median_fill = water_3.median()
water_3.loc[outlier_up.index] = median_fill
water_3.loc[outlier_low.index] = median_fill
print("std iteration 3", water_3.std())

# Rows: raw data, quantile imputing, mean imputing, median imputing.
# Left: KDE with mean (red) and median (green); right: box plot.
f, axes = plt.subplots(nrows=4, ncols=2, figsize=[25, 15])
for (kde_ax, box_ax), series in zip(axes, (data.water, water_1, water_2, water_3)):
    sns.kdeplot(x=series, ax=kde_ax)
    kde_ax.axvline(x=series.mean(), color="red")
    kde_ax.axvline(x=series.median(), color="green")
    sns.boxplot(series, ax=box_ax)


# distribution after outlier treatment does not differ from original raw data distribution 
# outliers are treated with both the method, however std is reduced less with quantile imputing 

In [None]:
# slag outlier treatment
slag_q1 = data.slag.quantile(0.25)
slag_q3 = data.slag.quantile(0.75)
slag_iqr = slag_q3 - slag_q1
upper_whisker = slag_q3 + 1.5*slag_iqr
lower_whisker = slag_q1 - 1.5*slag_iqr
outlier_up = data.slag.loc[data.slag > upper_whisker]
outlier_low = data.slag.loc[data.slag < lower_whisker]
print("upper whisker value", upper_whisker)
print("lower whisker value", lower_whisker)
print("Upper outliers slag", outlier_up)
print("lower outliers slag", outlier_low)
print("std raw distribution", data.slag.std())

# Every iteration works on its own copy of the column. Binding `data2.slag`
# directly made slag_1 / slag_2 / slag_3 one and the same object, so each
# iteration overwrote the previous one (and data2) and the three "after" plots
# were identical. Fill values are computed before any value is replaced.

# Iteration 1: imputing with 75th and 25th quantile
slag_1 = data2.slag.copy()
upper_fill, lower_fill = slag_1.quantile(0.75), slag_1.quantile(0.25)
slag_1.loc[outlier_up.index] = upper_fill
slag_1.loc[outlier_low.index] = lower_fill
print("std iteration 1", slag_1.std())

# Iteration 2: imputing with mean
slag_2 = data2.slag.copy()
mean_fill = slag_2.mean()
slag_2.loc[outlier_up.index] = mean_fill
slag_2.loc[outlier_low.index] = mean_fill
print("std iteration 2", slag_2.std())

# Iteration 3: imputing with median
slag_3 = data2.slag.copy()
median_fill = slag_3.median()
slag_3.loc[outlier_up.index] = median_fill
slag_3.loc[outlier_low.index] = median_fill
print("std iteration 3", slag_3.std())

# Rows: raw data, quantile imputing, mean imputing, median imputing.
# Left: KDE with mean (red) and median (green); right: box plot.
f, axes = plt.subplots(nrows=4, ncols=2, figsize=[25, 15])
for (kde_ax, box_ax), series in zip(axes, (data.slag, slag_1, slag_2, slag_3)):
    sns.kdeplot(x=series, ax=kde_ax)
    kde_ax.axvline(x=series.mean(), color="red")
    kde_ax.axvline(x=series.median(), color="green")
    sns.boxplot(series, ax=box_ax)


In [None]:
# fineagg
# Whisker limits by the 1.5 * IQR rule, then the rows that fall outside them.
fine_q1 = data.fineagg.quantile(0.25)
fine_q3 = data.fineagg.quantile(0.75)
fine_upper = fine_q3 + 1.5*(fine_q3 - fine_q1)
fine_lower = fine_q1 - 1.5*(fine_q3 - fine_q1)
outlier_up = data.fineagg.loc[data.fineagg > fine_upper]
outlier_low = data.fineagg.loc[data.fineagg < fine_lower]
print("upper whisker value", fine_upper)
print("lower whisker value", fine_lower)
print("Upper outliers fineagg", outlier_up)
print("lower outliers fineagg", outlier_low)
print("std raw distribution", data.fineagg.std())
sns.boxplot(data.fineagg)

In [None]:
data.fineagg.describe()

In [None]:
# Every iteration works on its own copy of the column. Binding `data2.fineagg`
# directly made fineagg_1 / fineagg_2 / fineagg_3 one and the same object, so
# each iteration overwrote the previous one (and data2). Fill values are
# computed before any value is replaced.

# Iteration 1: imputing with 90th and 10th quantile
# (the header used to say 75th / 25th, but the code uses 0.90 / 0.10)
fineagg_1 = data2.fineagg.copy()
upper_fill, lower_fill = fineagg_1.quantile(0.90), fineagg_1.quantile(0.10)
fineagg_1.loc[outlier_up.index] = upper_fill
fineagg_1.loc[outlier_low.index] = lower_fill
print("std iteration 1", fineagg_1.std())

# Iteration 2: imputing with mean
fineagg_2 = data2.fineagg.copy()
mean_fill = fineagg_2.mean()
fineagg_2.loc[outlier_up.index] = mean_fill
fineagg_2.loc[outlier_low.index] = mean_fill
print("std iteration 2", fineagg_2.std())

# Iteration 3: imputing with median
fineagg_3 = data2.fineagg.copy()
median_fill = fineagg_3.median()
fineagg_3.loc[outlier_up.index] = median_fill
fineagg_3.loc[outlier_low.index] = median_fill
print("std iteration 3", fineagg_3.std())

In [None]:
# fineagg > normal distribution but outliers
# Rows: raw data, then iterations 1-3 of the outlier treatment.
# Left: KDE with mean (red) and median (green); right: box plot.
f, axes = plt.subplots(nrows=4, ncols=2, figsize=[25, 15])
(ax1, ax2), treated_rows = axes[0], axes[1:]

# raw distribution
sns.kdeplot(x=data.fineagg, ax=ax1)
ax1.axvline(x=data.fineagg.mean(), color="red")
ax1.axvline(x=data.fineagg.median(), color="green")
sns.boxplot(data.fineagg, ax=ax2)

# distributions after iteration 1, iteration 2 (mean) and iteration 3 (median)
for (kde_ax, box_ax), treated in zip(treated_rows, (fineagg_1, fineagg_2, fineagg_3)):
    sns.kdeplot(treated, ax=kde_ax)
    kde_ax.axvline(x=treated.mean(), color="red")
    kde_ax.axvline(x=treated.median(), color="green")
    sns.boxplot(treated, ax=box_ax)

# when imputing with quantile and mean/median, more outliers are generated i.e. the new outliers are previous minimum data points 
# as with imputation std is reduced hence new points are outliers now.

In [None]:
# strength
strength_q1 = data.strength.quantile(0.25)
strength_q3 = data.strength.quantile(0.75)
strength_iqr = strength_q3 - strength_q1
upper_whisker = strength_q3 + 1.5*strength_iqr
lower_whisker = strength_q1 - 1.5*strength_iqr
outlier_up = data.strength.loc[data.strength > upper_whisker]
outlier_low = data.strength.loc[data.strength < lower_whisker]
print("upper whisker value", upper_whisker)
print("lower whisker value", lower_whisker)
print("Upper outliers strength", outlier_up)
print("lower outliers strength", outlier_low)
print("std raw distribution", data.strength.std())

# Every iteration works on its own copy of the column. Binding `data2.strength`
# directly made strength_1 / strength_2 / strength_3 one and the same object, so
# each iteration overwrote the previous one (and data2) and the three "after"
# plots were identical. Fill values are computed before any value is replaced.

# Iteration 1: imputing with 75th and 25th quantile
strength_1 = data2.strength.copy()
upper_fill, lower_fill = strength_1.quantile(0.75), strength_1.quantile(0.25)
strength_1.loc[outlier_up.index] = upper_fill
strength_1.loc[outlier_low.index] = lower_fill
print("std iteration 1", strength_1.std())

# Iteration 2: imputing with mean
strength_2 = data2.strength.copy()
mean_fill = strength_2.mean()
strength_2.loc[outlier_up.index] = mean_fill
strength_2.loc[outlier_low.index] = mean_fill
print("std iteration 2", strength_2.std())

# Iteration 3: imputing with median
strength_3 = data2.strength.copy()
median_fill = strength_3.median()
strength_3.loc[outlier_up.index] = median_fill
strength_3.loc[outlier_low.index] = median_fill
print("std iteration 3", strength_3.std())

# Rows: raw data, quantile imputing, mean imputing, median imputing.
# Left: KDE with mean (red) and median (green); right: box plot.
f, axes = plt.subplots(nrows=4, ncols=2, figsize=[25, 15])
for (kde_ax, box_ax), series in zip(axes, (data.strength, strength_1, strength_2, strength_3)):
    sns.kdeplot(x=series, ax=kde_ax)
    kde_ax.axvline(x=series.mean(), color="red")
    kde_ax.axvline(x=series.median(), color="green")
    sns.boxplot(series, ax=box_ax)

# new data points are emerging out of with both the types of imputation 

In [None]:
# Methods are working for different variables as listed below 
'''
Age > qunatile imputing worked, while mean and median generates more outliers 

water >  both methods worked

superplastic > both method worked 

slag > both  method worked 

fineagg > with both method lower outlier is genrated and these new outlier value are closer to mean hence not imputing further  

target variabele

strength > not all outliers are eliminated by both the methods with new data points becoming outliers now.  

choosing 25th and 75th percentile for imputation. 

'''

In [None]:
# outlier impuation 
data3 = data.copy()
data3.head()

In [None]:
data3.describe().T

In [None]:
data3.plot(kind="box",figsize=(12,8))

In [None]:
# Imputing the upper and lower outliers of every column (1.5 * IQR rule) with
# the column's 75th and 25th percentile respectively.
col_names = list(data3.columns)
for column in col_names:
    q1 = data3[column].quantile(0.25)
    q3 = data3[column].quantile(0.75)
    iqr = q3 - q1
    # Both masks are built before anything is written, so the lower test still
    # sees the original values.
    above_upper = data3[column] > (q3 + 1.5*iqr)
    below_lower = data3[column] < (q1 - 1.5*iqr)
    # A single .loc write on the frame replaces the chained `data3[i][idx] = ...`,
    # which may write to a temporary copy and does nothing under copy-on-write.
    # An all-False mask is simply a no-op, so no length guard is needed.
    data3.loc[above_upper, column] = q3
    data3.loc[below_lower, column] = q1

In [None]:
data3.plot(kind="box", figsize=(12,8))

In [None]:
# One box plot per column of the treated data, laid out on a 3 x 3 grid.
col_names = data3.columns
f, grid = plt.subplots(nrows=3, ncols=3, figsize=(22, 12))
for column, panel in zip(col_names, grid.flatten()):
    sns.boxplot(data3[column], ax=panel)

In [None]:
# all outliers seem treated, distribution after treatment aldready discussed above 


# Feature Engineering || Deliverable 2 

# References

https://link.springer.com/referenceworkentry/10.1007%2F978-1-4419-0851-3_121

https://buildingresearch.com.np/services/ct/ct2.php

https://theconstructor.org/concrete/compressive-strength-concrete-cube-test/1561/

https://en.wikipedia.org/wiki/Compressive_strength
    

### understanding after EDA : 

Strong strength predictors > cement and age seem to be strong predictors.

cement >>> cement is a strong predictor; the cement quantity for maximum strength lies in the range 200-500.

water >>> samples with lower water quantity have higher strength; all high-water samples have strength restricted to 50.

Age >>> higher strength occurs for age less than 80; there are very few samples with high strength, and all these samples are below age 80.
All samples above age 160 days are below strength 60.



coarseagg, fineagg, superplastic, slag, ash

coarseagg and fineagg vs strength is a cloud; they could be weak predictors.

slag, ash and superplastic are multi-Gaussian.

slag vs strength > is a cloud; slag could be a weak predictor.

Ash vs Strength > 2 groups for strength: one is linear and the other is a cloud.
### -----------------------------------------------------------------------------------------------------------------------------

Compressive strength of cement: the ability of the material to resist being pushed together (compressed).

According to the references above, the compressive strength of cement depends on the following factors:

water / cement ratio

cement / sand ratio

Type and grading of sand

manner of mixing

size and shape of specimen

Age of the specimen / sample; cement gains strength over time (we already saw this above)


### Addition of features 

Based on the above factors, creating 2 more features from the given data, i.e.

the water/cement ratio and the cement-to-sand ratio, which in our data set is cement/fineagg.


Strength is inversely proportional to the water/cement ratio: the lower the ratio, the higher the strength, i.e. more cement

with an optimal quantity of water leads to better strength.

The cement/fineagg ratio while preparing the concrete should be 1:2;

we can check whether there is any deviation from this usual practice.

In [None]:
# Engineered features: water-to-cement and cement-to-fine-aggregate ratios.
data3["w_c_ratio"] = data3["water"] / data3["cement"]
data3['c_f_ratio'] = data3["cement"] / data3["fineagg"]
data3.head()

In [None]:
# Box plots of the two engineered ratios, side by side (w/c left, c/f right).
plt.subplot(1, 2, 1)
data3['w_c_ratio'].plot.box()
plt.subplot(1, 2, 2)
data3['c_f_ratio'].plot.box()
# Both new features seem to have outliers; the c/f ratio has more outliers than the w/c ratio.

In [None]:
# Distribution (histogram + KDE) of the two engineered ratios, side by side.
# Both panels are created in one plt.subplots call; calling plt.subplot
# afterwards drew new axes on top of the one plt.subplots had already created.
# histplot replaces the deprecated sns.distplot.
f, (ax_wc, ax_cf) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
sns.histplot(x=data3["w_c_ratio"], kde=True, stat="density", ax=ax_wc)
sns.histplot(x=data3['c_f_ratio'], kde=True, stat="density", ax=ax_cf)
plt.show()

In [None]:
# Summary statistics of the cement / fine-aggregate ratio. Printed explicitly:
# only the last expression of a cell is displayed, so the bare describe() call
# produced no output.
print(data3['c_f_ratio'].describe())
# Author's run: the feature ranges from 0.07 to 0.67 with a mean of 0.37,
# i.e. on average cement : sand is roughly 1 : 3, whereas the usual mix is 1 : 2 (0.50).
# c_f_ratio = cement / fineagg, so a ratio ABOVE 0.50 means more cement than the
# usual mix (author's run: 182 such cases); the remaining cases, below 0.50,
# use less cement relative to sand.
data3['c_f_ratio'].loc[(data3['c_f_ratio'] > 0.50)]

In [None]:
# data > original data 
# data2> Analysing the data 
# data3> all existing variables (w/o outlier)+ 2 new added features (w outlier)
# for treating outlier of the 2 newly added feature
data4 = data3.copy() 

In [None]:
# Upper outliers (1.5 * IQR rule) of the two engineered features.
# Only the upper whisker is checked for either feature.
w_c = data4['w_c_ratio']
q1 = w_c.quantile(0.25)
q3 = w_c.quantile(0.75)
iqr = q3 - q1
out_w_c = w_c.loc[w_c > q3 + 1.5*iqr]  # 16 outliers

c_f = data4['c_f_ratio']
q1 = c_f.quantile(0.25)
q3 = c_f.quantile(0.75)
iqr = q3 - q1
out_c_f = c_f.loc[c_f > q3 + 1.5*iqr]  # 32 outliers

In [None]:
# Imputing the upper outliers of both engineered features with the 75th percentile.
# A single label-based .loc write on the frame replaces the chained
# `data4[col].iloc[labels] = ...`, which used index labels as positions and may
# write to a temporary copy (it does nothing under copy-on-write).
data4.loc[out_c_f.index, 'c_f_ratio'] = data4['c_f_ratio'].quantile(0.75)
data4.loc[out_w_c.index, 'w_c_ratio'] = data4['w_c_ratio'].quantile(0.75)

In [None]:
# Treated water/cement ratio: box plot (left) and density (right).
treated_w_c = data4['w_c_ratio']
plt.subplot(1, 2, 1)
treated_w_c.plot.box()
plt.subplot(1, 2, 2)
treated_w_c.plot.kde()

In [None]:
# Treated cement/fine-aggregate ratio: box plot (left) and density (right).
treated_c_f = data4['c_f_ratio']
plt.subplot(1, 2, 1)
treated_c_f.plot.box()
plt.subplot(1, 2, 2)
treated_c_f.plot.kde()
 

In [None]:
# One box plot per column of data4 (original variables plus the two ratios), on a 3 x 4 grid.
col_names = data4.columns
f, grid = plt.subplots(nrows=3, ncols=4, figsize=(22, 12))
for column, panel in zip(col_names, grid.flatten()):
    sns.boxplot(data4[column], ax=panel)

##### checking the important components, leveraging PCA. 

In [None]:
# Standardise every column of data4, separate the predictors from the target
# and display the covariance matrix of the standardised predictors.
from scipy.stats import zscore
data_pca = data4.apply(zscore)
x = data_pca.drop("strength",axis=1)
cov_mat = np.cov(x,rowvar=False)
cov_mat

In [None]:
from sklearn.decomposition import PCA
# chossing 10 principal components intially 
pca = PCA(n_components=10)
pca.fit(x)
#eigen values magnituge / length of eign vectors 
print("Eigne values:\n", pca.explained_variance_)
# eigen vectors direction
print("Eigen vectors:\n",pca.components_)

In [None]:
# percentage of variance explained by each eigen vector 
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.ylabel("cumulative variation explained")
plt.xlabel("Eigen Value")

In [None]:
print("percentage of variance explained by each eigen vector\n",pca.explained_variance_ratio_)
# Derive the component numbers from the fitted PCA instead of hard-coding 1..10,
# so the plot stays valid if n_components changes.
component_ids = list(range(1, len(pca.explained_variance_ratio_) + 1))
plt.bar(component_ids,pca.explained_variance_ratio_,alpha=0.5)
plt.ylabel("Variation explained")
plt.xlabel("Eigen Value")

In [None]:
# Cumulative share of variance explained after each principal component.
cum_var_exp = np.cumsum(pca.explained_variance_ratio_)
# Index derived from the result length rather than the hard-coded range(1,11).
pd.DataFrame(cum_var_exp,index=range(1,len(cum_var_exp)+1),columns=['Cumulative variance explained'])

In [None]:
# from above plots its conclusive that 6 principal components explains more than 95 percent of the variation.
# proceeding with 6 components 
pca6 = PCA(n_components=6)
pca6.fit(x)
#eigen values magnituge / length of eign vectors 
print("Eigne values:\n", pca6.explained_variance_)
# eigen vectors direction
print("Eigen vectors:\n",pca6.components_)

In [None]:
# preparing data for further use during model building 
xpca6 = pca6.transform(x)
y = data_pca.strength
from sklearn.model_selection import train_test_split
xpca_train,xpca_test, ypca_train,ypca_test = train_test_split(xpca6,y,test_size=0.30,random_state=1)
print(xpca_train.shape)
print(xpca_test.shape)
print(ypca_train.shape)
print(ypca_test.shape)

#### Decision Tree Regressor

In [None]:
# Building a decision tree regressor to check the important features in the data set 
# Training the model on the original data set

# Imported in this cell because the only other import of DecisionTreeRegressor
# appears in a later cell, after this first use.
from sklearn.tree import DecisionTreeRegressor

x= data.drop("strength",axis=1)
y = data.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state=78)
# depth capped at 5 to keep the tree small enough to plot and read
model_dt_org = DecisionTreeRegressor(max_depth=5,random_state=97)
model_dt_org.fit(x_train,y_train)
print("score on train set",model_dt_org.score(x_train,y_train))
print("score on test set", model_dt_org.score(x_test,y_test))

In [None]:
feature_df = pd.DataFrame({"features": x_train.columns,
                          "Feature importance": model_dt_org.feature_importances_})
feature_df.sort_values(by="Feature importance",axis=0, ascending=False)

In [None]:
# Age, cement and water are important features; plot the tree and check the splits.
# plot_tree is imported here because its other import only appears in a later cell.
from sklearn.tree import plot_tree
fig = plt.figure(figsize=(25,20),dpi= 300)
# feature_names is passed as a plain list; some scikit-learn releases reject a pandas Index here.
_=plot_tree(model_dt_org,feature_names=list(x_train.columns), filled=True)

In [None]:
# Building a decision tree regressor to find out feature importance on the data set with increased features 
x = data4.drop("strength", axis=1)
y = data4.strength
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state= 1)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn.tree import DecisionTreeRegressor
model_dt = DecisionTreeRegressor(random_state= 0, max_depth=5)
model_dt.fit(x_train,y_train)
ypred_train= model_dt.predict(x_train)
ypred_test = model_dt.predict(x_test)
print("Score on train set", model_dt.score(x_train,y_train))
print("score on test set", model_dt.score(x_test,y_test))
imp_features = pd.DataFrame({"Features":x.columns,"Feature_importance":model_dt.feature_importances_})
imp_features.sort_values(by="Feature_importance",axis=0,ascending=False)

In [None]:
"""
the model has a train accuracy of 83 , its underfitting / we can say high bias.
we have a low variance in the model  as there is not a huge difference between train and test accuracy 
also after adding features and treating outliers the overall accuracy of the model is increased, however Dt is not affected 
by outliers """

In [None]:
# Plotting the tree fitted on the outlier-treated data with the added features.
from sklearn.tree import plot_tree
# Plain list of names; some scikit-learn releases reject a pandas Index for feature_names.
features = list(x.columns)
plt.subplots(nrows = 1,ncols = 1,figsize = (10, 10), dpi=300)
plot_tree(model_dt,filled=True,feature_names=features)

In [None]:
# builing the model on the reduced feature data set 
model_dt_pca = DecisionTreeRegressor(random_state=12,max_depth=5)
model_dt_pca.fit(xpca_train,ypca_train)
print("train set accuracy",model_dt_pca.score(xpca_train,ypca_train))
print("test set accuracy",model_dt_pca.score(xpca_test,ypca_test))
# the model on reduced feature is underfitting the training data and there seems to be high variance as well 

#### K means  clustering. 

In [None]:
# After the pair plot analysis, initially expected 2-6 clusters.
# Plotting elbow plots for 2 to 8 clusters.

from scipy.stats import zscore
from scipy.spatial.distance import cdist  # imported here so the cell is self-contained
from sklearn.cluster import KMeans        # imported here so the cell is self-contained

data3 = data3.apply(zscore)  # scaling (later clustering cells rely on this scaled frame)
clusters = range(2,9)
wss = []              # within-cluster sum of squares (inertia) per cluster count
mean_distortion = []  # mean distance of each row to its nearest centroid
labels = []           # cluster assignment of every row, per cluster count
for c in clusters:
    model_kmeans = KMeans(n_clusters=c,init="k-means++")
    model_kmeans.fit(data3)
    wss.append(model_kmeans.inertia_)
    nearest = np.min(cdist(data3,model_kmeans.cluster_centers_,"euclidean"),axis=1)
    mean_distortion.append(nearest.mean())
    labels.append(model_kmeans.labels_)


# plt.figure instead of plt.subplots: the latter also creates a full-size
# axes that the two plt.subplot panels are then drawn on top of.
plt.figure(figsize=(12,6))
plt.subplot(1,2,1)
plt.plot(clusters,wss,"b-o")
# The x-axis is the number of clusters; the metric belongs on the y-axis.
plt.xlabel("number of clusters")
plt.ylabel("within cluster sum of square error")
plt.subplot(1,2,2)
plt.plot(clusters,mean_distortion,"b-o")
plt.xlabel("number of clusters")
plt.ylabel("mean distortion")
plt.show()

In [None]:
# from above elbow plots working with 3 clusters first 
data_c3 = data3.copy()
model_c3 = KMeans(n_clusters=3, init="k-means++")
model_c3.fit(data_c3)
prediction = model_c3.predict(data_c3)
data_c3['GROUP'] = prediction
centroids = model_c3.cluster_centers_
centroids_df = pd.DataFrame(centroids,columns=list(data3.columns))
print("Cluster centroids")
centroids_df

In [None]:
data_c3.boxplot(by="GROUP" ,figsize=[18,18]);

# Age has outliers in group 1  
# Ash has outlier in grooup 0 
# c/f ration has outliers in all groups 
# cement has outliers in group 0 
# coarseagg no outliers 
# fineagg has outliers in group 1 and 2 lower outlier  
# slag group 0 has outliers 
# strength only group 0 has outliers 
# superplastic group 0 and 2 has outliers 
# w/c ratio group 0 and 1 has outliers
# water group 0 and 1 has outliers 

In [None]:
"""
analysing variables at cluster level.
strength vs variable 
The more horizontal the line is, the more weak the independent variable is in predicting the target variable
"""

In [None]:
# Regress strength on every predictor, one fitted line per cluster.
# 'strength' (the target itself) and 'GROUP' (the hue variable) are skipped:
# neither produces a meaningful strength-vs-feature regression.
for i in data_c3.columns.drop(["strength", "GROUP"]):
    sns.lmplot(x=i,y="strength",data=data_c3,hue="GROUP")

In [None]:
# 3 cluster analysis 
"""
# strength vs cement 
# G0 (blue) and G1 ( orange) seems to have a positive relation where as the G2 ( green) is somewhat flat 
# cement seems to be good predictor for overall 3 clusters 

# strength vs slag 
# All 3 groups seeem flat lines, slag does not seems to be a good predictor for all 3 groups

# strength vs Ash
# G0(blue) seems to be in positive relation where as other 2 groups have flat lines.
# ash is not a good predictor for all 3 clusters 

# strength vs water 
# G0(blue) showing a positive relation ship with strength where as other 2 as negative 
# water is only positive for 1 cluster, may not be good predictor for all clusters 

# strength vs superplastic 
# G0 and G1 ( blue and orange) shows positive relationship where as the G2( green) shows negative 
# 2 positive 1 negative , superplastic may not be a good predictor for all 3 clusters 

# strength vs coarseagg
# G0( blue) is negative and G1(orange) is slight positive, G2( green) flat line 
# coarseagg not a good predictor for all 3 clusters 

# strength vs fineagg
# G0(blue) negative relationship 
# G1(orange) and G2(green) are also flat lines
# fine agg seems a week predictor for all 3 clusters 

# strength vs Age
# all 3 clusters represents strong relationship with strength 

# strength vs w/c ration
# all 3 clusters are negatively correlated with the ration,
# seems like a strong predictor

# strength vs c/f ration
# all 3 clusters represents strong positive correlation 
"""

In [None]:
# silhouette score of the 3-cluster model
from sklearn.metrics import silhouette_score
# data_c3 now also holds the assigned 'GROUP' labels. Drop that column so the
# score is computed on the same features the model was fitted on; scoring the
# label column together with its own labels biases the result.
score_3 = silhouette_score(data_c3.drop("GROUP",axis=1),model_c3.labels_,metric='euclidean')
score_3

In [None]:
# K-means clustering with 4 clusters 
data_c4 = data3.copy()
model_c4 = KMeans(n_clusters=4, init="k-means++")
model_c4.fit(data_c4)
prediction = model_c4.predict(data_c4)
data_c4['GROUP'] = prediction
centroids = model_c4.cluster_centers_
centroids_df_4 = pd.DataFrame(centroids,columns=list(data3.columns))
print("Cluster centroids")
centroids_df_4

In [None]:
data_c4.boxplot(by="GROUP" ,figsize=[18,18]);

# outliers can be observed in the different groups  
# Age> G0 has outliers , in 3 clusters Age had outlier in group 1 
# Ash >G2 and G3 has outliers , earlier it was G0 
# C/f ratio had outliers in all groups but in this case G0 does not have any outliers 
# cement > G0 and G2 have outliers, previously cement had in G0 
# coarseagg > except G3 all groups has outlier previously there were no outliers in any of the group 
# fineagg > earlier fine agg had outliers in all groups not in case of 4 clusters, no outliers
# slag> earlier G0 had outliers now G2 and G3 has huge outliers 
# strength >  only one group has outlier i.e. G2 earlier it was G0 
# superplastic> all 4 clusters have outliers 
# w/c ratio > only G3 has outliers earlier it was G0 and G1 
# water > earlier G0 and G1 now, G0 and G3 

In [None]:
# Regress strength on every predictor, one fitted line per cluster.
# 'strength' (the target itself) and 'GROUP' (the hue variable) are skipped:
# neither produces a meaningful strength-vs-feature regression.
for i in data_c4.columns.drop(["strength", "GROUP"]):
    sns.lmplot(x=i,y="strength",data=data_c4,hue="GROUP")

In [None]:
# understanding the relationship of variables in 4 clusters 

# 4 cluster analysis 

# G0: Blue
# G1: Orange
# G2: green
# G3: red

# strength vs cement 
# G0 (blue) , G2 ( green), G3(red) seems to have a positive correlation
# G1 ( orange) seems to be flat
# cement seems to be good predictor for 3 clusters 

# strength vs slag 
# G0 ( blue) seems to be a differentiated group , 
# where as G2 and G3  are mostly similar 
# G1 is flat
# slag seems to be only good predictor for G0 group , overall a weak predictor 

# strength vs Ash
# 2 groups positive relation G0 and G2 
# 2 groups negative relation G1 and G3
# previously in 3 clusters ash was only in positive relation with 1 group and flat for other groups
# ash is not a good predictor for all 4 clusters however represents positive relation with 1 or 2 groups. 

# strength vs water 
# G0 and G1 are flat lines
# G2 is a positive relation 
# G3 is a negative relation 
# water alone seems not to be good predictor for all the clusters 

# strength vs superplastic 
# G0 seems only to be in a positive relation with lesser residual.
# remaining 3 does not seems to be good predictors 
# overall superplastic seems to be aa weak predictor for the strength of the concrete

# strength vs coarseagg
# coarseagg seems not a good predictor for all 4 clusters 

# strength vs fineagg
# fine agg seems a week predictor for all 4 clusters 

# strength vs Age
# all 4 clusters represents strong relationship with strength
# age is strong predictor of cement strength 

# strength vs w/c ration
# all 4 clusters are negatively correlated with the ration,
# seems like a strong predictor. 

# strength vs c/f ration
# all 4 clusters seems to show positive.   

In [None]:
# silhouette score for the 4-cluster model
from sklearn.metrics import silhouette_score
# Exclude the 'GROUP' label column added after fitting, so the score is
# computed on the features the model was fitted on.
score_4 = silhouette_score(data_c4.drop("GROUP",axis=1),model_c4.labels_,metric='euclidean')
score_4

In [None]:
# 5 clusters , K-means clustering. 

In [None]:
data_c5 = data3.copy()
model_c5 = KMeans(n_clusters=5, init="k-means++")
model_c5.fit(data_c5)
prediction = model_c5.predict(data_c5)
data_c5['GROUP'] = prediction
centroids = model_c5.cluster_centers_
centroids_df_5 = pd.DataFrame(centroids,columns=list(data3.columns))
print("Cluster centroids")
centroids_df_5

In [None]:
data_c5.boxplot(by="GROUP" ,figsize=[18,18]);

In [None]:
# Regress strength on every predictor, one fitted line per cluster.
# 'strength' (the target itself) and 'GROUP' (the hue variable) are skipped:
# neither produces a meaningful strength-vs-feature regression.
for i in data_c5.columns.drop(["strength", "GROUP"]):
    sns.lmplot(x=i,y="strength",data=data_c5,hue="GROUP")

In [None]:
# Silhouette score of the 5-cluster model, computed without the 'GROUP' label
# column that was appended to data_c5 after fitting.
score_5 = silhouette_score(data_c5.drop("GROUP",axis=1), model_c5.labels_,metric="euclidean")
score_5

In [None]:
# K-means clustering with 6 clusters and its silhouette score.
data_c6 = data3.copy()
model_c6 = KMeans(n_clusters=6, init="k-means++")
model_c6.fit(data_c6)
# Score on the features only, i.e. before the label column is attached.
score_6 = silhouette_score(data_c6,model_c6.labels_,metric="euclidean")
# Labels must come from model_c6; `prediction` still held the 5-cluster assignment.
data_c6['GROUP'] = model_c6.labels_
score_6

#### Looking at the silhouette scores and the cluster analysis above, most attributes seem to be weak predictors.
#### So far, 4 clusters look good from the analysis, but apart from cement we do not see any good predictor.

# Model building || Deliverable 3 

In [None]:
# Model building considering linear regression as starting point, basis on its performance, will try other model.
# Next will try polynomial regression algorithm with different degree of freedom.

#Linear Regression
#SVR
#Ridge Regression
#Lasso Regression
#Polynomial Regression
#Decision Tree
#Random Forest
#Bagging
#Ada Boost
#Gradient Boost

In [None]:
# Model1: Linear Regression 
# building on the original data set 
from sklearn.linear_model import LinearRegression, Ridge, Lasso
data_lr = data.apply(zscore)
x = data_lr.drop('strength',axis=1)
y = data_lr.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model_lr_1 = LinearRegression()
model_lr_1.fit(x_train,y_train)
print("score on train set", model_lr_1.score(x_train,y_train))
print("score on test set", model_lr_1.score(x_test,y_test))
print("co-efficients",model_lr_1.coef_)

In [None]:
#  model is underfitting / high bias but low variance
# fitting the model on outliers treated data and added features data 
data4=data4.apply(zscore) # scaling
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
model_lr_2 = LinearRegression()
model_lr_2.fit(x_train,y_train)
print("score on train set", model_lr_2.score(x_train,y_train))
print("score on test set", model_lr_2.score(x_test,y_test))
print("co-efficients",model_lr_2.coef_)

In [None]:
# after removing outliers and additng features model score is increased 
# however we still have underfitting proble/ high bias and variance is also incresed 
# checking on the reduced dimensions 
model_lr_3 = LinearRegression()
model_lr_3.fit(xpca_train,ypca_train)
print("score on train set", model_lr_3.score(xpca_train,ypca_train))
print("score on test set", model_lr_3.score(xpca_test,ypca_test))
print("co-efficients",model_lr_3.coef_)

In [None]:
# Underfitting and low variance for the reduced feature data set 

In [None]:
# Model2 : Ridge Regression 
# on the raw data 
data_rd = data.apply(zscore)
x = data_rd.drop('strength',axis=1)
y = data_rd.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model_rd_1 = Ridge(alpha=0.3)
model_rd_1.fit(x_train,y_train)
print("score on train set", model_rd_1.score(x_train,y_train))
print("score on test set", model_rd_1.score(x_test,y_test))
print("co-efficients",model_rd_1.coef_)

In [None]:
# fitting the model with outliers treated and features added 
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
model_rd_2 = Ridge(alpha=10)
model_rd_2.fit(x_train,y_train)
print("score on train set", model_rd_2.score(x_train,y_train))
print("score on test set", model_rd_2.score(x_test,y_test))
print("co-efficients",model_rd_2.coef_)

In [None]:
model_rd_3 = Ridge(alpha=0.5)
model_rd_3.fit(xpca_train,ypca_train)
print("score on train set", model_rd_3.score(xpca_train,ypca_train))
print("score on test set", model_rd_3.score(xpca_test,ypca_test))
print("co-efficients",model_rd_3.coef_)

In [None]:
# scores are similar for linear and ridge regression at alpha in the range 1 to 100
# Model3 : Lasso Regularization 
# on the raw data 
data_la = data.apply(zscore)
x = data_la.drop('strength',axis=1)
y = data_la.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_la_1 = Lasso(alpha=0.001)
model_la_1.fit(x_train,y_train)
print("score on train set", model_la_1.score(x_train,y_train))
print("score on test set", model_la_1.score(x_test,y_test))
print("co-efficients",model_la_1.coef_)

In [None]:
# only 3 important features, Cement Age and coarseagg
# fitting the model with outliers treated and features added 
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
model_la_2 = Lasso(alpha=0.01)
model_la_2.fit(x_train,y_train)
print("score on train set", model_la_2.score(x_train,y_train))
print("score on test set", model_la_2.score(x_test,y_test))
print("co-efficients",model_la_2.coef_)

In [None]:
# important features are cement, slag, water, superplastic, age and the 2 new features at alpha = 0.1
# all features are important at alpha = 0.01
# Model accuracy increased but the model is still underfitting / high bias, high variance
# testing the model on the reduced features / PCA data

# This is the Lasso entry of the comparison table, so it must be a Lasso
# estimator; it was previously built with Ridge by mistake.
model_la_3 = Lasso(alpha=0.01)
model_la_3.fit(xpca_train,ypca_train)
print("score on train set", model_la_3.score(xpca_train,ypca_train))
print("score on test set", model_la_3.score(xpca_test,ypca_test))
print("co-efficients",model_la_3.coef_)

In [None]:
# Model4 : Polynomial regression 
from sklearn.preprocessing import PolynomialFeatures

# fitting on the original (z-scored) data 

data_py = data.apply(zscore)
x = data_py.drop('strength',axis=1)
y = data_py.strength
x_train, x_test, y_train_poly, y_test_poly = train_test_split(x,y,random_state= 123, test_size=0.30)
# degree-2 expansion restricted to pairwise interaction terms (no squared terms)
poly = PolynomialFeatures(degree=2,interaction_only=True)
x_train_poly = poly.fit_transform(x_train) 
# the transformer is fitted on the training split only; the test split is just transformed
x_test_poly = poly.transform(x_test)
model_py_1 = LinearRegression()
model_py_1.fit(x_train_poly,y_train_poly)
# scores are stored because the comparison table reuses them later
model_py_1_scoretrain = model_py_1.score(x_train_poly,y_train_poly)
model_py_1_scoretest =  model_py_1.score(x_test_poly,y_test_poly)
print("score on train set",model_py_1_scoretrain )
print("score on test set", model_py_1_scoretest)
print("co-efficients",model_py_1.coef_)

In [None]:
# model is underfitting, however there is very low variance in the model.
# 37 features

# preparing on the outliers treated and added features data 

x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
# degree-2 expansion restricted to pairwise interaction terms (no squared terms)
poly = PolynomialFeatures(degree=2,interaction_only=True)
x_train_poly = poly.fit_transform(x_train) 
# the transformer is fitted on the training split only; the test split is just transformed
x_test_poly = poly.transform(x_test)
model_py_2 = LinearRegression()
model_py_2.fit(x_train_poly,y_train)
# scores are stored because the comparison table reuses them later
model_py_2_scoretrain = model_py_2.score(x_train_poly,y_train)
model_py_2_scoretest = model_py_2.score(x_test_poly,y_test)
print("score on train set", model_py_2_scoretrain)
print("score on test set", model_py_2_scoretest)
print("co-efficients",model_py_2.coef_)

In [None]:
# better fitting but variance is increased,
# 56 features 
# fitting on the reduced feature data set 
# `poly` is the interaction-only degree-2 transformer created in the previous cell;
# it is re-fitted on the PCA training matrix and then only applied to the test matrix.
xpca_train_poly = poly.fit_transform(xpca_train)
xpca_test_poly = poly.transform(xpca_test)
model_py_3 = LinearRegression()
model_py_3.fit(xpca_train_poly,ypca_train)
# scores are stored because the comparison table reuses them later
model_py_3_scoretrain =  model_py_3.score(xpca_train_poly,ypca_train)
model_py_3_scoretest = model_py_3.score(xpca_test_poly,ypca_test)
print("score on train set",model_py_3_scoretrain)
print("score on test set", model_py_3_scoretest)
print("co-efficients",model_py_3.coef_)

In [None]:
# no better results on the reduced features data set 

# Model5 : SVR support vector regressor 
# applying on the original data 
from sklearn.svm import SVR
data_svr = data.apply(zscore)
x = data_svr.drop('strength',axis=1)
y = data_svr.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_svr_1 = SVR()
model_svr_1.fit(x_train,y_train)
print("score on train set", model_svr_1.score(x_train,y_train))
print("score on test set", model_svr_1.score(x_test,y_test))

In [None]:
# so far best score among all the models, data is fitting better but still underfit , high variance 
# checking on the outlier treated data 
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
model_svr_2 = SVR()
model_svr_2.fit(x_train,y_train)
print("score on train set", model_svr_2.score(x_train,y_train))
print("score on test set", model_svr_2.score(x_test,y_test))

In [None]:
# better results , data seems to fit better than as compared to original data, however there is high variance 
# high variance resvrtes to more complexity, however we have just built a basic SVM regressor 

# testing on the reduced feature data set 
model_svr_3 = SVR()
model_svr_3.fit(xpca_train,ypca_train)
print("score on train set", model_svr_3.score(xpca_train,ypca_train))
print("score on test set", model_svr_3.score(xpca_test,ypca_test))

In [None]:
# lesser variance as compared to previous model 
# Model6 : Randomforest regressor 

from sklearn.ensemble import RandomForestRegressor
data_rf = data.apply(zscore)
x = data_rf.drop('strength',axis=1)
y = data_rf.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_rf_1 = RandomForestRegressor(random_state=25,n_estimators=50)
model_rf_1.fit(x_train,y_train)
print("score on train set", model_rf_1.score(x_train,y_train))
print("score on test set", model_rf_1.score(x_test,y_test))

In [None]:
# low bais high variance zone.
# fitting on the created data set with features and removed outliers
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 456, test_size=0.30)
model_rf_2 = RandomForestRegressor(random_state=12,n_estimators=50)
model_rf_2.fit(x_train,y_train)
print("score on train set", model_rf_2.score(x_train,y_train))
print("score on test set", model_rf_2.score(x_test,y_test))

In [None]:
# no change in performance as compared to previoys 
# testing on the reduced feature data set 

model_rf_3 = RandomForestRegressor(n_estimators=50,random_state=78)
model_rf_3.fit(xpca_train,ypca_train)
print("score on train set", model_rf_3.score(xpca_train,ypca_train))
print("score on test set", model_rf_3.score(xpca_test,ypca_test))

In [None]:
# again low bais but high variance in the model 
# Model 7 : Bagging Regressor 

# on Raw data 
from sklearn.ensemble import BaggingRegressor
data_bgr = data.apply(zscore)
x = data_bgr.drop('strength',axis=1)
y = data_bgr.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_bgr_1 = BaggingRegressor(random_state=95,n_estimators=100)
model_bgr_1.fit(x_train,y_train)
print("score on train set", model_bgr_1.score(x_train,y_train))
print("score on test set", model_bgr_1.score(x_test,y_test))

In [None]:
#low bais , high variance 
# fitiing on featured data 
data4 = data4.apply(zscore)
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 748, test_size=0.30)
model_bgr_2 = BaggingRegressor(random_state=4,n_estimators=50)
model_bgr_2.fit(x_train,y_train)
print("score on train set", model_bgr_2.score(x_train,y_train))
print("score on test set", model_bgr_2.score(x_test,y_test))

In [None]:
# outlier treatment have no random forest regressor and bagging regressor 
# on the reduced feature data set 
model_bgr_3 = BaggingRegressor(n_estimators=100,random_state=75)
model_bgr_3.fit(xpca_train,ypca_train)
print("score on train set", model_bgr_3.score(xpca_train,ypca_train))
print("score on test set", model_bgr_3.score(xpca_test,ypca_test))

In [None]:
# reduced feature data has very high variance 
# Model 8 : Adaboost Regressor.
# on the raw data, 
from sklearn.ensemble import AdaBoostRegressor
data_ada = data.apply(zscore)
x = data_ada.drop('strength',axis=1)
y = data_ada.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_ada_1 = AdaBoostRegressor(random_state=96,n_estimators=100)
model_ada_1.fit(x_train,y_train)
print("score on train set", model_ada_1.score(x_train,y_train))
print("score on test set", model_ada_1.score(x_test,y_test))

In [None]:
# high bais but low variance 
# checking on the featured data 
data4 = data4.apply(zscore)
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 748, test_size=0.30)
model_ada_2 = AdaBoostRegressor(random_state=35,n_estimators=100)
model_ada_2.fit(x_train,y_train)
print("score on train set", model_ada_2.score(x_train,y_train))
print("score on test set", model_ada_2.score(x_test,y_test))

In [None]:
# high bias , variance reduced as compared to previous model 
# on the reduced feature data set 

model_ada_3 = AdaBoostRegressor(n_estimators=50,random_state=47)
model_ada_3.fit(xpca_train,ypca_train)
print("score on train set", model_ada_3.score(xpca_train,ypca_train))
print("score on test set", model_ada_3.score(xpca_test,ypca_test))

In [None]:
# no better performance on reduced featured data set 
# Model 9 : Gradient Boosting Regressor 

# on the raw data 
from sklearn.ensemble import GradientBoostingRegressor
data_gbm = data.apply(zscore)
x = data_gbm.drop('strength',axis=1)
y = data_gbm.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 123, test_size=0.30)
model_gbm_1 =GradientBoostingRegressor(random_state=75,n_estimators=50)
model_gbm_1.fit(x_train,y_train)
print("score on train set", model_gbm_1.score(x_train,y_train))
print("score on test set", model_gbm_1.score(x_test,y_test))

In [None]:
# best scores so far, low bias and a low variance and a better score on the test set 

# checking on the added feature data set 
data4 = data4.apply(zscore)
x = data4.drop('strength',axis=1)
y = data4.strength
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state= 748, test_size=0.30)
model_gbm_2 = GradientBoostingRegressor(random_state=53,n_estimators=100)
model_gbm_2.fit(x_train,y_train)
print("score on train set", model_gbm_2.score(x_train,y_train))
print("score on test set", model_gbm_2.score(x_test,y_test))

In [None]:
# we can tune this model further . 

# checking on the reduced feature data set.
model_gbm_3 = GradientBoostingRegressor(n_estimators=100,random_state=147)
model_gbm_3.fit(xpca_train,ypca_train)
print("score on train set", model_gbm_3.score(xpca_train,ypca_train))
print("score on test set", model_gbm_3.score(xpca_test,ypca_test))

In [None]:
# no better performance on the reduced feature data set 
# Comparing the performance of all the built models, then tuning the best one.
#
# The comparison re-scores already fitted models, so these splits have to
# reproduce the splits the models were trained on. Otherwise rows a model was
# trained on end up in its "test" score and vice versa.
scaled_data = data.apply(zscore)
x1= scaled_data.drop("strength",axis=1)
y1= scaled_data.strength
# same split (random_state=123, 30% test) as used to fit every raw-data model (*_1)
x1_train,x1_test,y1_train,y1_test = train_test_split(x1,y1,random_state=123,test_size=0.30)
data4= data4.apply(zscore)
x2= data4.drop("strength",axis=1)
y2= data4.strength
# same split (random_state=456, 30% test) as used to fit model_lr_2 ... model_rf_2
# NOTE(review): model_bgr_2, model_ada_2 and model_gbm_2 were fitted on a
# random_state=748 split, so their scores on this split still mix train and
# test rows until those cells use the same random_state.
x2_train,x2_test,y2_train,y2_test = train_test_split(x2,y2,random_state=456,test_size=0.30)

In [None]:
models = ["Linear Regression","Ridge Regression","Lasso Regression","Polynomial Regression","SVR","Randomfoest",
         "Bagging Regressor","adaboost Regressor","Gradientboost regressor"]
a_train = {"Train score rawdata": [model_lr_1.score(x1_train,y1_train),model_rd_1.score(x1_train,y1_train), 
                                            model_la_1.score(x1_train,y1_train), model_py_1_scoretrain,
                                           model_svr_1.score(x1_train,y1_train), model_rf_1.score(x1_train,y1_train),
                                           model_bgr_1.score(x1_train,y1_train), model_ada_1.score(x1_train,y1_train),
                                           model_gbm_1.score(x1_train,y1_train)],
            "Test score rawdata":[model_lr_1.score(x1_test,y1_test),model_rd_1.score(x1_test,y1_test),
                                         model_la_1.score(x1_test,y1_test), model_py_1_scoretest,
                                         model_svr_1.score(x1_test,y1_test), model_rf_1.score(x1_test,y1_test),
                                         model_bgr_1.score(x1_test,y1_test), model_ada_1.score(x1_test,y1_test),
                                         model_gbm_1.score(x1_test,y1_test)],
            "Train score featuredata":[model_lr_2.score(x2_train,y2_train),model_rd_2.score(x2_train,y2_train),
                                              model_la_2.score(x2_train,y2_train), model_py_2_scoretrain,
                                              model_svr_2.score(x2_train,y2_train), model_rf_2.score(x2_train,y2_train),
                                              model_bgr_2.score(x2_train,y2_train), model_ada_2.score(x2_train,y2_train),
                                              model_gbm_2.score(x2_train,y2_train)],
            "Test score featuredata":[model_lr_2.score(x2_test,y2_test),model_rd_2.score(x2_test,y2_test),
                                              model_la_2.score(x2_test,y2_test), model_py_2_scoretest,
                                              model_svr_2.score(x2_test,y2_test), model_rf_2.score(x2_test,y2_test),
                                              model_bgr_2.score(x2_test,y2_test), model_ada_2.score(x2_test,y2_test),
                                              model_gbm_2.score(x2_test,y2_test)],
            "Train score PCA data":[model_lr_3.score(xpca_train,ypca_train),model_rd_3.score(xpca_train,ypca_train),
                                     model_la_3.score(xpca_train,ypca_train), model_py_3_scoretrain,
                                     model_svr_3.score(xpca_train,ypca_train), model_rf_3.score(xpca_train,ypca_train),
                                     model_bgr_3.score(xpca_train,ypca_train),model_ada_3.score(xpca_train,ypca_train),
                                     model_gbm_3.score(xpca_train,ypca_train)],
            'Test score PAC data':[model_lr_3.score(xpca_test,ypca_test),model_rd_3.score(xpca_test,ypca_test),
                                    model_la_3.score(xpca_test,ypca_test),model_py_3_scoretest,
                                    model_svr_3.score(xpca_test,ypca_test),model_rf_3.score(xpca_test,ypca_test),
                                    model_bgr_3.score(xpca_test,ypca_test), model_ada_3.score(xpca_test,ypca_test),
                                    model_gbm_3.score(xpca_test,ypca_test)]}


In [None]:
compare_df=pd.DataFrame(a_train,index=models)
compare_df

In [None]:
# so far we observe the GBM as the best performing model on the outlier treated data with added features 

# Hypertuning || Deliverable 4

In [None]:
# using Gridsearch CV to find the best parameter for GBR
from sklearn.model_selection import GridSearchCV
estimator=GradientBoostingRegressor()
grid ={'n_estimators':[100,200,300,400,500,600],
       'learning_rate':[.001,0.01,.1],
       'max_depth':[1,2,3,4,5],
        'subsample':[.5,.75,1],
       'random_state':[1]}
search=GridSearchCV(estimator=estimator,param_grid=grid,scoring='neg_mean_squared_error',n_jobs=1,cv=10)

In [None]:
# best parameter for GBR on raw data 
search.fit(x1_train,y1_train)
search.best_params_

In [None]:
# building the GBM model with the above parameters 
gbm_1 = GradientBoostingRegressor(learning_rate=0.1,max_depth=5,n_estimators=400,random_state=1,subsample=1)
gbm_1.fit(x1_train,y1_train)
print("score on the train set", gbm_1.score(x1_train,y1_train))
print("score on the test set",gbm_1.score(x1_test,y1_test))

In [None]:
# score comparison after model tuning
# score is increased from 91 on train to 99 on train, 
# score is increased from 87 on test to 92 on test 

In [None]:
# best parameter for GBM on outliers treated and feature data 
search.fit(x2_train,y2_train)
search.best_params_

In [None]:
gbm_2 = GradientBoostingRegressor(learning_rate=0.1,max_depth=5,n_estimators=200,random_state=1,subsample=1)
gbm_2.fit(x2_train,y2_train)
print("score on the train set", gbm_2.score(x2_train,y2_train))
print("score on the test set",gbm_2.score(x2_test,y2_test))

In [None]:
# score comparison after model tuning
# score is increased from 94 on train to 99 on train, 
# score is increased from 90 on test to 92 on test 

In [None]:
# model performance range at 95% confidemce 
# Leveraging cross validation to estimate the performance on the unseen data 

from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(gbm_2,x2,y2,cv=10)
cv_scores_mean = cv_scores.mean()
cv_scores_std = cv_scores.std()
print("cross validation scores", cv_scores)
print(" Accuracy : %.3f%%(%.3f%%)"% (cv_scores_mean*100, cv_scores_std*100))

## The accuracy of the Gradient Boost model 

#### ---------------------------------------------------------------------------------------------------------------------------------
#### In the production environment the score (the R² returned by cross_val_score) is expected to be 92.84% (+/-) one standard deviation (2.805%).
#### At roughly the 95% confidence level, the model score in the production environment is
#### expected to lie in the range 92.84% (+/-) 2 * standard deviation, i.e. [87.23, 98.45].
#### ---------------------------------------------------------------------------------------------------------------------------------