In [None]:
import pandas as pd
import numpy as pi
import seaborn as sn

import matplotlib.pyplot as plt
%matplotlib inline


from sklearn import metrics
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from scipy.stats import randint as sp_randint

from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans 

# Preprocessing libraries
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

# Models
from sklearn import svm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV

## Dataset Description and the problem space

- The dataset describes the compressive strength of concrete, with the parameters below

    - Cement
    - Blast
    - Fly Ash
    - Water
    - Super Plasticizer
    - Coarse Aggregate
    - Fine Aggregate
    - Age
    - Concrete Compressive Strength

In [None]:
# Setting the display layout for the coding environment

# `IPython.core.display` is a deprecated import location for these helpers;
# the supported public module is `IPython.display`.
from IPython.display import display, HTML

# Widen the notebook container so wide figures and tables use 90% of the page.
display(HTML("<style>.container { width:90% !important; }</style>"))

sn.set(font_scale=1.0)

## Dataset analysis for Cement Concrete

- Loading the dataset and checking its dimensions, shape and the types of the parameters
- Analysing each attribute for anomalies

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
print('Listing files in the folder')
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
concrete_data = pd.read_csv("/kaggle/input/regression-with-neural-networking/concrete_data.csv")

# Replace the CSV headers with short names. The names are assigned by position,
# so this assumes the file has exactly these nine columns in this order.
concrete_data.columns=['cement','slag','ash','water','superplastic','coarseagg','fineagg','age','strength']

print("\nDimensions of the data")

print("Shape of the data :{0}".format(concrete_data.shape))
print("Size of the data :{0}".format(concrete_data.size))
print("nDim of the data :{0}".format(concrete_data.ndim))
print("Shape x * y :{0}".format(concrete_data.shape[0]*concrete_data.shape[1]))

print("\nData types of the Data")
# info() writes its report to stdout itself and returns None; wrapping it in
# print() only appended a stray "None" line to the output.
concrete_data.info()

print("\nData elements from file")
concrete_data.head(10)

## Checking for missing or null values

- The numeric columns do not contain invalid characters, so only NaN needs to be checked
- Checking the number of 0's in the data, since a 0 could also stand in for a null value

In [None]:
# Count the missing (NaN) values in every column

print("\nList of Null values in each column\n")
null_counts = concrete_data.isnull().sum()
print(null_counts)

In [None]:
# The dataset has no null values, but a 0.0 may actually be standing in for a missing value

concrete_columns = concrete_data.columns

print('\nZeros in each of the attribute\n')
# Count the zeros of every column in one vectorised pass, then report them column by column.
zero_counts = concrete_data.eq(0).sum()
for col, zero_count in zero_counts.items():
    print(f"Total 0's in Column[{col}] : {zero_count}")


### Summary of the Dataset Analysis

- There are 9 attributes in total and the data types are already numeric, hence no conversion of datatypes was required
- All the attributes are linear in nature, and there is a good number of data values for each attribute
- There are no null values in any of the attributes, hence null value treatment is not required
- All the attributes are relevant, so there is no need to drop any of them
- **'strength'** is the target variable; a detailed analysis of that attribute follows later


#### Complexities in the data set

- Values of the attributes are in different units, hence they need to be brought to a common scale by preprocessors
- Some of the attributes have zero values; these might be default values, hence leaving them untouched for now
- There could be some outliers in the data, so some outlier processing is needed

In [None]:
# define functions to identify the number of outliers

def Identify_Outliers(data_column):
    """Count the values of a column that fall outside the 1.5 * IQR whiskers.

    Parameters
    ----------
    data_column : anything ``pd.DataFrame`` accepts (the notebook passes a Series).

    Returns
    -------
    pd.Series
        Number of outliers per column of the frame built from ``data_column``
        (a single entry, labelled with the column name, for a named Series).
    """
    frame = pd.DataFrame(data_column)

    lower_quartile, upper_quartile = frame.quantile(0.25), frame.quantile(0.75)
    spread = upper_quartile - lower_quartile

    # A value is an outlier when it lies beyond either whisker.
    below_lower_whisker = frame < (lower_quartile - 1.5 * spread)
    above_upper_whisker = frame > (upper_quartile + 1.5 * spread)

    return (below_lower_whisker | above_upper_whisker).sum()
    

print("\n Number of outliers in each attribute \n")
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent
# (column name, column Series) iterator and exists in older versions as well.
for (columnName, columnData) in concrete_data.items():
    print(Identify_Outliers(concrete_data[columnName]))

data_columns = concrete_data.columns

In [None]:
# Visualising the outliers

fig, axis = plt.subplots(2, 3, figsize=(25, 12), sharex=False)

# (column, panel title, box colour, (row, column) position in the grid).
# The title is set on the same axes object the box plot is drawn on; previously
# three titles were all written to axis[1,1] and two panels got the wrong label.
boxplot_panels = [
    ("slag", "Slag", 'green', (0, 0)),
    ("water", "water", 'green', (0, 1)),
    ("superplastic", "superplastic", 'green', (0, 2)),
    ("fineagg", "fineagg", 'orange', (1, 0)),
    ("age", "age", 'orange', (1, 1)),
    ("ash", "ash", 'orange', (1, 2)),
]

for column_name, panel_title, box_color, (grid_row, grid_col) in boxplot_panels:
    panel = axis[grid_row, grid_col]
    # Passing the data as x= (keyword) keeps the call valid across seaborn
    # versions, where the meaning of the first positional argument changed.
    sn.boxplot(x=concrete_data[column_name], color=box_color, orient='h', ax=panel)
    panel.set_title(panel_title)

plt.show()

### Summary of the Outliers Analysis

- Included the 'ash' attribute as a comparison case without outliers
- The outliers in these attributes are few compared to the number of data points
- The 'age' column has the highest number of outliers; more detailed analysis is required before correcting them
- Modifying the outliers will impact the data quality

In [None]:
# Do a scatter plot to understand the outliers in detail with the target variable

fig, axis = plt.subplots(1, 4, figsize=(25, 8), sharex=False)

axis[0].set_title("Super Plastic")
sn.scatterplot(x='strength' ,y='superplastic',data=concrete_data,ax=axis[0]);

axis[1].set_title("Water")
sn.scatterplot(x='strength' ,y='water',data=concrete_data,ax=axis[1]);

axis[2].set_title("Age")
sn.scatterplot(x='strength' ,y='age',data=concrete_data,ax=axis[2]);

# The 'ash' panel is drawn on axis[3], so its title belongs there; it used to
# overwrite the "Age" title on axis[2] and leave the last panel untitled.
axis[3].set_title("Fly Ash")
# `palette` only applies together with `hue`; a single marker colour is set with `color`.
sn.scatterplot(x='strength' ,y='ash',data=concrete_data, color='orange',  ax=axis[3]);

plt.show()

### Summary of the Outlier Analysis

- The outliers for the attributes **'Slag' , 'Fine Aggregate'** look to be nominal, hence not altering those attributes
- Did a scatter plot to look into the attributes which have the higher number of outliers: **'Super Plastic' , 'Water' , 'Age'**
- The outliers for **'Water'** look fairly even, with a few values beyond both the lower and upper whiskers, so this attribute is left intact
- The outliers for **'Super Plastic'** are a little spread out; the impact could be mainly due to the 0 values. We will experiment with modifying the 0's
- The **'ash'** attribute looks similar to **'Super Plastic'**; the difference is that the upper whisker spreads a little lower
- Outlier treatment is needed for the **'age'** attribute as it looks to be more spread out; treating the upper values alone would be better


In [None]:
# treating the outliers of the 'age' and 'superplastic'

age_IQR = concrete_data["age"].quantile(.75)-concrete_data["age"].quantile(.25)
age_upperWhisker = concrete_data["age"].quantile(.75) + (age_IQR*1.5)

# The printed value is the upper whisker (Q3 + 1.5 * IQR), not the Q3 quantile itself.
print(f'\n Upper whisker (Q3 + 1.5*IQR) of age: {age_upperWhisker}')

# Work on a copy so the original frame stays available for comparison.
concrete_outlierremoved_data = concrete_data.copy()

# superplastic: replace the zeros by the column median, then cap the values at 20.
concrete_outlierremoved_data['superplastic'] = concrete_outlierremoved_data['superplastic'].replace({0:concrete_outlierremoved_data['superplastic'].median()})
concrete_outlierremoved_data['superplastic'] = pi.where(concrete_outlierremoved_data['superplastic']>20,20,concrete_outlierremoved_data['superplastic'])

# age: cap everything above the upper whisker at the whisker value.
concrete_outlierremoved_data['age'] = pi.where(concrete_outlierremoved_data['age']>age_upperWhisker,age_upperWhisker,concrete_outlierremoved_data['age'])

# Do a histogram to see the difference

fig, axis = plt.subplots(2, 2, figsize=(25, 8), sharex=False)

# Top row: superplastic before / after treatment.
axis[0,0].set_title('Super Plastic (original)')
sn.distplot(concrete_data['superplastic'],color='blue',ax=axis[0,0]);

axis[0,1].set_title('Super Plastic (treated)')
sn.distplot(concrete_outlierremoved_data['superplastic'],color='blue',ax=axis[0,1]);

# Bottom row: age before / after treatment. The titles are set on the bottom-row
# axes; they used to overwrite the titles of the superplastic panels above.
axis[1,0].set_title('age (original)')
sn.distplot(concrete_data['age'],color='orange',ax=axis[1,0]);

axis[1,1].set_title('age (treated)')
sn.distplot(concrete_outlierremoved_data['age'],color='orange',ax=axis[1,1]);



### Summary of the outlier treatment

- Removing the outliers of **'superplastic'** seems to normalise the data and reduce the skew
- Capping the outliers at the upper whisker has reduced the skewness of the data
- Univariate analysis of the other attributes is needed before finalising the outlier treatment

In [None]:
# Summary statistics of every column; the frame is the cell's last expression so it is rendered as a table.
print('\nFive point summary for the attributes')
concrete_data.describe()

In [None]:
print('Shape of Data: ', concrete_data.shape)

# Print each central-tendency statistic under its own heading.
central_statistics = (
    ('\nMedian for the data', concrete_data.median),
    ('\nMode for the data', concrete_data.mode),
)
for heading, compute_statistic in central_statistics:
    print(heading)
    print(compute_statistic())

In [None]:
print('\nSkewing of the data\n')

# Right skew check for every column: the mean is larger than the median.
for col_name in concrete_columns:
    attribute = concrete_data[col_name]
    is_right_skewed = attribute.mean() > attribute.median()
    print('{0} Parameter is Right Skewed:   {1}'.format(col_name, is_right_skewed))

# Left skew check (mean smaller than the median) for three selected columns;
# only the first line of this group is preceded by a blank line.
for group_position, left_candidate in enumerate(('water', 'superplastic', 'fineagg')):
    attribute = concrete_data[left_candidate]
    is_left_skewed = attribute.mean() < attribute.median()
    leading_gap = '\n' if group_position == 0 else ''
    print(leading_gap + '{0} Parameter is Left Skewed: {1}'.format(left_candidate, is_left_skewed))


### Summary of the Five point analysis

- The difference between the median and the max values is greater for the attributes **'cement' , 'slag' , 'ash' , 'age'**
- The above difference causes the skewness of the attributes; most of these attributes are right skewed
- Some of the attributes have 0 as their mode because 0 is the most frequent value in these attributes

In [None]:
def Display_DistPlot(col,axis_rad,color):
    """Plot the distribution of one ``concrete_data`` column on the given axes.

    col      -- name of the ``concrete_data`` column to plot; also used as the title
    axis_rad -- matplotlib axes to draw on
    color    -- colour passed to the seaborn distribution plot
    """
    sn.distplot(concrete_data[col], ax=axis_rad, color=color)
    axis_rad.set_title(col)

sn.set(font_scale=1.5)

fig, axis = plt.subplots(2, 4, figsize=(30, 15), sharex=False)

# One entry per grid row: the row colour and the columns plotted left to right.
distplot_rows = (
    ('blue', ('cement', 'slag', 'ash', 'water')),
    ('orange', ('superplastic', 'coarseagg', 'fineagg', 'age')),
)

for grid_row, (row_color, row_columns) in enumerate(distplot_rows):
    for grid_col, column_name in enumerate(row_columns):
        Display_DistPlot(column_name, axis[grid_row, grid_col], row_color)

plt.show()

### Summary of the Univariate Analysis

- The attributes **'cement' , 'Coarse Agg' , 'Fine Agg'** look to be closer to normal, with a little dip in the curve
- **'slag' , 'ash' , 'superplastic'** have additional Gaussians due to the 0's
- The **'age'** attribute has multiple Gaussians and is right skewed

## Correlation or BiVariate Analysis

- The correlation matrix with a heat map visualises the relations between the attributes
- The thresholded heat map gives better clarity on the relations
- The pairplot shows a detailed view of the relation between each pair of attributes
- A follow-up swarm or violin plot will share some details on the impact of selected attributes

In [None]:
# Identify the Correlation between the variables

fig, axis = plt.subplots(2, 1, figsize=(30, 20), sharex=False)

sn.set(font_scale=1.5)

# Compute the correlation matrix once; its upper triangle (non-zero entries,
# diagonal included) is used as the mask so only the lower triangle is drawn.
# `pi` is this notebook's alias for numpy.
correlation_matrix = concrete_data.corr()
upper_triangle_mask = pi.triu(correlation_matrix)

# Top panel: the correlation coefficients themselves.
sn.heatmap(correlation_matrix, mask=upper_triangle_mask, annot=True, fmt='.3f', annot_kws={"size": 14}, ax=axis[0]);

# Bottom panel: which coefficients exceed the threshold.
corr_thresold=0.1
concrete_corr_threshold = correlation_matrix > corr_thresold

sn.heatmap(concrete_corr_threshold, mask=upper_triangle_mask, annot=True, fmt='d', annot_kws={"size": 16}, ax=axis[1]);

plt.show()

In [None]:

# Scatter plot for every pair of attributes, with a KDE curve on the diagonal.
pairplot_options = dict(kind='scatter', diag_kind='kde', x_vars=concrete_columns, y_vars=concrete_columns)
graph = sn.pairplot(concrete_data, **pairplot_options);

# Enlarge the whole grid so the individual panels stay readable.
graph.fig.set_size_inches(35, 35)

### Summary of the Analysis

- **'Cement'** and **'Strength'** show a positive correlation, with the data scattered on the right
- **'slag' , 'ash' , 'water' , 'superplastic' , 'coarseagg' , 'fineagg'** have multiple Gaussians; will do PCA analysis to form clusters
- **'age'** shows quite a number of Gaussians
- The correlation of **'strength'** with the different attributes is more scattered, which suggests that none of the features needs to be removed
- **'water'** and **'superplastic'** show a negative correlation, which needs a study with the swarm plot

In [None]:
fig, axis = plt.subplots(2, 3, figsize=(30, 20), sharex=False)

# (x, y) attribute pairs laid out exactly as they appear in the 2 x 3 grid;
# every panel colours the points by 'strength'.
scatter_layout = [
    [('water', 'superplastic'), ('water', 'fineagg'), ('age', 'superplastic')],
    [('cement', 'superplastic'), ('cement', 'coarseagg'), ('cement', 'fineagg')],
]

for grid_row, row_pairs in enumerate(scatter_layout):
    for grid_col, (x_attribute, y_attribute) in enumerate(row_pairs):
        sn.scatterplot(data=concrete_data, x=x_attribute, y=y_attribute, hue='strength', ax=axis[grid_row, grid_col]);

plt.show()

### Summary of the Analysis

- As discussed earlier, 'water' and 'superplastic' have a negative correlation; moreover, the more the water, the lesser the superplastic and the strength
- 'cement' and 'coarseagg' have an interesting relationship: with higher cement and mid-low to higher coarse aggregate the concrete gets stronger, whereas with higher coarseagg and lesser cement the strength is always lower
- The age is a little strange: higher age values also show lower superplastic and strength



In [None]:
# Identifying the minimum values for the attributes

# Print the sorted distinct values of each attribute (`pi` is this notebook's numpy alias).
for attribute in ('slag', 'ash', 'superplastic'):
    print(f'\nUniques values of the "{attribute}"\n')
    print(pi.sort(concrete_data[attribute].unique()))


## Preparing the data set for the analysis

- Bringing the values to a common scale for the analysis
- Not removing any of the features or attributes, as they all look good
- There are 0 values in the attributes, hence imputing them with the median values
- The outliers do not seem to have an impact, hence leaving the outliers as they are for the analysis

In [None]:
# Using the simple imputer to change the 0's to median

# Zeros are declared to be the "missing" marker, so every 0 is replaced by the median of its column.
imputer = SimpleImputer(missing_values=0, strategy='median')
imputed_values = imputer.fit(concrete_data).transform(concrete_data)

concrete_mod_data = pd.DataFrame(imputed_values, columns=concrete_data.columns)

# Re-run the zero count on the imputed frame.
print('\nZeros in each of the attribute\n')
for col in concrete_columns:
    remaining_zeros = (concrete_mod_data[col] == 0).sum()
    print(f"Total 0's in Column[{col}] : {remaining_zeros}")


## PCA Analysis - Composite Feature and Gaussian

- Using PCA for composite features and dimensionality reduction
- The PCA clusters are also helpful in testing the accuracy of the model
- The clusters can also be analysed for any Gaussians as well as for their prediction strength

In [None]:
column_names = concrete_mod_data.columns

# Standardise every column (target included) with StandardScaler and put the
# resulting array back into a frame with the original column names.
scaler_model = preprocessing.StandardScaler()
conc_standardscaled_array = scaler_model.fit(concrete_mod_data).transform(concrete_mod_data)

conc_standardscaled = pd.DataFrame(data=conc_standardscaled_array, columns=column_names)

# Separate the target from the predictors.
conc_stdscaled_y = conc_standardscaled['strength']
conc_stdscaled_x = conc_standardscaled.drop(columns=['strength'])

print(f"\nScales Axis Shape: {conc_stdscaled_x.shape}")
conc_stdscaled_x.head()

In [None]:
from sklearn.decomposition import PCA

components_selected=8
pca = PCA(n_components=components_selected)
pca.fit(conc_stdscaled_x)

print(f"\n The explained variance based on the {components_selected} components\n")
print(pca.explained_variance_ratio_)


cumm_var_ratio = pd.Series(pi.cumsum(pca.explained_variance_ratio_))
print(f"\nThe cummulative variance ratio base on the {components_selected} components\n")
print(cumm_var_ratio)


# x positions 1..components_selected, derived from the setting above instead of
# a hard-coded range(1, 9) that would go out of sync if the count is changed.
component_numbers = list(range(1, components_selected + 1))

# Share of the variance explained by each individual component.
plt.bar(component_numbers,pca.explained_variance_ratio_,alpha=1, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

# Running total of the explained variance.
plt.step(component_numbers,pi.cumsum(pca.explained_variance_ratio_), where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()


In [None]:
components_selected = 7
# Refit the PCA with the reduced number of components (fit() returns the fitted estimator).
pca = PCA(n_components=components_selected).fit(conc_stdscaled_x)

explained_ratio = pca.explained_variance_ratio_
print(f"\n The explained variance based on the {components_selected} components\n")
print(explained_ratio)


cumm_var_ratio = pd.Series(pi.cumsum(explained_ratio))
print(f"\nThe cummulative variance ratio base on the {components_selected} components\n")
print(cumm_var_ratio)


# Project the scaled predictors onto the fitted components.
projected_values = pca.transform(conc_stdscaled_x)
conc_transformed_x = pd.DataFrame(projected_values)
conc_transformed_x.head()

In [None]:
# Pairwise scatter plots of the PCA-transformed columns, with a KDE curve on the diagonal.
pca_pairplot_options = dict(kind='scatter', diag_kind='kde')
graph = sn.pairplot(conc_transformed_x, **pca_pairplot_options);

# Enlarge the whole grid so the individual panels stay readable.
graph.fig.set_size_inches(35, 35)

### Summary of the analysis

- All the transformed clusters (principal components) are independent and do not show any correlation
- Identified 7 cluster features for the model prediction

## Identifying the features for the model through the coefficients
- The pairplot visualisation helped in analysing the features
- In order to identify the feature importance, the model coefficients (`coef_`) are used and features are filtered based on their values

In [None]:
# Preview the first rows of the frame produced by the zero-to-median imputation.
concrete_mod_data.head()

In [None]:
# Predictors / target taken from the imputed (unscaled) frame.
conc_y = concrete_mod_data['strength']
conc_x = concrete_mod_data.drop(columns=['strength'])

x_train_feature, x_test_feature, y_train_feature, y_test_feature = train_test_split(
    conc_x, conc_y, test_size=0.3, random_state=80
)

# A linear-kernel SVR exposes one coefficient per feature through coef_.
svm_model = svm.SVR(kernel='linear')
svm_model.fit(x_train_feature, y_train_feature)

# Column 0 holds the coefficient, column 1 the matching feature name.
coefficient_values = pd.Series(svm_model.coef_[0])
feature_names = pd.Series(x_train_feature.columns)
ceof_data = pd.concat([coefficient_values, feature_names], axis=1)

print("\n Feature coeff\n")
print(ceof_data.sort_values(by=[0]))

### Summary of the Feature selection

- All features show good feature coefficients, hence not removing any of the features

## Preparing the data for the Model

- The usual approach is to split the data 70:30; the same is followed here
- The model will be trained with the regular data as well as the PCA cluster data to compare accuracy

In [None]:

# Scaling the source data: z-score every column of the imputed frame
conc_zscaled = concrete_mod_data.apply(zscore)

# Standard data: split the scaled frame into target and predictors
conc_zscaled_y = conc_zscaled['strength']
conc_zscaled_x = conc_zscaled.drop(columns=['strength'])

conc_zscaled_x.head()


In [None]:
# PCA transformed data 

# Only the last expression of a cell is rendered, so a bare `conc_stdscaled_y`
# in the middle of the cell showed nothing; display the target preview explicitly.
display(conc_stdscaled_y.head())
conc_transformed_x.head()

In [None]:
# Splitting the different data sets with the same 70:30 ratio and the same seed
split_options = dict(test_size=0.3, random_state=160)

# z-scaled data split
x_train_zscaled, x_test_zscaled, y_train_zscaled, y_test_zscaled = train_test_split(
    conc_zscaled_x, conc_zscaled_y, **split_options
)

# PCA Cluster data split
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(
    conc_transformed_x, conc_stdscaled_y, **split_options
)


## Algorithms - SVM & Decision Tree Regressor Models

- SVM and the Decision Tree look to be the best models for this data set

In [None]:

# Fit an SVR on the z-scaled training split (fit() returns the fitted estimator).
svm_model = svm.SVR(C=10, gamma=.7).fit(x_train_zscaled, y_train_zscaled)

svm_predict_train, svm_predict_test = (
    svm_model.predict(features) for features in (x_train_zscaled, x_test_zscaled)
)

# The reported "accuracy" is the R^2 score of the predictions.
svm_train_accuracy = metrics.r2_score(y_train_zscaled, svm_predict_train)
svm_test_accuracy = metrics.r2_score(y_test_zscaled, svm_predict_test)

print("\nAccuracy of the SVM Model\n")
for split_label, split_score in (("Train", svm_train_accuracy), ("Test ", svm_test_accuracy)):
    print(f"{split_label} Accuracy: {split_score}")

In [None]:

# Fit an SVR on the PCA-transformed training split (fit() returns the fitted estimator).
svm_model = svm.SVR(C=10, gamma=.6).fit(x_train_pca, y_train_pca)

svm_predict_train_pca, svm_predict_test_pca = (
    svm_model.predict(features) for features in (x_train_pca, x_test_pca)
)

# The reported "accuracy" is the R^2 score of the predictions.
svm_train_accuracy_pca = metrics.r2_score(y_train_pca, svm_predict_train_pca)
svm_test_accuracy_pca = metrics.r2_score(y_test_pca, svm_predict_test_pca)

print("\nAccuracy of the SVM Model\n")
for split_label, split_score in (("Train", svm_train_accuracy_pca), ("Test ", svm_test_accuracy_pca)):
    print(f"{split_label} Accuracy: {split_score}")

In [None]:
# Cross-validate an SVR on the full z-scaled data with 30 shuffled folds.
kfoldcross = KFold(n_splits=30, shuffle=True, random_state=190)
svm_model = svm.SVR(C=10, gamma=.5)
score = cross_val_score(svm_model, conc_zscaled_x, conc_zscaled_y, cv=kfoldcross)

# Report the mean of the per-fold scores.
kfold_score = score.mean()
print("Kfold Crossvalidation score", f"Accuracy: {kfold_score}", sep="\n")

### Summary of the Model Tuning for SVM

- Tuned the C and gamma values for an accuracy (R² score) of 79% on the test data
- Extra performance squeezed through the K-fold yielded 87.9% accuracy

In [None]:
# Decision tree on the z-scaled split. random_state is fixed because the tree
# permutes the features at each split, so an unseeded fit can give a different
# tree (and different scores) on every run.
dectree_model = DecisionTreeRegressor(max_depth=14, random_state=160)
dectree_model.fit(x_train_zscaled,y_train_zscaled)

dectree_predict_train = dectree_model.predict(x_train_zscaled)
dectree_predict_test = dectree_model.predict(x_test_zscaled)

# The reported "accuracy" is the R^2 score of the predictions.
dectree_train_accuracy = metrics.r2_score(y_train_zscaled,dectree_predict_train)
dectree_test_accuracy = metrics.r2_score(y_test_zscaled,dectree_predict_test)

print("\nAccuracy of the Decission Tree Model\n")
print(f"Train Accuracy: {dectree_train_accuracy}")
print(f"Test  Accuracy: {dectree_test_accuracy}")

In [None]:
# Decision tree on the PCA-transformed split. random_state is fixed because the
# tree permutes the features at each split, so an unseeded fit can give a
# different tree (and different scores) on every run.
dectree_model = DecisionTreeRegressor(max_depth=22, random_state=160)
dectree_model.fit(x_train_pca,y_train_pca)

dectree_predict_train_pca = dectree_model.predict(x_train_pca)
dectree_predict_test_pca = dectree_model.predict(x_test_pca)

# The reported "accuracy" is the R^2 score of the predictions.
dectree_train_accuracy_pca = metrics.r2_score(y_train_pca,dectree_predict_train_pca)
dectree_test_accuracy_pca = metrics.r2_score(y_test_pca,dectree_predict_test_pca)

print("\nAccuracy of the Decission Tree Model\n")
print(f"Train Accuracy: {dectree_train_accuracy_pca}")
print(f"Test  Accuracy: {dectree_test_accuracy_pca}")

In [None]:
# Cross-validate a decision tree on the full z-scaled data with 40 shuffled folds.
# The tree is seeded as well as the fold shuffling, so the mean score is
# reproducible across re-runs.
dectree_model = DecisionTreeRegressor(max_depth=50, random_state=160)
kfoldcross = KFold(n_splits=40, random_state=190,shuffle=True)
score = cross_val_score(dectree_model, conc_zscaled_x, conc_zscaled_y, cv=kfoldcross)

# Report the mean of the per-fold scores.
kfold_score=score.mean()
print("Kfold Crossvalidation score")
print(f"Accuracy: {kfold_score}")

In [None]:

import sklearn

# scikit-learn 1.0 renamed the regression criteria "mse"/"mae" to
# "squared_error"/"absolute_error", and 1.2 removed the old names. Pick the
# spelling that the installed version accepts so the search runs on both.
sklearn_major_minor = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
if sklearn_major_minor >= (1, 0):
    split_criteria = ["squared_error", "absolute_error"]
else:
    split_criteria = ["mse", "mae"]

# Search space for the tree. sp_randint(7, 9) draws 7 or 8 (upper bound exclusive).
param_dist = {"max_depth": range(20,50),
              "max_features": sp_randint(7, 9),
              "min_samples_split": range(5, 20),
              "min_samples_leaf": range(5, 20),
              "criterion": split_criteria}

# cv is left at the library default (5 folds since scikit-learn 0.22, 3 before).
# random_state makes the 50 sampled candidates the same on every run.
randomCV = RandomizedSearchCV(dectree_model, param_distributions=param_dist, n_iter=50, random_state=190)

randomCV.fit(conc_zscaled_x, conc_zscaled_y)
print(randomCV.best_params_)

# Mean cross-validated score of every sampled candidate, in ascending order.
print(pi.sort(randomCV.cv_results_['mean_test_score']))

### Summary of the Model Tuning for Decision Tree

- Tuned the different parameters of the Decision Tree for an accuracy (R² score) of 80% on the test data
- Extra performance squeezed through the K-fold yielded 85% accuracy

## Final Summary

- The SVM model was tuned using the K-fold to get an accuracy of 87.9%, compared with the initial 79%
- The Decision Tree was tuned using the K-fold to get an accuracy of 85%, compared with the initial 80%
- The Random Search CV was not able to push the model to a greater accuracy; it only yielded 80% at most