In [229]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import itertools 
%matplotlib inline 

from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler, PolynomialFeatures 
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso 

from sklearn.neighbors import KNeighborsRegressor 
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor, VotingRegressor
# Support vector regressor
from sklearn.svm import SVR 
from xgboost.sklearn import XGBRegressor

from sklearn import metrics 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.cluster import KMeans
from sklearn.utils import resample

from scipy import stats
from scipy.stats import zscore

In [230]:
from pyforest import*
#lazy_imports()

In [231]:
# Load the concrete dataset into `df`, the frame every later cell works on.
# The path is relative, so it resolves against the kernel's working directory.
df = pd.read_csv('./data/compresive_strength_concrete.csv')

<IPython.core.display.Javascript object>

In [232]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,Cement (component 1)(kg in a m^3 mixture),Blast Furnace Slag (component 2)(kg in a m^3 mixture),Fly Ash (component 3)(kg in a m^3 mixture),Water (component 4)(kg in a m^3 mixture),Superplasticizer (component 5)(kg in a m^3 mixture),Coarse Aggregate (component 6)(kg in a m^3 mixture),Fine Aggregate (component 7)(kg in a m^3 mixture),Age (day),"Concrete compressive strength(MPa, megapascals)"
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [233]:
# Check the dtype pandas inferred for each column.
df.dtypes

Cement (component 1)(kg in a m^3 mixture)                float64
Blast Furnace Slag (component 2)(kg in a m^3 mixture)    float64
Fly Ash (component 3)(kg in a m^3 mixture)               float64
Water  (component 4)(kg in a m^3 mixture)                float64
Superplasticizer (component 5)(kg in a m^3 mixture)      float64
Coarse Aggregate  (component 6)(kg in a m^3 mixture)     float64
Fine Aggregate (component 7)(kg in a m^3 mixture)        float64
Age (day)                                                  int64
Concrete compressive strength(MPa, megapascals)          float64
dtype: object

In [None]:
# Replace the verbose source headers with the short names used by the rest of
# the notebook.
# NOTE: the target header 'Concrete compressive strength(MPa, megapascals) '
# ends with a space in the source file, so its key below keeps that space.
short_names = {
    'Cement (component 1)(kg in a m^3 mixture)': 'cement',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'slag',
    'Fly Ash (component 3)(kg in a m^3 mixture)': 'ash',
    'Water  (component 4)(kg in a m^3 mixture)': 'water',
    'Superplasticizer (component 5)(kg in a m^3 mixture)': 'plasticizer',
    'Coarse Aggregate  (component 6)(kg in a m^3 mixture)': 'coarse',
    'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'fineAggregate',
    'Age (day)': 'age',
    'Concrete compressive strength(MPa, megapascals) ': 'strength',
}
df = df.rename(columns=short_names)

# DataFrame.rename silently ignores keys that match no column, so a header
# typo would only surface later as a KeyError. Validate here instead; checking
# the resulting names (not the source names) keeps the cell safe to re-run.
missing_columns = sorted(set(short_names.values()) - set(df.columns))
assert not missing_columns, 'Columns not found after rename: {}'.format(missing_columns)

In [None]:
# Display the frame to confirm the short column names were applied.
df

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# NOTE(review): isnull() is an alias of isna(), so this repeats the
# missing-value count from the previous cell.
df.isnull().sum()

In [None]:
# Summary statistics, transposed so each column of `df` is shown as a row.
df.describe().T


## Exploratory Analysis

In [None]:
# Quartiles of the cement column, computed in a single quantile() call.
# q1 and q3 are read again by the cement outlier-limit cell below.
q1, q2, q3 = df['cement'].quantile(q=[0.25, 0.5, 0.75])

print('First quantile is(Q1): {}'.format(q1))
print('Second quantile is(Q2): {}'.format(q2))
print('Third quantile is(Q3): {}'.format(q3))

In [None]:
# Interquartile range (Q3 - Q1) of cement; it sets the width of the outlier
# fences computed in the next cell.
stats.iqr(df['cement']) # Interquartile range(Q3-Q1)

In [None]:
# Outlier fences for cement: values more than 1.5 * IQR beyond the quartiles
# are treated as outliers.
# The quartiles are recomputed here instead of reading notebook-level q1/q3,
# because the outlier-replacement loop further down reassigns those names for
# other columns; re-running this cell afterwards would otherwise use the wrong
# quartiles.
cement_q1 = df['cement'].quantile(q=0.25)
cement_q3 = df['cement'].quantile(q=0.75)
cement_iqr = stats.iqr(df['cement'])  # computed once, shared by both limits

u_outliers_limit = cement_q3 + 1.5 * cement_iqr
l_outliers_limit = cement_q1 - 1.5 * cement_iqr

# Percentage of rows above / below the fences.
n_rows = df.shape[0]
print(((df['cement'] > u_outliers_limit).sum() * 100) / n_rows, '%')
print(((df['cement'] < l_outliers_limit).sum() * 100) / n_rows, '%')

In [None]:
# Horizontal box plot of cement, to eyeball the outliers counted above.
sns.boxplot(x='cement', data=df, orient='h')

In [None]:
# Distribution of the fly-ash column on its own figure, using an explicit
# Axes instead of the pyplot "current axes" state.
# NOTE(review): distplot is deprecated in newer seaborn releases - confirm the
# installed version before upgrading.
ash_fig, ash_ax = plt.subplots(figsize=(10, 7))
ash_ax.set_title('ash')
sns.distplot(df['ash'], ax=ash_ax)

In [None]:
# Horizontal box plot of fly ash.
# Keyword arguments match the cement box plot above and avoid the positional
# form, whose meaning differs between seaborn versions.
sns.boxplot(x='ash', data=df, orient='h')

In [None]:
# Re-check names and dtypes now that the columns carry their short names.
df.dtypes

In [None]:
# 3x3 grid with one distribution plot per column (eight features plus the
# target). ax1.flat walks the grid row by row, so the list order below is the
# panel order.
fig1, ax1 = plt.subplots(nrows=3, ncols=3, figsize=(16, 16))
panel_columns = [
    'cement', 'slag', 'ash',
    'water', 'plasticizer', 'coarse',
    'fineAggregate', 'age', 'strength',
]
for panel, column in zip(ax1.flat, panel_columns):
    sns.distplot(df[column], ax=panel)

In [None]:
# Pairwise scatter plots of all columns with KDE curves on the diagonal; the
# trailing semicolon suppresses the returned object's repr.
sns.pairplot(df, diag_kind='kde');

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlations = df.corr()

plt.figure(figsize=(10, 7))
sns.heatmap(correlations, annot=True, square=True, vmax=1, cmap='viridis')
plt.title('Correlation between different attributes')
plt.show()

In [None]:
# Box plot of every column side by side, before any outlier treatment.
plt.figure(figsize=(10,7))
df.boxplot()

In [None]:
# Count outliers per feature using the 3-sigma rule: a value is flagged when
# it lies more than 3 standard deviations from its column mean.
# The last column is skipped via [:-1]; the notebook treats it as the target.
feature_columns = df.columns[:-1]

for col in feature_columns:
    z_scores = (df[col] - df[col].mean()) / df[col].std()
    outliers_no = (z_scores.abs() > 3).sum()
    print('Outliers in {}: {}'.format(col, outliers_no))


In [None]:
def replace_outliers_with_median(frame, columns, whisker=1.5):
    """Return a copy of ``frame`` with IQR outliers replaced by the column median.

    For each column in ``columns``, values below ``Q1 - whisker * IQR`` or
    above ``Q3 + whisker * IQR`` are overwritten with that column's median
    (computed before the replacement). Columns are processed independently.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    columns : iterable of str
        Names of the columns to clean.
    whisker : float, default 1.5
        Multiple of the IQR that defines the outlier fences.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``frame``.
    """
    cleaned = frame.copy()
    for col in columns:
        lower_q = cleaned[col].quantile(q=0.25)
        upper_q = cleaned[col].quantile(q=0.75)
        spread = upper_q - lower_q

        lower_limit = lower_q - whisker * spread
        upper_limit = upper_q + whisker * spread

        is_outlier = (cleaned[col] < lower_limit) | (cleaned[col] > upper_limit)
        cleaned.loc[is_outlier, col] = cleaned[col].median()
    return cleaned


# Replace outliers in every feature column; [:-1] skips the last column, which
# the notebook treats as the target.
# Keeping the quartile temporaries inside the helper stops this cell from
# overwriting the notebook-level q1/q3 computed for cement earlier.
# NOTE(review): the limits and medians come from the full data set, before the
# train/test split - confirm that is acceptable for the evaluation.
# NOTE(review): re-running this cell cleans the already-cleaned frame again.
df = replace_outliers_with_median(df, df.columns[:-1])

In [None]:
# Same all-column box plot, redrawn after the outlier replacement above.
plt.figure(figsize=(10,7))
df.boxplot()

In [None]:
# Separate the target from the features, then hold out 30% of the rows for
# testing. The fixed random_state makes the split repeatable.
target_column = 'strength'
y = df[target_column]
X = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=7
)

In [None]:
# Standardise the features using statistics learned from the training split
# only. Z-scoring the test split with its own mean/std (as the previous code
# did) puts train and test on different scales and leaks test information
# into the evaluation.
# StandardScaler and scipy's zscore both use the population std, so the
# training matrix is the same as before; only the test matrix changes.
scaler = StandardScaler().fit(X_train)

# Wrap the results back into DataFrames so index and column names survive.
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Full feature matrix for the k-fold run in the evaluation cell.
# NOTE(review): scaling all rows together shares statistics across folds; a
# Pipeline(StandardScaler, model) inside cross_val_score would avoid that.
X = X.apply(zscore)

In [None]:
# Test k value for KNeighborsRegressor
# Disabled experiment: the code sits inside a bare triple-quoted string, so
# nothing below runs; the cell only evaluates to that string.
# As written it would fit KNN for k = 1..19 on the training split and plot the
# test-set R^2 for each k.
# NOTE(review): the y-axis label says 'Error' but the plotted values are R^2
# scores, and `score_diff_k` is never used - fix both before re-enabling.
'''
score_diff_k = []
r2_scores = []
for neighbors in range(1, 20):
    model =  KNeighborsRegressor(n_neighbors=neighbors)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_scr = metrics.r2_score(y_test, y_pred)
    r2_scores.append(r2_scr)

plt.figure(figsize=(10,7))
plt.plot(range(1,20), r2_scores, marker='*')
plt.xlabel('No of neighbors')
plt.ylabel('Error')
'''

In [None]:
# plt.bar(df.columns ,model.feature_importances_)
# NOTE(review): the line above is left disabled; no `model` is assigned by any
# executed code before this cell.
# List the current column names.
df.columns

In [None]:
# Candidate regressors as (short label, estimator) pairs; the evaluation loop
# iterates over this list in order.
# Every estimator that uses randomness gets a fixed seed so the score table is
# reproducible across kernel restarts. 7 matches the seed already used for
# train_test_split and KFold in this notebook.
model_seed = 7

models = [
    ('RanFR', RandomForestRegressor(random_state=model_seed)),
    ('LinR', LinearRegression()),
    ('GrdBR', GradientBoostingRegressor(random_state=model_seed)),
    ('AdaBR', AdaBoostRegressor(random_state=model_seed)),
    ('KNeiR', KNeighborsRegressor(n_neighbors=4)),
    ('BagR', BaggingRegressor(random_state=model_seed)),
    ('SVR', SVR(kernel='linear')),
    ('XGBR', XGBRegressor(random_state=model_seed)),
    ('DecTR', DecisionTreeRegressor(random_state=model_seed)),
]

In [None]:
# Fit every candidate on the training split and collect comparable metrics.
# The summary-table cell reads these lists, so each one must end up with
# exactly one entry per model, in the same order.
train_scores = []
test_scores = []
r2_scores = []
mse_scores = []
cv_scores = []
model_names = []

# The fold layout does not depend on the model, so build it once; the fixed
# seed gives every model the same ten folds.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in models:
    model.fit(X_train, y_train)
    model_names.append(name)

    # For regressors, score() is R^2, so these are R^2 on each split.
    train_scr = model.score(X_train, y_train)
    train_scores.append(train_scr)

    test_scr = model.score(X_test, y_test)
    test_scores.append(test_scr)

    y_pred = model.predict(X_test)
    r2_scr = metrics.r2_score(y_test, y_pred)
    r2_scores.append(r2_scr)

    mse_scr = metrics.mean_squared_error(y_test, y_pred)
    mse_scores.append(mse_scr)

    # One score per fold, computed on the full feature matrix X.
    # Average the signed scores: R^2 is negative for a model that does worse
    # than predicting the mean, and abs() would report such a fold as a good
    # fit.
    cv_scr = cross_val_score(model, X, y, cv=kfold)
    cv_scr_mean = np.mean(cv_scr)
    cv_scores.append(cv_scr_mean)
    

In [None]:
# One row per model, one column per metric collected by the evaluation loop.
score_columns = {
    'algorithm': model_names,
    'train_scr': train_scores,
    'test_scr': test_scores,
    'r2_scr': r2_scores,
    'mse_scr': mse_scores,
    'kfold': cv_scores,
}
pd.DataFrame(score_columns)

In [None]:
# Understand the importance of the different features.
# Disabled experiment: the code sits inside a bare triple-quoted string, so
# nothing below runs. As written it would fit a DecisionTreeRegressor on the
# training split and draw its feature_importances_ as a horizontal bar chart.
'''
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

imp = pd.DataFrame(model.feature_importances_, columns=['importance'], index=X_train.columns)

plt.figure(figsize=(7,4))
plt.barh(imp.index, imp['importance'])
'''

In [None]:
# Rebuild the train/test data with a reduced feature set: ash, plasticizer,
# fineAggregate and coarse are dropped along with the target.
# This rebinds X, y and the four split variables used by the cells above.
df_imp = df.copy()

X = df_imp.drop(['ash', 'plasticizer', 'fineAggregate', 'coarse', 'strength'], axis=1)
y = df_imp['strength']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=7)

# Scale with statistics learned from the training split only. Z-scoring the
# test split with its own mean/std (as the previous code did) puts train and
# test on different scales and leaks test information into the evaluation.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Standardised copy of the full reduced feature matrix.
X = X.apply(zscore)

In [None]:
# Decision tree on the current train/test split.
# A fixed random_state makes the fitted tree, and therefore the scores below,
# reproducible; 7 matches the seed used for the split.
model = DecisionTreeRegressor(random_state=7)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# score() is R^2 for regressors, so r2_scr and test_scr are the same quantity
# computed two ways.
train_scr = model.score(X_train, y_train)
test_scr = model.score(X_test, y_test)

r2_scr = metrics.r2_score(y_test, y_pred)
mse_scr = metrics.mean_squared_error(y_test, y_pred)


In [None]:
# Show the test-set score (model.score on X_test) from the previous cell.
test_scr