 - Name -- Data Type -- Measurement -- Description
 - Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable
 - Age -- quantitative -- Day (1~365) -- Input Variable
 - Concrete compressive strength -- quantitative -- MPa -- Output Variable

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the session. Note this also hides
# deprecation notices raised by the plotting and modelling libraries.
warnings.simplefilter('ignore')

In [None]:
# Load the concrete compressive-strength CSV into a DataFrame.
df = pd.read_csv('../input/concrete-compressive-strength/Concrete Compressive Strength.csv')

In [None]:
# Preview the first rows as loaded from the CSV.
df.head()

In [None]:

# The CSV headers are long and carry spaces, capitals and units. Map each one
# to a short snake_case label, since these names are typed repeatedly below.
short_names = {
    'Cement (component 1)(kg in a m^3 mixture)': 'cement',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)': 'furnace_slag',
    'Fly Ash (component 3)(kg in a m^3 mixture)': 'fly_ash',
    'Water  (component 4)(kg in a m^3 mixture)': 'water',
    'Superplasticizer (component 5)(kg in a m^3 mixture)': 'super_plasticizer',
    'Coarse Aggregate  (component 6)(kg in a m^3 mixture)': 'coarse_agg',
    'Fine Aggregate (component 7)(kg in a m^3 mixture)': 'fine_agg',
    'Age (day)': 'age',
    'Concrete compressive strength(MPa, megapascals) ': 'strength',
}
df = df.rename(columns=short_names)

df.head()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# List the current column labels.
df.columns

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

In [None]:
# Number of missing values in each column.
print(df.isna().sum())

In [None]:
# Heatmap of the summary statistics: the first row of describe() ('count')
# is dropped, the table is transposed so each variable is a row, and the
# values are rounded to two decimals before plotting.
summary = df.describe().iloc[1:].T.round(2)

plt.figure(figsize=(15, 12))
sns.heatmap(summary, linewidth=2, annot=True, fmt="f")
plt.title("Variables summary")
plt.xticks(fontsize=18)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Distribution of the target column.
# NOTE(review): seaborn has deprecated distplot in favour of histplot/displot;
# consider migrating once the installed seaborn version is confirmed.
sns.distplot(df['strength'])

In [None]:
import itertools  # no longer used by this cell; kept so the name stays importable for any other cell

cols = ['cement', 'furnace_slag', 'fly_ash', 'water', 'super_plasticizer','coarse_agg', 'fine_agg']
cs = ["b","r","g","c","m","k","lime"]
fig = plt.figure(figsize=(13,25))

# One panel per mixture component on a 4x2 grid, each drawn in its own colour
# with a dashed vertical line marking the column mean.
for position, (column, colour) in enumerate(zip(cols, cs), start=1):
    ax = plt.subplot(4, 2, position)
    sns.distplot(df[column], color=colour, rug=True, ax=ax)
    ax.set_facecolor("w")
    ax.axvline(df[column].mean(), linestyle="dashed", label="mean", color="k")
    ax.legend(loc="best")
    ax.set_title(column, color="navy")
    ax.set_xlabel("")

In [None]:
# Pairwise correlation between all columns, annotated with the coefficients.
corr_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=True)

In [None]:
# Box plot of every column on one shared axis.
plt.figure(figsize=(12,6))
df.boxplot()
plt.show()

In [None]:
# Preview the first rows again.
df.head()

In [None]:
# Treat 0.0 in the slag and fly-ash columns as missing: swap the zeros for NaN,
# then fill them with the median of the remaining (non-zero) values.
# NOTE(review): a zero may simply mean the ingredient was not used in the mix,
# and the medians are computed before the train/test split; confirm both are intended.
for column in ('furnace_slag', 'fly_ash'):
    without_zeros = df[column].replace(0.0, np.nan)
    df[column] = without_zeros.fillna(without_zeros.median())
df.head()

### Split

In [None]:
# Separate the target column from the predictor columns.
y = df['strength']
X = df.drop(columns='strength')

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_scaled = sc.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=45)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features using statistics learned from the training split only.
sc = StandardScaler()
X_train_scaled = sc.fit_transform(X_train)
# Apply the training-set mean/std to the test split. Calling fit_transform here
# would re-fit the scaler on the test data, so the two splits would be scaled
# with different statistics and the model would be scored on mismatched inputs.
X_test_scaled = sc.transform(X_test)

### Linear regression

In [None]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from math import sqrt

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear model on the standardised training features.
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Training-set quality: R^2 (printed as a percentage) and RMSE.
y_pred_lr = lr.predict(X_train_scaled)
train_mse = mean_squared_error(y_train, y_pred_lr)
score = r2_score(y_train, y_pred_lr)
print("Score of Training:", 100*score)
print("RMSE :", np.sqrt(train_mse))

# Held-out test-set quality: R^2 (printed as a percentage), RMSE and MAE.
y_test_pred_lr = lr.predict(X_test_scaled)
test_mse = mean_squared_error(y_test, y_test_pred_lr)
test_mae = mean_absolute_error(y_test, y_test_pred_lr)
score = r2_score(y_test, y_test_pred_lr)
print("Score of Testing:", 100*score)
print("RMSE : ", np.sqrt(test_mse))
print("Mean Absolute Error", test_mae)

In [None]:
# Predict the strength of one hand-written mixture with the linear model.
# The row has eight values, one per predictor column of X.
# NOTE(review): fly ash is 0.0 here, while the training data had its zeros in
# that column replaced by the median; confirm the sample should not be treated the same way.
feature = [[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270]]
# Reuse the scaler's already-fitted statistics. Re-fitting on this single row
# (fit_transform) would give mean == value for every column, so every scaled
# feature would be 0 and the model would return only its intercept.
feature_scaled = sc.transform(feature)
pred = lr.predict(feature_scaled)
pred

### LGBM

In [None]:
from lightgbm import LGBMRegressor

# Gradient-boosted trees with 500 estimators, fitted on the unscaled features.
lgm = LGBMRegressor(n_estimators=500)
lgm.fit(X_train, y_train)

# Training-set quality: R^2 (printed as a percentage) and RMSE.
y_pred_lgm = lgm.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_lgm)
score = r2_score(y_train, y_pred_lgm)
print("Score of Training:", 100*score)
print("RMSE :", np.sqrt(train_mse))

# Held-out test-set quality: R^2 (printed as a percentage), RMSE and MAE.
y_test_pred_lgm = lgm.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred_lgm)
test_mae = mean_absolute_error(y_test, y_test_pred_lgm)
score = r2_score(y_test, y_test_pred_lgm)
print("Score of Testing:", 100*score)
print("RMSE : ", np.sqrt(test_mse))
print("Mean Absolute Error", test_mae)

In [None]:
# Predict the strength of one hand-written mixture with the LightGBM model.
# The row has eight values, presumably in the column order of X; verify against X.columns.
feature = [[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270]]
# No scaling is applied here: lgm was fitted on the unscaled X_train.
pred = lgm.predict(feature)
pred