## **Cement Strength Prediction Model**
### **Author : Sujal Karbhari**

In [1]:
# Import Data Manipulation Library
import pandas as pd
import numpy as np

# Import Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Import Machine Learning Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,RobustScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [11]:
# Data Ingestion

def data_ingestion(path=r'C:\CementStrength_PredictionModel\data\raw\Concrete_Compressive_Strength.csv'):
    """Read the raw dataset CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path that was previously
        hard-coded in this function, so existing ``data_ingestion()`` calls
        keep working; pass another path to run on a different machine.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised (e.g. ``FileNotFoundError``). The
        failure message is printed first, then the original error is
        re-raised so the real cause is visible instead of an
        ``UnboundLocalError`` on the return statement.
    """
    try:
        df = pd.read_csv(path)
    except Exception:
        print("Data Ingestion Unsuccessful")
        # Re-raise: there is no DataFrame to return, and hiding the cause
        # only moves the crash to a less informative place.
        raise
    print("Data Ingestion Successful")
    return df

In [13]:
# data exploration
from collections import OrderedDict
# Module-level list for per-feature statistics records (starts empty).
numerical_stats = []

# First and third quartile of every column of `df`, and their spread.
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

# Whisker bounds placed 1.5 * IQR beyond the quartiles.
LW, UW = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Per-column number and percentage of values lying outside the whiskers.
Outlier_Count = ((df < LW) | (df > UW)).sum()
Outlier_Percentage = (Outlier_Count / len(df)) * 100

def data_exploration(df):
    """Build a per-column descriptive-statistics report for ``df``.

    All statistics (including quartiles, whiskers and outlier counts) are
    computed from the ``df`` argument itself, so the function gives correct
    results for any frame and does not depend on module-level variables.
    Each call starts from an empty record list, so calling it repeatedly
    never accumulates duplicate rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Only numeric columns are reported; non-numeric columns
        are skipped because the statistics below are undefined for them.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns: features, Maximum,
        Minimum, Mean, Median, Q1, Q3, IQR, Lower Whisker, Upper Whisker,
        Outlier Count, Outlier Percentage, Standard Deviation, Skewness,
        Kurtosis. Empty (but with these columns) when ``df`` has no numeric
        columns.
    """
    report_columns = [
        "features", "Maximum", "Minimum", "Mean", "Median",
        "Q1", "Q3", "IQR", "Lower Whisker", "Upper Whisker",
        "Outlier Count", "Outlier Percentage",
        "Standard Deviation", "Skewness", "Kurtosis",
    ]

    numeric_df = df.select_dtypes(include="number")
    n_rows = len(numeric_df)

    records = []
    for feature in numeric_df.columns:
        values = numeric_df[feature]

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        # Whiskers sit 1.5 * IQR beyond the quartiles; values outside them
        # are counted as outliers.
        lower_whisker = q1 - 1.5 * iqr
        upper_whisker = q3 + 1.5 * iqr
        outlier_count = int(((values < lower_whisker) | (values > upper_whisker)).sum())
        # Guard the zero-row case so the percentage is NaN rather than a
        # division-by-zero warning.
        outlier_pct = (outlier_count / n_rows) * 100 if n_rows else float("nan")

        # Plain dicts preserve insertion order, which fixes the column order.
        records.append({
            "features": feature,
            "Maximum": values.max(),
            "Minimum": values.min(),
            "Mean": values.mean(),
            "Median": values.median(),
            "Q1": q1,
            "Q3": q3,
            "IQR": iqr,
            "Lower Whisker": lower_whisker,
            "Upper Whisker": upper_whisker,
            "Outlier Count": outlier_count,
            "Outlier Percentage": outlier_pct,
            "Standard Deviation": values.std(),
            "Skewness": values.skew(),
            "Kurtosis": values.kurtosis(),
        })

    # Build the frame once, after the loop, instead of on every iteration.
    return pd.DataFrame(records, columns=report_columns)

# Build the per-column statistics report for `df` and display it as the cell output.
numerical_stats_report = data_exploration(df)
numerical_stats_report

Unnamed: 0,features,Maximum,Minimum,Mean,Median,Q1,Q3,IQR,Lower Whisker,Upper Whisker,Outlier Count,Outlier Percentage,Standard Deviation,Skewness,Kurtosis
0,cement,540.0,102.0,281.167864,272.9,192.375,350.0,157.625,-44.0625,586.4375,0,0.0,104.506364,0.509481,-0.520652
1,slag,359.4,0.0,73.895825,22.0,0.0,142.95,142.95,-214.425,357.375,2,0.194175,86.279342,0.800717,-0.508175
2,ash,200.1,0.0,54.18835,0.0,0.0,118.3,118.3,-177.45,295.75,0,0.0,63.997004,0.537354,-1.328746
3,water,247.0,121.8,181.567282,185.0,164.9,192.0,27.1,124.25,232.65,9,0.873786,21.354219,0.074628,0.122082
4,superplastic,32.2,0.0,6.20466,6.4,0.0,10.2,10.2,-15.3,25.5,10,0.970874,5.973841,0.907203,1.411269
5,coarseagg,1145.0,801.0,972.918932,968.0,932.0,1029.4,97.4,785.9,1175.5,0,0.0,77.753954,-0.04022,-0.599016
6,fineagg,992.6,594.0,773.580485,779.5,730.95,824.0,93.05,591.375,963.575,5,0.485437,80.17598,-0.25301,-0.102177
7,age,365.0,1.0,45.662136,28.0,7.0,56.0,49.0,-66.5,129.5,59,5.728155,63.169912,3.269177,12.168989
8,strength,82.6,2.33,35.817961,34.445,23.71,46.135,22.425,-9.9275,79.7725,4,0.38835,16.705742,0.416977,-0.313725
9,water_cement_ratio,1.882334,0.266892,0.748263,0.675346,0.533332,0.93516,0.401828,-0.069409,1.537901,18,1.747573,0.314003,0.958065,0.734109


In [14]:
# Dataset Information: print the index range, column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   cement                    1030 non-null   float64
 1   slag                      1030 non-null   float64
 2   ash                       1030 non-null   float64
 3   water                     1030 non-null   float64
 4   superplastic              1030 non-null   float64
 5   coarseagg                 1030 non-null   float64
 6   fineagg                   1030 non-null   float64
 7   age                       1030 non-null   int64  
 8   strength                  1030 non-null   float64
 9   water_cement_ratio        1030 non-null   float64
 10  total_binder              1030 non-null   float64
 11  aggregate_to_cement       1030 non-null   float64
 12  cement_water_interaction  1030 non-null   float64
 13  age_strength_proxy        1030 non-null   float64
dtypes: float

In [16]:
# Data Preprocessing

def data_preprocessing(df, target='strength', test_size=0.3, random_state=1):
    """Split ``df`` into train/test sets and scale the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the feature columns and the ``target`` column.
    target : str, optional
        Name of the column to predict. Defaults to ``'strength'``.
    test_size : float, optional
        Fraction of rows held out for testing. Defaults to ``0.3``.
    random_state : int, optional
        Seed for the train/test shuffle. Defaults to ``1``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the ``X`` parts are the
        outputs of ``RobustScaler`` and the ``y`` parts are the target values
        of the corresponding rows.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    # NOTE(review): every non-target column becomes a feature. If `df` holds
    # columns derived from the target, they would leak it into X -- confirm.
    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler once, on the training split only, then reuse the fitted
    # parameters for the test split so no test-set statistics enter scaling.
    scaler = RobustScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

In [18]:
# Model Building
def model_building(Xdf=None, random_state=1):
    """Create the unfitted candidate regressors to compare.

    Parameters
    ----------
    Xdf : object, optional
        Unused. Kept (now optional) so existing ``model_building(X)`` calls
        continue to work.
    random_state : int or None, optional
        Seed passed to the estimators that use randomness (decision tree,
        random forest, gradient boosting, bagging, AdaBoost) so their scores
        are reproducible between runs. Pass ``None`` to restore unseeded
        behaviour. Defaults to ``1``.

    Returns
    -------
    dict
        Maps a display name to an unfitted scikit-learn regressor.
    """
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "Bagging Regressor": BaggingRegressor(random_state=random_state),
        "AdaBoost Regressor": AdaBoostRegressor(random_state=random_state),
        "Support Vector Regressor": SVR(),
        "K-Neighbors Regressor": KNeighborsRegressor()
        }

    return models

In [22]:
# Model Evaluation
def model_evaluation(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.
    X_train, y_train
        Training features and targets passed to ``fit``.
    X_test, y_test
        Held-out features passed to ``predict`` and the targets the
        predictions are compared against.

    Returns
    -------
    dict
        Maps each model name to its ``r2_score`` on the test split, in the
        iteration order of ``models``.
    """
    def _fit_and_score(estimator):
        # Train, then evaluate the predictions for the held-out rows.
        estimator.fit(X_train, y_train)
        return r2_score(y_test, estimator.predict(X_test))

    return {label: _fit_and_score(estimator) for label, estimator in models.items()}

In [23]:
# Fit every candidate model on the training split and show its test-set R² score.
r2scores = model_evaluation(models, X_train, X_test, y_train, y_test)
r2scores

{'Linear Regression': 0.8173690536956675,
 'Decision Tree': 0.8578198765881846,
 'Random Forest': 0.9096365295022435,
 'Gradient Boosting': 0.9084657510517422,
 'Bagging Regressor': 0.8987873538371791,
 'AdaBoost Regressor': 0.7973356980827431,
 'Support Vector Regressor': 0.7671012945149633,
 'K-Neighbors Regressor': 0.8077377472879232}