In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load



import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import keras
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Lift pandas' display truncation so that every row and every column is rendered.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

# Compressive Strength Concrete Problem


### Abstract: 

Concrete is the most important material in civil engineering. The concrete compressive strength (concrete strength to bear the load) is a highly nonlinear function of age and ingredients.  <br><br>

<table border="1"  cellpadding="6" bordercolor="red">
	<tbody>
        <tr>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Data Set Characteristics:&nbsp;&nbsp;</b></p></td>
		<td><p class="normal">Multivariate</p></td>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Number of Instances:</b></p></td>
		<td><p class="normal">1030</p></td>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Area:</b></p></td>
		<td><p class="normal">Physical</p></td>
        </tr>
     </tbody>
    </table>
<table border="1" cellpadding="6">
    <tbody>
        <tr>
            <td bgcolor="#DDEEFF"><p class="normal"><b>Attribute Characteristics:</b></p></td>
            <td><p class="normal">Real</p></td>
            <td bgcolor="#DDEEFF"><p class="normal"><b>Number of Attributes:</b></p></td>
            <td><p class="normal">9</p></td>
            <td bgcolor="#DDEEFF"><p class="normal"><b>Date Donated</b></p></td>
            <td><p class="normal">2007-08-03</p></td>
        </tr>
     </tbody>
    </table>
<table border="1" cellpadding="6">	
    <tbody>
    <tr>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Associated Tasks:</b></p></td>
		<td><p class="normal">Regression</p></td>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Missing Values?</b></p></td>
		<td><p class="normal">N/A</p></td>
		<td bgcolor="#DDEEFF"><p class="normal"><b>Number of Web Hits:</b></p></td>
		<td><p class="normal">231464</p></td>
	</tr>
    </tbody>
    </table>

###  Description:
| Features Name | Data Type | Measurement | Description |
| -- | -- | -- | -- |
Cement (component 1) | quantitative | kg in a m3 mixture | Input Variable
Blast Furnace Slag (component 2) | quantitative | kg in a m3 mixture | Input Variable
Fly Ash (component 3) | quantitative | kg in a m3 mixture | Input Variable
Water (component 4) | quantitative | kg in a m3 mixture | Input Variable
Superplasticizer (component 5) | quantitative | kg in a m3 mixture | Input Variable
Coarse Aggregate (component 6) | quantitative | kg in a m3 mixture | Input Variable
Fine Aggregate (component 7) | quantitative | kg in a m3 mixture | Input Variable
Age | quantitative | Day (1~365) | Input Variable
Concrete compressive strength | quantitative | MPa | Output Variable

### WORKFLOW :
- Load Data
- Check Missing Values ( If Exist ; Fill each record with mean of its feature )
- Standardize the input variables. **Hint**: center the data (subtract the mean, then divide by the standard deviation).
- Split into 50% Training(Samples,Labels) , 30% Test(Samples,Labels) and 20% Validation Data(Samples,Labels).
- Model : input Layer (No. of features ), 3 hidden layers including 10,8,6 unit & Output Layer with activation function relu/tanh (check by experiment).
- Compilation Step (Note: it's a regression problem; select the loss and metrics accordingly)
- Train the Model with Epochs (100) and validate it
- If the model gets overfit tune your model by changing the units , No. of layers , activation function , epochs , add dropout layer or add Regularizer according to the need .
- Evaluation Step
- Prediction


# Load Data
[Click Here to Download DataSet](https://github.com/ramsha275/ML_Datasets/blob/main/compresive_strength_concrete.csv)

In [None]:
# Location of the CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/compressive-strength-of-concrete/compresive_strength_concrete.csv'

df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows to check the column labels and value formats.
df.head()

In [None]:
# Summarize column dtypes and non-null counts.
df.info()

- There are no missing values 
- There are no categorical variables

In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

- 1030 records
- 8 features
- 1 output

# Checking Missing Values

In [None]:
# Count the missing entries in each column (every count is zero: no missing values).
df.isna().sum()

In [None]:
# Print the frequency table of every column, separated by a ruler line,
# to spot suspicious repeated values such as zeros.
for _label, column in df.items():
    print(column.value_counts())
    print('#' * 25)

- Superplasticizer
- Fly Ash
- Blast Furnace Slag

These 3 columns have zeros, which should be replaced by mean

In [None]:
# Show the exact column labels; they are used verbatim when selecting columns.
df.columns

In [None]:
# Columns whose 0.0 entries are treated as missing and overwritten with the column mean.
missing_cols = [
    'Superplasticizer (component 5)(kg in a m^3 mixture)',
    'Fly Ash (component 3)(kg in a m^3 mixture)',
    'Blast Furnace Slag (component 2)(kg in a m^3 mixture)',
]

# NOTE(review): the mean is taken over the whole column, zeros included, and over the
# full dataset before any train/validation/test split - confirm both are intended.
# NOTE(review): a zero may simply mean the ingredient was not used in that mix - TODO confirm.
for col in missing_cols:
    column_mean = df.loc[:, col].mean()
    df.loc[:, col] = df.loc[:, col].replace(0.0, column_mean)

In [None]:
# Re-print the frequency table of every column to verify the zero replacement.
for _label, column in df.items():
    print(column.value_counts())
    print('#' * 25)

# Univariate Distribution Plots

- These plots will give an idea about
1. Skewness of data
2. Outliers

In [None]:
# Histogram plus rug (one tick per observation) for every column except the last,
# which holds the target (Concrete compressive strength).
fig = plt.figure(figsize=(20, 25))

sns.set_style('whitegrid')

for idx, column in enumerate(df.columns[:-1]):
    ax = fig.add_subplot(3, 3, idx + 1)
    values = df.iloc[:, idx].dropna()
    # histplot + rugplot replace the deprecated sns.distplot(..., rug=True, kde=False).
    # The former kde_kws argument had no effect because kde was disabled.
    sns.histplot(x=values, color='b', ax=ax)
    sns.rugplot(x=values, color='b', ax=ax)
    ax.set_xlabel(column)

plt.tight_layout()

- Box plots will give more information about outliers

In [None]:
# One box plot per column, skipping the last column (the target).
fig = plt.figure(figsize=(20, 25))

sns.set(style='darkgrid')

n_features = df.shape[1] - 1
for idx in range(n_features):
    ax = fig.add_subplot(3, 3, idx + 1)
    feature_values = df.iloc[:, idx].dropna()
    sns.boxplot(y=feature_values, ax=ax)
plt.tight_layout()

- There are some outliers in Age
- Few outliers are in Superplasticizer, Water

# Bivariate Analysis

- Bivariate analysis (a plot of each feature against the prediction target) gives a better understanding of how each feature is related to the target (Concrete Compressive Strength)

In [None]:
# Scatter plot of every column except the last against the target column.
fig = plt.figure(figsize=(20, 25))

sns.set(style='whitegrid')

target_col = 'Concrete compressive strength(MPa, megapascals) '
for idx in range(len(df.columns) - 1):
    ax = fig.add_subplot(3, 3, idx + 1)
    # x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally. Both come from the same rows of df, so they stay aligned
    # (dropping NaNs from x alone could leave x and y with different lengths).
    sns.scatterplot(x=df.iloc[:, idx], y=df.loc[:, target_col], ax=ax)
plt.tight_layout()
plt.show()


# Taking one final look on outliers - Regression Plots

In [None]:
# Scatter plot with a fitted regression line for every column except the last,
# plotted against the target column.
fig = plt.figure(figsize=(20, 30))

target_col = 'Concrete compressive strength(MPa, megapascals) '
for idx in range(len(df.columns) - 1):
    ax = fig.add_subplot(3, 3, idx + 1)
    # Keyword x/y: seaborn >= 0.12 rejects positional data vectors.
    sns.regplot(x=df.iloc[:, idx], y=df.loc[:, target_col], ax=ax)
plt.tight_layout()

- It is not mentioned in the assignment to take care of outliers, so I will leave them here for now

# Splitting Data

- Split into 50% Training(Samples,Labels) , 30% Test(Samples,Labels) and 20% Validation Data(Samples,Labels)

In [None]:
# shuffling data
# Shuffle the rows with a fixed seed so that the positional split is the same
# on every Restart & Run All. Only this shuffle is seeded; model weight
# initialization is not.
SEED = 42
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# total data = 1030

target_col = 'Concrete compressive strength(MPa, megapascals) '

# columns= is used because the positional axis argument (df.drop(label, 1))
# was removed in pandas 2.0.
data = df.drop(columns=target_col)
targets = df.loc[:, target_col]

In [None]:
# (rows, columns) of the feature table after removing the target column.
data.shape

In [None]:
# Length of the target vector.
targets.shape

In [None]:
# Positional 50% / 20% / 30% split into training, validation and test sets.
# Boundaries are derived from the row count with integer arithmetic
# (515 and 721 for 1030 rows) instead of being hard-coded.
n_samples = len(data)
train_end = n_samples * 5 // 10
val_end = n_samples * 7 // 10

# .copy() gives each split its own data: the feature splits are rescaled
# later and must not be views onto `data`.
train_data = data.iloc[:train_end].copy()
train_targets = targets.iloc[:train_end].copy()

val_data = data.iloc[train_end:val_end].copy()
val_targets = targets.iloc[train_end:val_end].copy()

test_data = data.iloc[val_end:].copy()
test_targets = targets.iloc[val_end:].copy()

In [None]:
# Print the feature and target shapes of each split, with a ruler line between splits.
splits = (
    (train_data, train_targets),
    (val_data, val_targets),
    (test_data, test_targets),
)
for position, (features, labels) in enumerate(splits):
    if position > 0:
        print('#'*25)
    print(features.shape)
    print(labels.shape)

# Standardizing input variables

In [None]:
# Column-wise statistics are taken from the training split only and then
# applied unchanged to the validation and test splits.
mean = train_data.mean()
std = train_data.std()

# Subtract the training mean and divide by the training standard deviation.
train_data, val_data, test_data = (
    (split - mean) / std for split in (train_data, val_data, test_data)
)

In [None]:
# TODO: plot the standardized features to inspect their distributions.
# NOTE: standardization only shifts and rescales each column; it does not change
# the shape of its distribution, so a skewed feature stays skewed.
train_data.head(2)

# Building and Compiling model

In [None]:
from keras import models
from keras import layers

def build_model(hidden_units=(10, 8, 6), activation='relu', n_features=None):
    """Build and compile a fully connected regression network.

    Called without arguments, it returns the baseline architecture:
    hidden Dense layers of 10, 8 and 6 ReLU units followed by one output unit.

    Parameters
    ----------
    hidden_units : sequence of int, optional
        Width of each hidden Dense layer, in order. Must not be empty.
    activation : str, optional
        Activation used by every hidden layer (e.g. 'relu' or 'tanh').
    n_features : int, optional
        Number of input features. When omitted it is read from the
        module-level ``train_data`` (``train_data.shape[1]``).

    Returns
    -------
    A compiled Sequential model: Dense(1) output without activation,
    'Adam' optimizer, 'mse' loss and 'mae' metric.

    Raises
    ------
    ValueError
        If ``hidden_units`` is empty.
    """
    widths = list(hidden_units)
    if not widths:
        raise ValueError('hidden_units must contain at least one layer width')

    if n_features is None:
        # Fall back to the global training frame, as the original definition did.
        n_features = train_data.shape[1]

    model = models.Sequential()

    # Only the first layer declares the input shape.
    model.add(layers.Dense(widths[0], activation=activation, input_shape=(n_features,)))

    for units in widths[1:]:
        model.add(layers.Dense(units, activation=activation))

    # Single output unit without activation for the regression target.
    model.add(layers.Dense(1))

    model.compile(optimizer='Adam', loss='mse', metrics=['mae'])

    return model
    

# Training the model

In [None]:
# Train the model for 100 epochs, evaluating on the validation split after each epoch.
model = build_model()

history = model.fit(
    x=train_data,
    y=train_targets,
    validation_data=(val_data, val_targets),
    epochs=100,
    verbose=1,
)

# Learning Curves

In [None]:
# Per-epoch training and validation MAE recorded by model.fit.
mae = history.history['mae']
val_mae = history.history['val_mae']
epochs = range(1, len(mae) + 1)

fig, ax = plt.subplots(figsize=(20, 15))

ax.plot(epochs, mae, 'b--', label='Training mae', linewidth=12)
ax.plot(epochs, val_mae, 'r', label='Validation mae', linewidth=5)

ax.set_xlabel('Epochs')
ax.set_ylabel('MAE')
ax.set_title('Mean Absolute Error')
ax.legend()
plt.show()

- After around 80 epochs, overfitting starts 

# Adding Regularizer

In [None]:
from keras import layers, models, regularizers

def build_model():
    """Build and compile the 10-8-6 ReLU network with L2 weight regularization.

    Every hidden Dense layer carries its own L2(0.001) kernel regularizer.
    The input width is read from the module-level ``train_data``.

    Returns
    -------
    A compiled Sequential model: Dense(1) output without activation,
    'Adam' optimizer, 'mse' loss and 'mae' metric.
    """
    n_inputs = train_data.shape[1]

    net = models.Sequential()

    # First hidden layer also declares the input shape.
    net.add(layers.Dense(
        10,
        activation='relu',
        input_shape=(n_inputs,),
        kernel_regularizer=regularizers.L2(0.001),
    ))

    for width in (8, 6):
        net.add(layers.Dense(
            width,
            activation='relu',
            kernel_regularizer=regularizers.L2(0.001),
        ))

    net.add(layers.Dense(1))

    net.compile(optimizer='Adam', loss='mse', metrics=['mae'])

    return net

In [None]:
# Train the L2-regularized model for 100 epochs with per-epoch validation.
model_1 = build_model()

history_1 = model_1.fit(
    x=train_data,
    y=train_targets,
    validation_data=(val_data, val_targets),
    epochs=100,
    verbose=1,
)

In [None]:
# Per-epoch training and validation MAE of the regularized run.
mae = history_1.history['mae']
val_mae = history_1.history['val_mae']
epochs = range(1, len(mae) + 1)

fig, ax = plt.subplots(figsize=(20, 15))

ax.plot(epochs, mae, 'b--', label='Training mae', linewidth=12)
ax.plot(epochs, val_mae, 'r', label='Validation mae', linewidth=5)

ax.set(xlabel='Epochs', ylabel='MAE', title='Mean Absolute Error')
ax.legend()
plt.show()

# Increasing Epochs (with Regularizer)

In [None]:
from keras import layers, models, regularizers

def build_model():
    """Build and compile the L2-regularized 10-8-6 ReLU regression network.

    Same architecture as the previous regularized definition; it is redefined
    here for the longer training run. The input width is read from the
    module-level ``train_data``.

    Returns
    -------
    A compiled Sequential model: Dense(1) output without activation,
    'Adam' optimizer, 'mse' loss and 'mae' metric.
    """
    input_shape = (train_data.shape[1],)

    # Layers are listed in the order they are stacked.
    stack = [
        layers.Dense(10, activation='relu', input_shape=input_shape,
                     kernel_regularizer=regularizers.L2(0.001)),
        layers.Dense(8, activation='relu',
                     kernel_regularizer=regularizers.L2(0.001)),
        layers.Dense(6, activation='relu',
                     kernel_regularizer=regularizers.L2(0.001)),
        layers.Dense(1),
    ]

    model = models.Sequential(stack)
    model.compile(optimizer='Adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# Train the L2-regularized model for 500 epochs with per-epoch validation.
model_2 = build_model()

history_2 = model_2.fit(
    x=train_data,
    y=train_targets,
    validation_data=(val_data, val_targets),
    epochs=500,
    verbose=1,
)

In [None]:
# Per-epoch training and validation MAE of the 500-epoch regularized run.
mae = history_2.history['mae']
val_mae = history_2.history['val_mae']
epochs = range(1, len(mae) + 1)

fig, ax = plt.subplots(figsize=(20, 15))

ax.plot(epochs, mae, 'b--', label='Training mae', linewidth=12)
ax.plot(epochs, val_mae, 'r', label='Validation mae', linewidth=5)

ax.set(xlabel='Epochs', ylabel='MAE', title='Mean Absolute Error')
ax.legend()
plt.show()

# Adding Dropout


In [None]:
from keras import layers, models, regularizers

def build_model():
    """Build and compile the 10-8-6 ReLU network with dropout after each hidden layer.

    A Dropout(0.3) layer follows every hidden Dense layer; no weight
    regularizer is used. The input width is read from the module-level
    ``train_data``.

    Returns
    -------
    A compiled Sequential model: Dense(1) output without activation,
    'Adam' optimizer, 'mse' loss and 'mae' metric.
    """
    drop_rate = 0.3
    n_inputs = train_data.shape[1]

    net = models.Sequential()

    # First hidden layer also declares the input shape.
    net.add(layers.Dense(10, activation='relu', input_shape=(n_inputs,)))
    net.add(layers.Dropout(drop_rate))

    for width in (8, 6):
        net.add(layers.Dense(width, activation='relu'))
        net.add(layers.Dropout(drop_rate))

    net.add(layers.Dense(1))

    net.compile(optimizer='Adam', loss='mse', metrics=['mae'])

    return net

In [None]:
# Train the dropout model for 100 epochs with per-epoch validation.
model_3 = build_model()

history_3 = model_3.fit(
    x=train_data,
    y=train_targets,
    validation_data=(val_data, val_targets),
    epochs=100,
    verbose=1,
)

In [None]:
# Per-epoch training and validation MAE of the dropout run.
mae = history_3.history['mae']
val_mae = history_3.history['val_mae']
epochs = range(1, len(mae) + 1)

fig, ax = plt.subplots(figsize=(20, 15))

ax.plot(epochs, mae, 'b--', label='Training mae', linewidth=12)
ax.plot(epochs, val_mae, 'r', label='Validation mae', linewidth=5)

ax.set(xlabel='Epochs', ylabel='MAE', title='Mean Absolute Error')
ax.legend()
plt.show()

# Changing Layer Architecture

In [None]:
from keras import layers, models, regularizers

def build_model():
    """Build and compile the alternative 16-8-3 ReLU regression network.

    No dropout and no weight regularizer are used. The input width is read
    from the module-level ``train_data``.

    Returns
    -------
    A compiled Sequential model: Dense(1) output without activation,
    'Adam' optimizer, 'mse' loss and 'mae' metric.
    """
    input_shape = (train_data.shape[1],)

    # Layers are listed in the order they are stacked.
    stack = [
        layers.Dense(16, activation='relu', input_shape=input_shape),
        layers.Dense(8, activation='relu'),
        layers.Dense(3, activation='relu'),
        layers.Dense(1),
    ]

    model = models.Sequential(stack)
    model.compile(optimizer='Adam', loss='mse', metrics=['mae'])
    return model

In [None]:
# Train the 16-8-3 model for 100 epochs with per-epoch validation.
model_4 = build_model()

history_4 = model_4.fit(
    x=train_data,
    y=train_targets,
    validation_data=(val_data, val_targets),
    epochs=100,
    verbose=1,
)

In [None]:
# Per-epoch training and validation MAE of the 16-8-3 run.
mae = history_4.history['mae']
val_mae = history_4.history['val_mae']
epochs = range(1, len(mae) + 1)

fig, ax = plt.subplots(figsize=(20, 15))

ax.plot(epochs, mae, 'b--', label='Training mae', linewidth=12)
ax.plot(epochs, val_mae, 'r', label='Validation mae', linewidth=5)

ax.set(xlabel='Epochs', ylabel='MAE', title='Mean Absolute Error')
ax.legend()
plt.show()

# Prediction

## I'm choosing model_2 (500 epochs, regularizer) for predictions

In [None]:
# Run the chosen model (model_2) on the standardized test features.
predictions = model_2.predict(test_data)

In [None]:
# Inspect the shape of the prediction array before flattening it.
predictions.shape

In [None]:
# Flatten to one dimension. -1 lets NumPy infer the length, so this keeps
# working if the test split size changes (it was hard-coded to 309).
predictions = predictions.reshape(-1)

In [None]:
# Start the results table from the true test targets (keeps their index).
output = pd.DataFrame({'Actual': test_targets})

In [None]:
# Attach the model predictions as a new column next to the actual values.
output['Predicted'] = predictions

In [None]:
# Move the current index into a regular column and renumber the rows from 0.
output = output.reset_index()

In [None]:
# Absolute difference between predicted and actual strength, row by row.
residual = output['Predicted'] - output['Actual']
output['Absolute Error'] = np.abs(residual)

In [None]:
# Display the full results table (display.max_rows was set to None earlier, so no rows are truncated).
output

### mean, max error values

- mean error: 5.6
- median error: 4.5
- min error: 0.02
- max error: 26.3

In [None]:
# Summary statistics of the absolute error column.
output['Absolute Error'].describe()