# CONCRETE STRENGTH PREDICTION

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

### Getting our Data

In [1]:
# Read the concrete-mix dataset into the working DataFrame and display it.
df = pd.read_csv(filepath_or_buffer='../input/concrete/concrete_data.csv')
df

### Data Preprocessing

In [1]:
# Per-column flag: True if the column contains at least one missing value.
df.isna().any()

In [1]:
# Variance inflation factor (VIF) for every candidate predictor.
# NOTE(review): the design matrix is passed without an intercept column, so
# these are presumably uncentered VIFs — confirm this is intended before
# applying a VIF > 10 cut-off.
variables = df[
    [
        'Cement',
        'Blast Furnace Slag',
        'Fly Ash',
        'Water',
        'Superplasticizer',
        'Coarse Aggregate',
        'Fine Aggregate',
        'Age',
    ]
]
vif = pd.DataFrame(
    {
        'VIF': [
            variance_inflation_factor(variables.values, col_idx)
            for col_idx, _ in enumerate(variables.columns)
        ],
        'Features': variables.columns,
    }
)

vif

In [1]:
# Remove the predictors whose VIF is above 10.
# Re-running this cell without reloading df raises KeyError, because the
# columns are already gone.
df = df.drop(columns=['Cement', 'Water', 'Coarse Aggregate', 'Fine Aggregate'])
df

In [1]:
# Box plot of Blast Furnace Slag to look for outliers.
sb.boxplot(data=df, x='Blast Furnace Slag')

In [1]:
# Replace Blast Furnace Slag values in the 300–400 band (treated here as
# outliers) with the column mean.
# The mean is taken once, before anything is replaced, so every outlier gets
# the same fill value; recomputing it inside a per-value loop would make the
# fill value drift as earlier outliers are overwritten. Series.mask builds a
# new column instead of reassigning the column while it is being iterated.
slag_outlier_mask = df['Blast Furnace Slag'].between(300, 400)  # inclusive on both ends
outliers_removed = df.loc[slag_outlier_mask, 'Blast Furnace Slag'].tolist()
df['Blast Furnace Slag'] = df['Blast Furnace Slag'].mask(
    slag_outlier_mask, df['Blast Furnace Slag'].mean()
)

In [1]:
# Box plot of Blast Furnace Slag after the outlier replacement.
sb.boxplot(data=df, x='Blast Furnace Slag')

In [1]:
# Box plot of Fly Ash to look for outliers.
sb.boxplot(data=df, x='Fly Ash')

In [1]:
# Box plot of Superplasticizer to look for outliers.
sb.boxplot(data=df, x='Superplasticizer')

In [1]:
# Replace Superplasticizer values in the 25–40 band (treated here as outliers)
# with the column mean.
# The mean is taken once, before anything is replaced, so every outlier gets
# the same fill value; recomputing it inside a per-value loop would make the
# fill value drift as earlier outliers are overwritten. Series.mask builds a
# new column instead of reassigning the column while it is being iterated.
superplasticizer_outlier_mask = df['Superplasticizer'].between(25, 40)  # inclusive on both ends
outliers_removed = df.loc[superplasticizer_outlier_mask, 'Superplasticizer'].tolist()
df['Superplasticizer'] = df['Superplasticizer'].mask(
    superplasticizer_outlier_mask, df['Superplasticizer'].mean()
)

In [1]:
# Box plot of Superplasticizer after the outlier replacement.
sb.boxplot(data=df, x='Superplasticizer')

In [1]:
# Box plot of Age to look for outliers.
sb.boxplot(data=df, x='Age')

In [1]:
# Replace Age values in the 150–400 band (treated here as outliers) with the
# column mean.
# The mean is taken once, before anything is replaced, so every outlier gets
# the same fill value; recomputing it inside a per-value loop would make the
# fill value drift as earlier outliers are overwritten. Series.mask returns a
# new column, upcasting to float if needed to hold the mean.
age_outlier_mask = df['Age'].between(150, 400)  # inclusive on both ends
outliers_removed = df.loc[age_outlier_mask, 'Age'].tolist()
df['Age'] = df['Age'].mask(age_outlier_mask, df['Age'].mean())

In [1]:
# Box plot of Age after the first outlier replacement.
sb.boxplot(data=df, x='Age')

In [1]:
# Second pass on Age: values in the 60–150 band are now treated as outliers
# and replaced with the current column mean.
# The mean is taken once, before anything is replaced, so every outlier gets
# the same fill value; recomputing it inside a per-value loop would make the
# fill value drift as earlier outliers are overwritten. Series.mask returns a
# new column instead of reassigning the column while it is being iterated.
age_outlier_mask = df['Age'].between(60, 150)  # inclusive on both ends
outliers_removed = df.loc[age_outlier_mask, 'Age'].tolist()
df['Age'] = df['Age'].mask(age_outlier_mask, df['Age'].mean())

In [1]:
# Box plot of Age after the second outlier replacement.
sb.boxplot(data=df, x='Age')

In [1]:
# Summary statistics for every column, used to judge whether the features
# need to be standardized.
df.describe()

In [1]:
# Histogram of Blast Furnace Slag (20 bins) with horizontal grid lines.
ax = df['Blast Furnace Slag'].plot.hist(grid=True, bins=20, rwidth=0.9, color='#607c8e')
ax.set_title('Blast Furnace Slag Distribution')
ax.grid(axis='y', alpha=0.75)

In [1]:
# the data seems to be left skewed; we'll have to check the value of skewness

In [1]:
# Print the sample skewness of each remaining feature column.
for column in ('Blast Furnace Slag', 'Fly Ash', 'Superplasticizer', 'Age'):
    print(f'{column} Skewness:', df[column].skew())

In [1]:
# as the skewness is within the acceptable range, we do not need to apply any transformations

### Data Visualization

In [1]:
# using Pearson's correlation method
corr = df.corr(method='pearson')
sb.heatmap(corr, annot=True)

### Splitting Data for Training and Testing

In [1]:
# The last column is used as the target; every other column is a feature.
data = df.values
X = data[:, :-1]
y = data[:, -1]
# 80:20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Model

In [1]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [1]:
# Fit the regression on the training portion only.
model.fit(X=X_train, y=y_train)

### Making Predictions and Checking Accuracy

In [1]:
# Predict the target for the held-out test features.
y_pred = model.predict(X_test)

In [1]:
# Coefficient of determination (R²) of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)

### With simple Linear Regression, we achieved an R² score of about 0.47 (47%). Note that R² measures explained variance, not classification accuracy. With other algorithms, a score of around 80% can be achieved.