In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# DATA

**The feature set includes:**
* Cement
* Blast Furnace Slag
* Fly Ash
* Water
* Superplasticizer
* Coarse Aggregate
* Fine Aggregate
* Age

**The target set is:**
* Strength of the concrete

In [None]:
# Read the concrete data CSV into a DataFrame and preview it.
DATA_PATH = '../input/regression-with-neural-networking/concrete_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()  # last expression of the cell: renders the first five rows

In [None]:
# Column names used by the rest of the notebook: model inputs and the regression target.
features = [
    'Cement',
    'Blast Furnace Slag',
    'Fly Ash',
    'Water',
    'Superplasticizer',
    'Coarse Aggregate',
    'Fine Aggregate',
    'Age',
]
# Kept as a one-element list so that selecting with it yields a one-column frame.
target = ['Strength']

In [None]:
# Print a structural summary of the frame (columns, non-null counts, dtypes, memory usage).
df.info()

In [None]:
# Sum the per-column null counts into one total for the whole frame and report it.
missing_total = df.isnull().sum().sum()
print('Number of missing values in dataset:', missing_total)

In [None]:
# Pearson correlation (pandas' default) of every other column with 'Strength',
# stored as a one-column DataFrame indexed by column name.
# The target's own entry is dropped by label instead of a positional [:-1]
# slice, so the result does not depend on 'Strength' being the last column.
# NOTE: this cell runs before the outlier-elimination cell, so the values
# describe df as loaded, not the filtered frame.
corrMatrix = df.corr()['Strength'].drop('Strength').to_frame()

# DATA DISTRIBUTION

In [None]:
# Box plot of every feature on a 4x2 grid, to inspect spread and outliers.
# The theme is set before the figure is created so the figure itself picks it up.
sns.set(style="whitegrid")
fig, axes = plt.subplots(4, 2, figsize=(10, 8))
# Spacing between panels does not depend on the loop, so it is set once.
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# axes.flat walks the grid row by row, matching plt.subplot(4, 2, n) numbering.
for ax, column in zip(axes.flat, features):
    sns.boxplot(x=df[column], ax=ax)
plt.show()

The box plots above show a few outliers in our dataset.

## OUTLIER ELIMINATION

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column of df.
# NOTE: df is replaced by its own filtered version, so re-running this cell
# recomputes the z-scores on the already-trimmed data and may drop more rows.
abs_z = np.abs(stats.zscore(df))
rows_within_limit = (abs_z < 3).all(axis=1)
df = df[rows_within_limit]

In [None]:
# Same 4x2 box-plot grid as before, drawn from df as it stands at this point
# in the notebook (i.e. after the z-score filter cell has run).
# The theme is set before the figure is created so the figure itself picks it up.
sns.set(style="whitegrid")
fig, axes = plt.subplots(4, 2, figsize=(10, 10))
# Spacing between panels does not depend on the loop, so it is set once.
fig.subplots_adjust(hspace=0.4, wspace=0.4)
# axes.flat walks the grid row by row, matching plt.subplot(4, 2, n) numbering.
for ax, column in zip(axes.flat, features):
    sns.boxplot(x=df[column], ax=ax)
plt.show()

# EDA

In [None]:
# Bar chart of each column's correlation with 'Strength', followed by the table itself.
plt.figure(1, figsize=(20, 6))
sns.set(style="whitegrid")
column_names = corrMatrix.index
strength_corr = corrMatrix['Strength']
ax = sns.barplot(x=column_names, y=strength_corr, data=corrMatrix)
# barplot returns the Axes it drew on, so title and labels can be set in one call.
ax.set(
    title='Correlation of Strength to other features',
    xlabel='Features',
    ylabel='Correlation with Strength',
)
plt.show()
corrMatrix  # last expression of the cell: renders the correlation values

It is clear from the graph that:
* Cement
* Superplasticizer
* Age

have a positive correlation with the concrete's strength, while

* Water has a larger negative correlation with strength than the other negatively correlated features.

In [None]:
# Pairwise relationship grid for all feature columns.
# pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only add an empty extra figure);
# use pairplot's `height=` / `aspect=` arguments to size the panels instead.
# `palette` is not passed because it only colours levels of a `hue` variable,
# and no hue is used.
sns.pairplot(df[features])
plt.show()

In [None]:
# Annotated heat map of the pairwise correlations between the feature columns.
corr = df[features].corr()
heatmap_options = dict(
    cbar=True,
    square=True,
    annot=True,              # write the coefficient inside each cell
    fmt='.2f',
    annot_kws={'size': 15},
    xticklabels=features,
    yticklabels=features,
    alpha=0.7,
    cmap='coolwarm',
)
plt.figure(figsize=(16, 16))
sns.heatmap(corr, **heatmap_options)
plt.show()

None of the features are strongly correlated with one another, so PCA would offer little dimensionality reduction here and is not applied.

In [None]:
from sklearn.preprocessing import StandardScaler

# Pull the target and the feature columns out of the frame as NumPy arrays.
y = df[target].values
X_unscaled = df[features].values

# Standardise the features.
# NOTE: the scaler's statistics are computed from every row of df.
scaler = StandardScaler()
X = scaler.fit_transform(X_unscaled)

In [None]:
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import StandardScaler

# Split the *unscaled* data first, then fit the scaler on the training rows only.
# Fitting the scaler on all rows before splitting lets test-set statistics leak
# into the training features. With the same row count and random_state the
# train/test membership is the same as splitting a pre-scaled array.
X_train_raw, X_test_raw, y_train, y_test = tts(
    df.loc[:, features].values,
    df.loc[:, target].values,
    test_size=0.2,
    random_state=7,
)
scaler = StandardScaler().fit(X_train_raw)   # statistics come from the training rows only
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)        # test rows reuse the training statistics
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split.
# fit() is left as the last expression, so its return value is the cell output.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import mean_absolute_error as mae

# Predict on the test split and report the mean absolute error against y_test.
y_pred = lr.predict(X_test)
test_mae = mae(y_true=y_test, y_pred=y_pred)
test_mae  # last expression of the cell: displays the score