In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### Data Characteristics:

The actual concrete compressive strength (MPa) for a given mixture at a
specific age (days) was determined in the laboratory. The data is in raw form (not scaled).

Summary Statistics:

Number of instances (observations): 1030
Number of Attributes: 9
Attribute breakdown: 8 quantitative input variables, and 1 quantitative output variable
Missing Attribute Values: None

Variable Information:

Given is the variable name, variable type, the measurement unit and a brief description.
The concrete compressive strength is the regression problem. The order of this listing
corresponds to the order of numerals along the rows of the database.

Name -- Data Type -- Measurement -- Description

Cement (component 1) -- quantitative -- kg in a m3 mixture -- Input Variable

Blast Furnace Slag (component 2) -- quantitative -- kg in a m3 mixture -- Input Variable

Fly Ash (component 3) -- quantitative -- kg in a m3 mixture -- Input Variable

Water (component 4) -- quantitative -- kg in a m3 mixture -- Input Variable

Superplasticizer (component 5) -- quantitative -- kg in a m3 mixture -- Input Variable

Coarse Aggregate (component 6) -- quantitative -- kg in a m3 mixture -- Input Variable

Fine Aggregate (component 7) -- quantitative -- kg in a m3 mixture -- Input Variable

Age -- quantitative -- Day (1~365) -- Input Variable

Concrete compressive strength -- quantitative -- MPa -- Output Variable

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')
from sklearn.linear_model import SGDRegressor,GammaRegressor,Lasso,GammaRegressor,ElasticNet,Ridge
from sklearn.linear_model import RANSACRegressor,HuberRegressor, BayesianRidge,LinearRegression
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor  
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor # Decision Tree Regression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline # Streaming pipelines
from sklearn.model_selection import learning_curve, validation_curve, GridSearchCV # Model evaluation
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the concrete compressive strength dataset from the Kaggle input directory.
data = pd.read_csv(
    '/kaggle/input/concrete-compressive-strength/Concrete Compressive Strength.csv'
)

In [None]:
# Show the loaded dataset as the cell output.
data

### EXPLORATORY DATA ANALYSIS

In [None]:
# List the column labels of the dataset.
data.columns

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

#### All the variables are numeric

In [None]:
# Descriptive statistics for every column.
data.describe()

In [None]:
# Count the missing values in each column.
data.isna().sum()

#### No missing values are present

### UNIVARIATE ANALYSIS

In [None]:
# Column labels as a plain Python list.
col = list(data.columns)
col

In [None]:
# Histogram of every column, to inspect each variable's distribution.
data.hist(color='red', figsize=(15, 10))
plt.show()

In [None]:

# One box plot per column, laid out on a 4x3 grid of subplots.
plt.figure(figsize=(15, 20))
for i, col in enumerate(data.columns, start=1):
    plt.subplot(4, 3, i)
    sns.boxplot(x=data[col], data=data)

#### We found some outliers here, but we did not remove them, to avoid losing data

### BIVARIATE ANALYSIS

In [None]:
# Scatter plot of the compressive strength (x axis) against every column
# (y axis), laid out on a 4x3 grid of subplots.
plt.figure(figsize=(18, 18))
for i, col in enumerate(data.columns, start=1):
    plt.subplot(4, 3, i)
    sns.scatterplot(
        data=data,
        x='Concrete compressive strength(MPa, megapascals) ',
        y=col,
    )

#### we can see that compressive strength is highly correlated with cement

In [None]:
# Annotated heatmap of the pairwise correlation matrix of all columns.
plt.figure(figsize=(10, 10))
sns.heatmap(data.corr(), annot=True, cmap='PuBuGn_r', linewidths=1)

In [None]:
# Correlation of every column with the compressive strength, sorted ascending.
correlation = (
    data.corr()
    .loc[:, 'Concrete compressive strength(MPa, megapascals) ']
    .sort_values()
)

In [None]:
# Horizontal bar chart of the sorted correlations with the target.
correlation.plot.barh(color='green')

#### We can see that cement, superplasticizer and age are positively correlated with compressive strength, while water and fine aggregate are negatively correlated with it.

## MODEL SELECTION

In [None]:
# Features: every column except the compressive strength.
X = data.drop('Concrete compressive strength(MPa, megapascals) ', axis=1)
# Target: the compressive strength, kept as a single-column DataFrame.
Y = data.loc[:, ['Concrete compressive strength(MPa, megapascals) ']]

In [None]:
# Standardise every feature and keep the original column labels.
# NOTE(review): the scaler is fitted on all rows of X, before the train/test
# split, so statistics of the held-out rows leak into the training features.
# Consider fitting it on the training rows only (e.g. inside a Pipeline).
sc = StandardScaler()
X_scaled = pd.DataFrame(sc.fit_transform(X), columns=X.columns)

In [None]:
# Hold out 30% of the rows for testing; the split is seeded with 0.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    Y,
    test_size=0.30,
    random_state=0,
)

In [None]:
# Candidate regressors to compare, all with default hyper-parameters.
# The estimators whose fit involves randomness are given a fixed
# `random_state` so that the scores quoted in this notebook can be reproduced
# after "Restart & Run All". The seed (0) matches the one used for the
# train/test split.
lr=LinearRegression()
sgd=SGDRegressor(random_state=0)
lasso=Lasso()
ridge=Ridge()
rf=RandomForestRegressor(random_state=0)
dt=DecisionTreeRegressor(random_state=0)
gboost=GradientBoostingRegressor(random_state=0)
bagging=BaggingRegressor(random_state=0)
adboost=AdaBoostRegressor(random_state=0)
knn=KNeighborsRegressor()
etr=ExtraTreesRegressor(random_state=0)
gamma=GammaRegressor()

In [None]:
# Estimators that take part in the comparison, in evaluation order.
algo = [
    lr, sgd, lasso, ridge,
    rf, dt, gboost, bagging,
    adboost, knn, etr,
]

In [None]:
# Fit every candidate on the training split and record its score on the
# training and test splits, keeping the three lists aligned by position.
model, accuracy_train, accuracy_test = [], [], []
for estimator in algo:
    estimator.fit(x_train, y_train)
    accuracy_train.append(estimator.score(x_train, y_train))
    accuracy_test.append(estimator.score(x_test, y_test))
    model.append(estimator)
    

In [None]:
# Collect each fitted estimator with its train/test score into one table.
mod=pd.DataFrame([model,accuracy_train,accuracy_test]).T
mod.columns=['model','score_train','score_test']
# Transposing a list of rows leaves every column as dtype `object`; cast the
# score columns back to float so they can be sorted, rounded and aggregated
# as numbers.
mod=mod.astype({'score_train':float,'score_test':float})
mod

#### We can see that the extra trees regressor has the highest test score (R² = 90.7%), so we choose it for our final model building

### MODEL BUILDING

In [None]:
# ExtraTreesRegressor used for the split-seed sweep. The estimator itself is
# seeded so that score differences between iterations come only from the
# train/test split and the sweep gives the same numbers on every run.
etr1=ExtraTreesRegressor(random_state=0)

In [None]:
# Refit etr1 on a fresh 70/30 split for every split seed from 1 to 199 and
# record the test score obtained with each seed.
# The module-level x_train/x_test/y_train/y_test are overwritten on every pass.
# NOTE(review): picking the split seed with the best test score makes the
# reported score optimistic; cross-validation would be a fairer estimate.
rs, score = [], []
for seed in range(1, 200):
    x_train, x_test, y_train, y_test = train_test_split(
        X_scaled, Y, test_size=0.30, random_state=seed
    )
    etr1.fit(x_train, y_train)
    rs.append(seed)
    score.append(etr1.score(x_test, y_test))

In [None]:
# Test score of etr1 for each train/test split seed tried in the sweep.
plt.figure(figsize=(20,6))
plt.plot(rs,score)
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.xlabel('train_test_split random_state')
plt.ylabel('test score')
plt.title('ExtraTreesRegressor test score by split seed')
# Render the figure explicitly so the cell does not print an object repr.
plt.show()

In [None]:
# Print every split seed next to the test score it produced.
for seed, seed_score in zip(rs, score):
    print(seed, seed_score)

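#### We can see that at random state = 77 we get a test score (R²) of 94.39%. Note that choosing the split seed by its test score gives an optimistic estimate of the model's performance.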

In [None]:
# Final 70/30 split, using the split seed chosen above (77).
x_train,x_test,y_train,y_test=train_test_split(X_scaled,Y,test_size=.30,random_state=77)
# Final model. It is seeded so that the metrics and feature importances
# computed from it are identical on every run of the notebook.
etr2=ExtraTreesRegressor(random_state=0)

In [None]:
# Fit the final model on the training split and show its training score.
etr2.fit(x_train, y_train).score(x_train, y_train)

In [None]:
# Score of the final model on the held-out test split.
etr2.score(X=x_test, y=y_test)

In [None]:
# Predictions of the final model for the test split.
y_test_pred = etr2.predict(X=x_test)

In [None]:
# Copy of the test targets with the model predictions added as column 'pred'.
y_test1 = y_test.assign(pred=y_test_pred)

In [None]:
# Pearson correlation between the actual and the predicted strength.
y_test1.corr(method='pearson')

#### We can see here that the correlation between the actual and predicted values is 97.17% (this is a Pearson correlation, not an accuracy)

In [None]:
from sklearn.metrics import  mean_squared_error,r2_score

In [None]:
# Mean squared error of the test predictions.
mean_squared_error(
    y_true=y_test1['Concrete compressive strength(MPa, megapascals) '],
    y_pred=y_test1['pred'],
)

In [None]:
# Root mean squared error: the square root of the test-set MSE.
rsme = np.sqrt(
    mean_squared_error(
        y_true=y_test1['Concrete compressive strength(MPa, megapascals) '],
        y_pred=y_test1['pred'],
    )
)
rsme

#### We can see that the root mean square error is only 4.15 MPa, which shows that our model is very good

In [None]:
# R^2 score of the test predictions.
r2_score(
    y_true=y_test1['Concrete compressive strength(MPa, megapascals) '],
    y_pred=y_test1['pred'],
)

In [None]:
# Horizontal bar chart of the final model's importance for each feature.
plt.barh(y=X.columns, width=etr2.feature_importances_)

#### We can also see that age and cement are the most important features