In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In this notebook I want to predict the compressive strength of concrete using all of the available variables.

# **Reading Data**

In [None]:
# Load the concrete data set from the Kaggle input directory into a DataFrame.
data=pd.read_csv("../input/concrete-compressive-strength-data-set/concrete_data.csv")

In [None]:
# Preview the first rows to get a feel for the columns and their values.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# List the column names to check them for naming problems.
data.columns

One column name is "fine_aggregate " with an extra trailing space, so we need to get rid of it.

In [None]:
# Strip the trailing space from the "fine_aggregate " column name.
data = data.rename(columns={"fine_aggregate ": "fine_aggregate"})

In [None]:
# Inspect the distinct values stored in the fly_ash column.
data["fly_ash"].unique()

I wanted to make sure that fly_ash is a numerical variable. As we can see, the data contains only numerical values.

In [None]:
# Show the dtype pandas inferred for each column.
data.dtypes

All of the columns have the correct type, so we don't have to worry about it.

In [None]:
# Count the missing values per column.
data.isna().sum()

Nicely, there are no null values.

In [None]:
def outlier_graph(data,column):
    """Draw a box plot of one column so that its outliers are easy to spot.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the values to plot.
    column : str
        Name of the column in ``data`` to draw.
    """
    plt.figure(figsize=(5,3))
    # Pass the values by keyword: a positional vector is deprecated in
    # seaborn 0.11 and is interpreted as ``data`` from 0.12 on, which can
    # change the orientation of the plot.
    sns.boxplot(x=data[column])
    plt.title("{} distribution".format(column))

To see outliers I made a function.

In [None]:
# One box plot per column.
for column_name in data.columns:
    outlier_graph(data, column_name)


There are some outliers that we need to get rid of.

In [None]:
def min_max_show(data,column):
    """Print the smallest and the largest value found in ``data[column]``."""
    values = data[column]
    lowest, highest = values.min(), values.max()
    print(f"min value of {column} is {lowest} \nmax value of {column} is {highest}")

It shows the min and max values of each column, which makes it easier to choose the cut-offs for removing the outliers.

In [None]:
# Print the value range of every column.
for column_name in data.columns:
    min_max_show(data, column_name)

In [None]:
# Keep only the rows that fall inside the hand-picked limits of every
# filtered column; all other rows are treated as outliers and removed.
within_limits = (
    (data["blast_furnace_slag"] < 350)
    & (data["water"] < 246)
    & (data["water"] > 122)
    & (data["superplasticizer"] < 25)
    & (data["fine_aggregate"] < 992)
    & (data["age"] < 150)
)
data = data[within_limits]

I didn't put concrete_compressive_strength since its outliers are very close to normal range. I just erased the far ones.

# **Data Visualization**

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data.corr(), annot=True, ax=ax)

fly_ash has an extremely low correlation with the strength, whereas age, cement and water are correlated with it.

In [None]:
# Rebind instead of dropping in place so the cell can be re-run safely:
# an in-place drop raises KeyError on a second run because the column is
# already gone by then.
data = data.drop(columns=["fly_ash"], errors="ignore")

Since it does not have correlation with strength I dropped it.

In [None]:
# Scatter every distinct pair of columns exactly once, coloured by the target.
# Plotting a column against itself, or drawing both (a, b) and (b, a), adds no
# information and roughly doubles the number of figures.
columns = list(data.columns)
for pos, i in enumerate(columns):
    for j in columns[pos + 1:]:
        plt.figure(figsize=(9,7))
        sns.scatterplot(x=i,y=j,hue="concrete_compressive_strength",data=data)
        plt.show()
        


In [None]:
# Columns still present in the frame at this point.
data.columns

Looking at the scatter plots, fine_aggregate, coarse_aggregate and blast_furnace_slag show a very low correlation with concrete_compressive_strength, so it is better to drop them.

In [None]:
# Drop the weakly related features in one step. Rebinding (instead of
# inplace=True) keeps the cell re-runnable: an in-place drop raises KeyError
# on a second run because the columns are already gone.
data = data.drop(
    columns=["blast_furnace_slag", "coarse_aggregate", "fine_aggregate"],
    errors="ignore",
)

In [None]:
# Correlation heatmap of the columns that are left after the drops.
correlations = data.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(correlations, annot=True)

It is better.

# Modelling

In [None]:
# Target vector and feature matrix (every column except the target).
y = data["concrete_compressive_strength"]
x = data.drop(columns=["concrete_compressive_strength"])

First I will separate the features (x) from the target (y).

In [None]:
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

Then I will split my data as test and train.

In [None]:
# Labels for the bar chart, in the same order in which the model cells
# below append their scores.
Model_Names = ["Linear Regression"]
Model_Names += ["Polynomial{}".format(degree) for degree in (2, 3, 4)]
Model_Names += ["Random Forest", "Decision Tree"]
# Filled with one R² score per model by the following cells.
Scores = list()

I will try 6 models and pick the best one. I will keep it simple and not tune hyperparameters or use k-fold cross-validation.

In [None]:
# Plain linear regression baseline, scored with R² on the test split.
lr = LinearRegression().fit(x_train, y_train)
y_head = lr.predict(x_test)
Scores.append(r2_score(y_test, y_head))

In [None]:
# Polynomial regressions of degree 2, 3 and 4, each scored with R² on the test split.
for i in range(2, 5):
    pl = PolynomialFeatures(degree=i)
    # Fit the feature expansion on the training data only ...
    x_polly = pl.fit_transform(x_train)
    # ... and re-use the fitted transformer on the test data; it must not be
    # re-fitted on the test set.
    x_polly_test = pl.transform(x_test)
    # Fresh estimator per degree so no state is carried between fits.
    lr2 = LinearRegression()
    lr2.fit(x_polly, y_train)
    y_head = lr2.predict(x_polly_test)
    Scores.append(r2_score(y_test, y_head))

In [None]:
# Random forest with 100 trees and a fixed seed, scored with R² on the test split.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
y_head = rf.fit(x_train, y_train).predict(x_test)
Scores.append(r2_score(y_test, y_head))

In [None]:
# Single decision tree, scored with R² on the test split.
# The seed is fixed (same value as the split and the forest) so that the
# score is reproducible between runs.
dt=DecisionTreeRegressor(random_state=42)
dt.fit(x_train,y_train)
y_head=dt.predict(x_test)
Scores.append(r2_score(y_test,y_head))

In [None]:
# Pair every model name with its score and compare them in a bar chart.
name_score_pairs = list(zip(Model_Names, Scores))
graph_data = pd.DataFrame(name_score_pairs, columns=['Models', 'Scores'])
plt.figure(figsize=(10, 6))
sns.barplot(x="Models", y="Scores", data=graph_data)

As we can see Random Forest gave the best score so we can use it.

In [None]:
# Refit the random forest so that y_head holds its predictions again
# (the decision tree cell overwrote them), then report the R² score.
rf2 = RandomForestRegressor(n_estimators=100, random_state=42)
y_head = rf2.fit(x_train, y_train).predict(x_test)
print(r2_score(y_test, y_head))

Its R² score is 0.8174 (81.74%), which I think is not bad.

In [None]:
# Real target values against the predictions of the random forest.
ax = sns.scatterplot(x=y_test, y=y_head)
ax.set_xlabel("Real Data")
ax.set_ylabel("Predicted Data")

Looking at this scatter plot we can see a linear relationship between the real and predicted values; it is not perfect, but the model is still usable.

In [None]:
# Overlay the distributions of the predicted and the real target values.
# sns.distplot is deprecated; histplot with a KDE overlay and density scaling
# is the replacement suggested by seaborn for the same kind of figure.
sns.histplot(y_head, kde=True, stat="density", color="C0", label="Predicted")
sns.histplot(y_test, kde=True, stat="density", color="C1", label="Real")
plt.legend()

Still it is not the best model but I think it still works.