In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
%matplotlib inline

In [3]:
# Load the concrete-strength CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='../input/predict-concrete-strength/ConcreteStrengthData.csv',
)

In [4]:
# Print a summary of the frame: column names, dtypes and non-null counts.
data.info()

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
data.describe()

In [6]:
# (rows, columns) of the loaded dataset.
data.shape

In [7]:
# Number of missing values in each column.
data.isna().sum()

In [8]:
# Pairwise correlation matrix of all columns (pandas default method: Pearson).
data.corr()

In [9]:
# Annotated heatmap of the correlation matrix of every column in the dataset.
corr_matrix = data.corr()
plt.figure(figsize=(8, 4))
sns.heatmap(corr_matrix, annot=True)

In [10]:
# Column labels of the dataset.
data.columns

In [11]:
# Pairwise scatter plots of every column, with a filled KDE curve on the diagonal.
# `fill` is the current name of the KDE option previously called `shade`,
# which seaborn has deprecated.
fig = sns.pairplot(
    data,
    height=2.0,
    aspect=1.8,
    plot_kws={'edgecolor': 'k', 'linewidth': 1.0},
    diag_kind='kde',
    diag_kws={'fill': True},
)


In [12]:
# Distribution (histogram + KDE) of every column in the dataset, one subplot per column.
# The grid is three plots wide and gets as many rows as the column count requires,
# so the cell does not fail if the dataset has more than nine columns.
n_cols = 3
n_rows = -(-len(data.columns) // n_cols)  # ceiling division without importing math
plt.figure(figsize=(12, 12), facecolor='white')
for plotnumber, column in enumerate(data.columns, start=1):
    plt.subplot(n_rows, n_cols, plotnumber)
    # histplot(kde=True, stat='density') is the replacement for the deprecated sns.distplot.
    sns.histplot(data[column], kde=True, stat='density')
    plt.xlabel(column, fontsize=10)
plt.show()

In [14]:
# Feature matrix: every column except the 'Strength' target.
x = data.drop(columns=['Strength'])

In [15]:
# Display the feature frame (all columns of `data` except 'Strength').
x

In [16]:
# Target vector: the 'Strength' column.
y = data.loc[:, 'Strength']

In [17]:
# Display the target series ('Strength').
y

In [18]:
# Replace every feature by log(1 + value) (presumably to tame skewed distributions).
# The frame is rebuilt from `data` on each run instead of mutating `x` in place, so
# re-running this cell cannot apply the transform twice. np.log1p does the +1 and the
# log in one vectorized call over the whole frame.
x = np.log1p(data.drop('Strength', axis=1))

In [19]:
# Display the feature frame again, now holding the log(1 + value) transformed features.
x

In [20]:
# Distribution (histogram + KDE) of every feature after the log transform.
# The grid is three plots wide and gets as many rows as the feature count requires,
# so the cell does not fail if there are more than nine features.
n_cols = 3
n_rows = -(-len(x.columns) // n_cols)  # ceiling division without importing math
plt.figure(figsize=(12, 12), facecolor='white')
for plotnumber, column in enumerate(x.columns, start=1):
    plt.subplot(n_rows, n_cols, plotnumber)
    # histplot(kde=True, stat='density') is the replacement for the deprecated sns.distplot.
    sns.histplot(x[column], kde=True, stat='density')
    plt.xlabel(column, fontsize=13)
plt.show()

In [21]:
# Box plot of every feature, to spot outliers.
# The grid is three plots wide and gets as many rows as the feature count requires.
n_cols = 3
n_rows = -(-len(x.columns) // n_cols)  # ceiling division without importing math
plt.figure(figsize=(12, 12), facecolor='white')
for plotnumber, column in enumerate(x.columns, start=1):
    plt.subplot(n_rows, n_cols, plotnumber)
    # Pass the values as x= explicitly: a positional Series is read as `x` by older
    # seaborn releases and as `data` by newer ones, which changes the box orientation.
    sns.boxplot(x=x[column])
    plt.xlabel(column)
plt.show()

In [22]:
# Scatter plot of every feature against the target `y`.
# The grid is three plots wide and gets as many rows as the feature count requires.
n_cols = 3
n_rows = -(-len(x.columns) // n_cols)  # ceiling division without importing math
plt.figure(figsize=(12, 12), facecolor='white')
for plotnumber, column in enumerate(x.columns, start=1):
    plt.subplot(n_rows, n_cols, plotnumber)
    # x and y must be keywords: recent seaborn releases reject them as positional
    # arguments with a TypeError.
    sns.scatterplot(x=x[column], y=y)
    plt.xlabel(column)
plt.show()

In [23]:
# Annotated heatmap of the correlations between the features only (target excluded).
feature_corr = x.corr()
plt.figure(figsize=(10, 6), facecolor='white')
sns.heatmap(feature_corr, annot=True)

In [24]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
splits = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = splits

In [25]:
# Display the training features (unscaled).
x_train

In [26]:
# Display the test features (unscaled).
x_test

In [27]:
# Display the training targets.
y_train

In [28]:
# Display the test targets.
y_test

In [31]:
# Standardizer for the features; it is fitted on the training split in the next cell.
scalar = StandardScaler()

In [32]:
# Learn the per-feature scaling statistics from the training split only.
# fit() returns the scaler itself, not data, so the result is not bound to X_train;
# the trailing semicolon keeps the notebook from echoing the scaler's repr.
scalar.fit(x_train);

In [33]:
# Scale the training features with the statistics learned by the scaler's fit on x_train.
X_train = scalar.transform(x_train)

In [34]:
# Scale the test features with the same scaler; it is only applied here, not refit,
# so the scaling statistics come from the training split alone.
X_test = scalar.transform(x_test)

In [35]:
# Scaled training features as a DataFrame with the original column names.
# The row index of x_train is kept so that X_train stays aligned with y_train.
# Building from scalar.transform(x_train) rather than the previous X_train makes the
# cell give the same result no matter how often it is re-run.
X_train = pd.DataFrame(
    scalar.transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)

In [36]:
# Display the scaled training features.
X_train

In [37]:
# Scaled test features as a DataFrame with the original column names.
# The row index of x_test is kept so that X_test stays aligned with y_test.
# Building from scalar.transform(x_test) rather than the previous X_test makes the
# cell give the same result no matter how often it is re-run.
X_test = pd.DataFrame(
    scalar.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

In [38]:
# Display the scaled test features.
X_test

In [39]:
# Sanity check on the scaling: print the mean of each scaled training column
# (expected to be approximately zero after standardization).
print(X_train.mean())

In [40]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR, LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor

In [41]:
# Candidate regressors, all with default hyperparameters.
# Keys are left-padded display names so that printed results line up in a column.
# Every estimator that draws random numbers gets a fixed random_state (the same seed
# as the train/test split) so that scores are reproducible from run to run.
# LinearRegression, Ridge (default solver) and SVR are left without a seed.
models = {
    "                     Linear Regression" : LinearRegression(),
    "                 L2 (Ridge) Regression" : Ridge(),
    "Support Vector Machine (Linear Kernel)" : LinearSVR(random_state=42),
    "   Support Vector Machine (RBF Kernel)" : SVR(),
    "                         Decision Tree" : DecisionTreeRegressor(random_state=42),
    "                        Neural Network" : MLPRegressor(random_state=42),
    "                         Random Forest" : RandomForestRegressor(random_state=42),
    "                     Gradient Boosting" : GradientBoostingRegressor(random_state=42),
    "                              AdaBoost" : AdaBoostRegressor(random_state=42)
}

In [42]:
# Fit each candidate regressor on the scaled training data and report progress.
for label in models:
    estimator = models[label]
    estimator.fit(X_train, y_train)
    print('model trained with. {}'.format(label))

In [43]:
from sklearn.metrics import r2_score

In [44]:
# Test-set R^2 for every fitted model.
# Each score is printed next to its model name so it can be attributed; the names
# are already padded to equal width, so the output lines up in a column.
for name, model in models.items():
    y_pred = model.predict(X_test)
    score = r2_score(y_test, y_pred)
    print('{} R^2 score: {:.5f}'.format(name, score))

In [45]:
# Baseline for the tuning experiment: gradient boosting with default hyperparameters.
# A fixed random_state makes this score reproducible, so the comparison with the
# tuned model is not blurred by run-to-run randomness.
best_model = GradientBoostingRegressor(random_state=42)
best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)
score = r2_score(y_test,y_pred)
print ('score before hyperparameter tuning {}'.format(score))

In [46]:
# Hyperparameter grid for the gradient-boosting search: 3 x 3 x 3 = 27 combinations.
param = dict(
    learning_rate=[0.01, 0.1, 1.0],
    n_estimators=[100, 150, 200],
    max_depth=[3, 4, 5],
)

In [47]:
# Exhaustive cross-validated search over the `param` grid for the gradient-boosting model.
# scoring='r2' states explicitly the metric a regressor is scored with by default;
# n_jobs=-1 runs the independent fits on all available cores.
clf = GridSearchCV(estimator=best_model, param_grid=param, scoring='r2', n_jobs=-1)
clf.fit(X_train, y_train)
# Hyperparameter combination with the best mean cross-validated score.
best_param = clf.best_params_

In [48]:
# Report the grid-search winner. best_score_ is the mean cross-validated score of the
# best candidate; for this regressor that score is R^2, not an accuracy percentage.
print('Best cross-validated R^2: {:.4f}'.format(clf.best_score_))
print(best_param)

In [49]:
# Test-set R^2 of the estimator selected by the grid search.
y_pred = clf.predict(X_test)
score = r2_score(y_true=y_test, y_pred=y_pred)
print('score after hyperparameter tuning {}'.format(score))