In [None]:

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [None]:
# Load the concrete compressive strength CSV into a DataFrame; `df` is used by the cells that follow.
df = pd.read_csv('../input/concrete-compressive-strength/Concrete Compressive Strength.csv')

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Column dtypes and non-null counts (used to check for missing values).
df.info()

In [None]:
# Summary statistics for each column.
df.describe()

It looks like there are no null values present in the dataset. Next, we check for outliers.

In [None]:
# Plot the distribution of every column, one figure per column.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is the supported call that produces the same kind of plot.
for column in df.columns:
    plt.figure()
    sns.histplot(df[column], kde=True, stat="density")

In [None]:
# Draw one box plot per column to look for outliers.
# The series is passed explicitly as `x=`: a positional argument is treated as
# `data` by newer seaborn releases, which changes how the plot is drawn.
for column in df.columns:
    plt.figure()
    sns.boxplot(x=df[column])

It looks like there are a few outliers in the dataset; however, there's no need to worry about them, as DecisionTreeRegressor is largely insensitive to outliers in the input features.

In [None]:
import plotly.graph_objects as go

# Pairwise correlation between the columns of the dataset.
corr = df.corr()

# Heatmap draws z[i][j] at (x[j], y[i]), so the matrix columns label the
# x-axis and the matrix rows (index) label the y-axis.
graph = go.Figure()
graph.add_trace(go.Heatmap(z=corr.values, x=corr.columns.values, y=corr.index.values))
graph.show()

Now to check for imbalance in the dataset
1. https://pypi.org/project/smogn/

In [None]:
!pip install smogn

In [None]:
import smogn

# Resample the dataset with SMOGN's `smoter`, keyed on the compressive-strength column.
# Note the trailing space inside the column name string.
target_column = 'Concrete compressive strength(MPa, megapascals) '
concrete_smogn = smogn.smoter(data=df, y=target_column)

In [None]:
# Overlay the target's density for the original data and the SMOGN-resampled data.
sns.kdeplot(df['Concrete compressive strength(MPa, megapascals) '], label = "Original")
sns.kdeplot(concrete_smogn['Concrete compressive strength(MPa, megapascals) '], label = "Modified")
# The `label=` values are only shown once a legend is drawn on the axes.
plt.legend()
plt.show()

The step above is not compulsory for regression; however, the accuracy of classification models often increases when SMOTE is used.

In [None]:
# Split the data: every column except the last is a feature, the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
from sklearn.model_selection import train_test_split

# Fix the seed so the 80/20 split, and every score computed from it, is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

Since we are using a decision tree model, we don't need to use StandardScaler.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Baseline tree on the unscaled features. The seed fixes the tree's internal
# tie-breaking so the score is repeatable between runs.
dtr = DecisionTreeRegressor(random_state=42)
dtr.fit(X_train, y_train)
# R^2 on the held-out test split.
dtr.score(X_test, y_test)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training features only, then apply the same transform to the test features.
# NOTE: this rebinds X_train / X_test to the scaled arrays, so every later cell sees scaled data.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test) 

In [None]:
# Same model on the scaled features. A fixed seed means any difference in score
# from the baseline tree comes from scaling, not from random tie-breaking.
dtr1 = DecisionTreeRegressor(random_state=42)
dtr1.fit(X_train, y_train)
# R^2 on the held-out test split.
dtr1.score(X_test, y_test)

As we can see, there's no major improvement in the performance of the model. Next, we move on to hyperparameter optimisation.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the decision tree.
# The criterion names 'mae' and 'mse' were deprecated and later removed from
# scikit-learn; 'absolute_error' and 'squared_error' are their replacements.
grid_params = {
    'criterion' : ['absolute_error', 'squared_error', 'friedman_mse', 'poisson'],
    'splitter' : ['best', 'random'],
    'max_depth' : [3, 5, 7, 9, 10, 12],
    'min_samples_split' : [2, 3, 4, 5, 7],
    'min_samples_leaf' : [2, 3, 4, 5, 7]
}

# Exhaustive 5-fold cross-validated search over the grid, using all available cores.
grid_search = GridSearchCV(dtr1, grid_params, cv = 5, n_jobs = -1, verbose = 1)
grid_search.fit(X_train, y_train)

In [None]:
# Show the best hyperparameter combination and its mean cross-validated score, one per line.
print(grid_search.best_params_, grid_search.best_score_, sep='\n')

An alternative approach to selecting a base model is to use `LazyRegressor` from lazypredict.
1. https://pypi.org/project/lazypredict/

In [None]:
!pip install lazypredict

After installing lazypredict, you need to restart your kernel; otherwise it will not work.

In [None]:
from lazypredict.Supervised import LazyRegressor
# Fit LazyRegressor on the train/test split; `models` is displayed as the comparison table.
# Warnings are not suppressed (ignore_warnings=False) and no custom metric is supplied.
reg = LazyRegressor(ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)
# Display the per-model results.
models

The XGB regressor has the highest Adjusted R-Squared and R-Squared scores, so we fit it on its own below.

In [None]:
from xgboost import XGBRegressor

# Fit a standalone XGBoost regressor on the training split.
xgbr = XGBRegressor(verbosity=0)
xgbr.fit(X_train, y_train)

# Adjusted R-squared on the test split:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
# where n is the number of test samples and p the number of features.
r2 = xgbr.score(X_test, y_test)
n_obs = len(y_test)
n_features = X_test.shape[1]
1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1)

Here is a basic approach to regression problems