In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [2]:
# Location of the fish measurements CSV. Set the FISH_CSV environment variable
# to point at another copy of the file; when it is unset, the original local
# path is used so existing setups keep working.
DATA_PATH = os.environ.get('FISH_CSV', r'E:/regression_practice/fish.csv')
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width,Weight
0,Bream,23.2,25.4,30.0,11.52,4.02,242.0
1,Bream,24.0,26.3,31.2,12.48,4.3056,290.0
2,Bream,23.9,26.5,31.1,12.3778,4.6961,340.0
3,Bream,26.3,29.0,33.5,12.73,4.4555,363.0
4,Bream,26.5,29.0,34.0,12.444,5.134,430.0


In [4]:
# Print the column names, non-null counts and dtypes of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Length1  159 non-null    float64
 2   Length2  159 non-null    float64
 3   Length3  159 non-null    float64
 4   Height   159 non-null    float64
 5   Width    159 non-null    float64
 6   Weight   159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [5]:
# Count the missing values in each column.
df.isna().sum(axis=0)

Species    0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
Weight     0
dtype: int64

In [6]:
# (rows, columns) of the frame.
df.shape

(159, 7)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the species names with integer codes, overwriting the column in place.
# NOTE(review): these codes are later used as a numeric feature; an integer code
# imposes an ordering on a nominal category, which linear models will treat as
# meaningful — consider one-hot encoding and confirm the intent.
le = LabelEncoder()
df["Species"] = le.fit_transform(df["Species"])

In [15]:
# Summary statistics for every column, rounded to two decimals.
# NOTE(review): the recorded output shows a minimum Weight of 0.0, which looks
# like an invalid record — confirm and decide whether to drop it.
df.describe().round(decimals=2)

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width,Weight
count,159.0,159.0,159.0,159.0,159.0,159.0,159.0
mean,2.26,26.25,28.42,31.23,8.97,4.42,398.33
std,1.7,10.0,10.72,11.61,4.29,1.69,357.98
min,0.0,7.5,8.4,8.8,1.73,1.05,0.0
25%,1.0,19.05,21.0,23.15,5.94,3.39,120.0
50%,2.0,25.2,27.3,29.4,7.79,4.25,273.0
75%,3.5,32.7,35.5,39.65,12.37,5.58,650.0
max,6.0,59.0,63.4,68.0,18.96,8.14,1650.0


In [8]:
# Show the first row to check the encoded Species column.
df.head(n=1)

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width,Weight
0,0,23.2,25.4,30.0,11.52,4.02,242.0


In [13]:
# Split the frame into the feature matrix and the regression target by column
# name rather than by position, so a reordered or extended CSV cannot silently
# put the target among the features.
TARGET_COLUMN = 'Weight'
x = df.drop(columns=TARGET_COLUMN).values  # every column except the target
y = df[TARGET_COLUMN].values               # fish weight

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=11,
)

In [17]:
# Row counts of the four split arrays, in the order they were returned.
tuple(len(part) for part in (x_train, x_test, y_train, y_test))

(111, 48, 111, 48)

#### Applying linear regression

In [18]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

LinearRegression()

In [19]:
# Predictions of the linear model on the held-out rows.
y_pred_lr = lr.predict(X=x_test)

In [20]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [21]:
# R^2 of the linear model on the test split.
r2_score(y_true=y_test, y_pred=y_pred_lr)

0.8639528446412806

In [27]:
# Mean absolute error of the linear model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_lr)

99.62227815053177

In [28]:
# Root mean squared error of the linear model: square root of the test MSE.
mse_lr = mean_squared_error(y_test, y_pred_lr)
mse_lr ** 0.5

129.61741850137324

In [35]:
# FIXME(review): `lr2` is not created in any cell visible in this notebook (the
# only linear model fitted above is `lr`), so on a fresh kernel this line raises
# NameError unless `lr2` is defined elsewhere. Restore the cell that builds and
# fits `lr2`, or remove this comparison.
y_pred_lr2 = lr2.predict(x_test)

In [36]:
# R^2 of the second linear model's predictions on the test split.
r2_score(y_true=y_test, y_pred=y_pred_lr2)

0.7662825619860762

##### Applying a decision tree

In [37]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed so the fitted tree (and every score derived from it, including
# the grid search that clones this estimator) is repeatable across runs.
# Without it, ties between equally good splits are broken randomly.
# The value matches the seed used for the train/test split.
dt = DecisionTreeRegressor(random_state=11)

In [38]:
# Fit the decision tree on the training split.
dt.fit(X=x_train, y=y_train)

DecisionTreeRegressor()

In [39]:
# Predictions of the decision tree on the held-out rows.
y_pred_dt = dt.predict(X=x_test)

In [40]:
# R^2 of the decision tree on the test split.
r2_score(y_true=y_test, y_pred=y_pred_dt)

0.9541365309421105

In [42]:
# Mean absolute error of the decision tree on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_dt)

42.77916666666667

In [43]:
# Root mean squared error of the decision tree: square root of the test MSE.
mse_dt = mean_squared_error(y_test, y_pred_dt)
mse_dt ** 0.5

75.25789825659497

##### Grid search over the decision tree hyperparameters

In [44]:
from sklearn.model_selection import GridSearchCV
# Candidate settings for the decision-tree grid search.
# NOTE(review): newer scikit-learn releases appear to have renamed 'mse'/'mae'
# to 'squared_error'/'absolute_error' — confirm against the installed version.
param = [
    {
        'min_samples_split': [2, 3, 4],
        'criterion': ['mse', 'friedman_mse', 'mae'],
    }
]

In [48]:
# Show the registered scorer names (the next cell passes 'r2' as `scoring`).
# NOTE(review): `SCORERS` appears to have been removed in newer scikit-learn
# releases in favour of `sklearn.metrics.get_scorer_names()` — confirm against
# the installed version before re-running.
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted'])

In [49]:
# Cross-validated search over the tree settings in `param`, scored by R^2 and
# run on all available cores; fitted on the training split only.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param,
    scoring='r2',
    n_jobs=-1,
).fit(x_train, y_train)

In [50]:
# Best cross-validated score and the parameter combination that produced it.
best_r2, best_pram = grid_search.best_score_, grid_search.best_params_

In [51]:
# Display the winning score together with its parameters.
(best_r2, best_pram)

(0.9365251122297625, {'criterion': 'mse', 'min_samples_split': 2})

#### Applying a random forest

In [52]:
from sklearn.ensemble import RandomForestRegressor

# A small forest of 10 trees. The seed makes the bootstrap samples and split
# choices repeatable, so the scores below (and the grid search that clones
# this estimator) give the same numbers on every run. The value matches the
# seed used for the train/test split.
rf = RandomForestRegressor(n_estimators=10, random_state=11)
rf.fit(x_train, y_train)

RandomForestRegressor(n_estimators=10)

In [53]:
# Predictions of the random forest on the held-out rows.
y_pred_rf = rf.predict(X=x_test)

In [54]:
# R^2 of the random forest on the test split.
r2_score(y_true=y_test, y_pred=y_pred_rf)

0.9700086776754514

In [55]:
# Mean absolute error of the random forest on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)

39.64041666666666

In [56]:
# Root mean squared error of the random forest: square root of the test MSE.
mse_rf = mean_squared_error(y_test, y_pred_rf)
mse_rf ** 0.5

60.85782711232248

##### Grid search over the random forest hyperparameters

In [60]:
# Candidate settings for the random-forest grid search: forest sizes from 2 to
# 24 trees and six cost-complexity pruning strengths.
param = [
    {
        'n_estimators': [n_trees for n_trees in range(2, 25)],
        'ccp_alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    }
]

In [61]:
# Cross-validated search over the forest settings in `param`, scored by R^2 and
# run on all available cores; fitted on the training split only.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param,
    scoring='r2',
    n_jobs=-1,
).fit(x_train, y_train)

In [62]:
# Best cross-validated score and the parameter combination that produced it.
best_r2, best_pram = grid_search.best_score_, grid_search.best_params_

In [63]:
# Display the winning score together with its parameters.
(best_r2, best_pram)

(0.966340906398913, {'ccp_alpha': 0.0, 'n_estimators': 3})

### Ridge

In [64]:
from sklearn.linear_model import Ridge

# Ridge regression with the library's default settings.
# NOTE(review): the features are not standardized anywhere in the visible
# cells before this penalized fit — confirm that is intended.
rr = Ridge()

In [65]:
# Fit the ridge model on the training split.
rr.fit(X=x_train, y=y_train)

Ridge()

In [66]:
# Predictions of the ridge model on the held-out rows.
y_pred_rr = rr.predict(X=x_test)

In [67]:
# R^2 of the ridge model on the test split.
r2_score(y_true=y_test, y_pred=y_pred_rr)

0.8646094421001467

In [68]:
# Mean absolute error of the ridge model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_rr)

99.10252818681799

In [69]:
# Root mean squared error of the ridge model: square root of the test MSE.
mse_rr = mean_squared_error(y_test, y_pred_rr)
mse_rr ** 0.5

129.30425722216225

### Lasso

In [72]:
from sklearn.linear_model import Lasso
# Lasso regression allowed to run for up to 10000 solver iterations.
# NOTE(review): the features are not standardized anywhere in the visible
# cells before this penalized fit — confirm that is intended.
ls = Lasso(max_iter=10000)

In [73]:
# Fit the lasso model on the training split.
ls.fit(X=x_train, y=y_train)

Lasso(max_iter=10000)

In [74]:
# Predictions of the lasso model on the held-out rows.
y_pred_ls = ls.predict(X=x_test)

In [75]:
# R^2 of the lasso model on the test split.
r2_score(y_true=y_test, y_pred=y_pred_ls)

0.8640267139768223

In [76]:
# Mean absolute error of the lasso model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_ls)

99.48182026984041

In [77]:
# Root mean squared error of the lasso model: square root of the test MSE.
mse_ls = mean_squared_error(y_test, y_pred_ls)
mse_ls ** 0.5

129.5822246283203

#### ElasticNet

In [78]:
from sklearn.linear_model import ElasticNet
# Elastic-net regression with the library's default settings.
# NOTE(review): the features are not standardized anywhere in the visible
# cells before this penalized fit — confirm that is intended.
en = ElasticNet()

In [79]:
# Fit the elastic-net model on the training split.
en.fit(X=x_train, y=y_train)

ElasticNet()

In [80]:
# Predictions of the elastic-net model on the held-out rows.
y_pred_en = en.predict(X=x_test)

In [81]:
# R^2 of the elastic-net model on the test split.
r2_score(y_true=y_test, y_pred=y_pred_en)

0.8547885571120835

In [82]:
# Mean absolute error of the elastic-net model on the test split.
mean_absolute_error(y_true=y_test, y_pred=y_pred_en)

104.94158639367588

In [83]:
# Root mean squared error of the elastic-net model: square root of the test MSE.
mse_en = mean_squared_error(y_test, y_pred_en)
mse_en ** 0.5

133.9118640152986