In [49]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from math import sqrt
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [41]:
#Reading data from the dataset
df = pd.read_csv("Fish.csv")

In [3]:
df.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [4]:
df.dtypes

Species     object
Weight     float64
Length1    float64
Length2    float64
Length3    float64
Height     float64
Width      float64
dtype: object

In [5]:
df.head(20)
#We can see that the Species column is a categorical data so we can transform 

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134
5,Bream,450.0,26.8,29.7,34.7,13.6024,4.9274
6,Bream,500.0,26.8,29.7,34.5,14.1795,5.2785
7,Bream,390.0,27.6,30.0,35.0,12.67,4.69
8,Bream,450.0,27.6,30.0,35.1,14.0049,4.8438
9,Bream,500.0,28.5,30.7,36.2,14.2266,4.9594


In [6]:
def ohe_label_encoding(df,column):
    #Using the label encoder to convert the string values into labels
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    
    
    ohe = OneHotEncoder()
    temp_array = ohe.fit_transform(df[[column]]).toarray()    
    column_names= [ column + "_" + str(m) for m in le.classes_ ]
    
    return( pd.DataFrame(data = temp_array , columns = column_names) ) 

#By this way we can transform a categorical data into machine readable format by using OneHotEncoder of the SkLearn package
#Note the newer versions of SkLearn can automatically convert the categorical data string or other values into OneHotEncodings

In [7]:
numerical_values = ["Length1",  "Length2" , "Length3", "Height" , "Width","Weight"]
categorical_values = ["Species"]

In [8]:
new_df = df[numerical_values]
for column in categorical_values:
    new_df = pd.concat([new_df,ohe_label_encoding(df,column)] , axis  =1 )    
    

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [9]:
new_df.head(10)

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight,Species_Bream,Species_Parkki,Species_Perch,Species_Pike,Species_Roach,Species_Smelt,Species_Whitefish
0,23.2,25.4,30.0,11.52,4.02,242.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24.0,26.3,31.2,12.48,4.3056,290.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,23.9,26.5,31.1,12.3778,4.6961,340.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,26.3,29.0,33.5,12.73,4.4555,363.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,26.5,29.0,34.0,12.444,5.134,430.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
5,26.8,29.7,34.7,13.6024,4.9274,450.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
6,26.8,29.7,34.5,14.1795,5.2785,500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,27.6,30.0,35.0,12.67,4.69,390.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8,27.6,30.0,35.1,14.0049,4.8438,450.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
9,28.5,30.7,36.2,14.2266,4.9594,500.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#We can use the correlation matrix to find the elements
corr = df.corr()

In [11]:
corr

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
Species,1.0,-0.31296,-0.266696,-0.283601,-0.335519,-0.698193,-0.397578
Weight,-0.31296,1.0,0.915712,0.918618,0.923044,0.724345,0.886507
Length1,-0.266696,0.915712,1.0,0.999517,0.992031,0.625378,0.86705
Length2,-0.283601,0.918618,0.999517,1.0,0.994103,0.640441,0.873547
Length3,-0.335519,0.923044,0.992031,0.994103,1.0,0.703409,0.87852
Height,-0.698193,0.724345,0.625378,0.640441,0.703409,1.0,0.792881
Width,-0.397578,0.886507,0.86705,0.873547,0.87852,0.792881,1.0


In [12]:
#Now splitting the dataset for traing and testing purposes
x = new_df[list((new_df.columns))]
y = new_df["Weight"]
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.3)

In [13]:
#Using Linear Regresion and Fitting the data

In [14]:
lr = LinearRegression()

In [15]:
lr.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
y_pred = lr.predict(x_test)
y_pred

array([ 820. ,  300. , 1000. ,  200. ,  725. ,   40. ,  300. ,  514. ,
        188. ,  610. ,  820. ,  145. ,  150. ,  450. ,  700. ,  685. ,
        218. ,  300. ,  850. ,  197. ,  242. , 1000. ,  180. ,  272. ,
       1600. ,  170. ,  290. ,  169. ,   12.2,  260. ,  950. ,  150. ,
        900. ,  363. ,    9.8,  500. ,  340. ,  160. ,  500. ,  290. ,
         69. ,  600. ,  161. ,  300. ,   12.2,  720. ,  135. ,  145. ])

In [None]:
#Linear Regression model returns discrete values (predictions in the form of numbers) thus we can have metrics to judege
# the quality of predictions that our model is making
#https://towardsdatascience.com/regression-an-explanation-of-regression-metrics-and-what-can-go-wrong-a39a9793d914

In [34]:
#Mean squared error is the mean of squares of differences between the predicted and actual values
mse = mean_squared_error(y_test,y_pred)
print( "Mean Squared Error : {0} ".format(mse))

Mean Squared Error : 8.930234139145695e-27 


In [36]:
#Root mean squared error is simply the root of the MSE value
#Usually used when large errors are undesirable
rms = sqrt( mse )
print( "Root Mean Squared Error : {0} ".format(rms))

Root Mean Squared Error : 9.449991608009869e-14 


In [44]:
#Mean absolute error is the mean of the absolute error between the true and the predicted values
mea = mean_absolute_error( y_test,y_pred )
print( "Mean absolute Error : {0} ".format(mea))

Mean absolute Error : 6.724250785813031e-14 


In [53]:
#R squared error or coeffecient of determination is the value with denotes us how much of the variation is described
#by the variation in x; It ranges from -infinity to 1
rsquared = r2_score(y_test,y_pred)
rsquared

1.0