In [1]:
import pandas as pd
import numpy as np
df=pd.read_csv("Fish.csv")
df.head(5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,,34.0,12.444,5.134


In [2]:
feature_columns=["Species", "Length1", "Length2", "Length3", "Height", "Width"]
label_column="Weight"
features=df.loc[:,feature_columns]
label=df.loc[:,label_column]

In [3]:
features.isnull().sum()

Species    1
Length1    1
Length2    1
Length3    1
Height     0
Width      0
dtype: int64

In [4]:
X=features.values
y=label.values

# preprocessing

# fill species
from sklearn.impute import SimpleImputer

species_impute=SimpleImputer(np.nan, strategy="most_frequent")
X[:,0]=species_impute.fit_transform(X[:,0].reshape(-1,1)).ravel()

# fill length1, length2 and length3
continues_value_impute=SimpleImputer(np.nan, strategy="mean")
X[:,1:4]=continues_value_impute.fit_transform(X[:,1:4])

# check
new_df=pd.DataFrame.from_records(X, columns=feature_columns)
new_df.isnull().sum()

Species    0
Length1    0
Length2    0
Length3    0
Height     0
Width      0
dtype: int64

In [5]:
# normalize

from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
X[:,0]=encoder.fit_transform(X[:,0])

In [6]:
# train/test

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2)

In [7]:
# algorithmn

ones=np.ones((X_train.shape[0],1))
X_bar=np.concatenate((ones,X_train),axis=1)
inverse=np.linalg.pinv(np.asarray(np.dot(X_bar.T,X_bar), dtype="float"))
w=np.dot(np.dot(X_bar.T, y_train), inverse)
w

array([-562.9953178915268, 12.01174544870672, 8.947553846309802,
       -4.565541004999432, 17.440849680821884, 3.262331125286437,
       59.870763867328606], dtype=object)

In [41]:
# check with built-in lib

from sklearn.linear_model import LinearRegression
linear=LinearRegression()
linear.fit(X_train,y_train)
print(linear.coef_)# without w0
print(linear.intercept_)# w0

[  5.28784751 -19.05259282  25.4367109   12.57962381   4.7219619
  69.95924812]
-585.5075694702032


In [48]:
# test

# mean square error
from sklearn.metrics import mean_squared_error, mean_absolute_error
predict_test=linear.predict(X_test)
predict_train=linear.predict(X_train)

print("root mean squared error on train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
print("root mean squared error on test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

root mean squared error on train: 127.03384393492952
root mean squared error on test: 116.25107311854295


In [51]:
# reduce overfitting

# regulazization
from sklearn.linear_model import Ridge, Lasso
# ridge: L2
# lasso: L1
ridge=Ridge(alpha=100)# alpha cang lon cang tang do don gian cua model -> ko mo ta dc du lieu -> tang tinh tong quat -> giam hoc vet
ridge.fit(X_train, y_train)

Ridge(alpha=100, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [50]:
predict_test=ridge.predict(X_test)
predict_train=ridge.predict(X_train)

print("root mean squared error on train: {}".format(np.sqrt(mean_squared_error(y_train, predict_train))))
print("root mean squared error on test: {}".format(np.sqrt(mean_squared_error(y_test, predict_test))))

root mean squared error on train: 127.03992145247892
root mean squared error on test: 115.31177601576466
