In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
df = pd.read_csv(r'E:\classification\fish.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Length1  159 non-null    float64
 1   Length2  159 non-null    float64
 2   Length3  159 non-null    float64
 3   Height   159 non-null    float64
 4   Width    159 non-null    float64
 5   Weight   159 non-null    float64
 6   Species  159 non-null    object 
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [4]:
df.shape

(159, 7)

In [5]:
df.isna().sum()

Length1    0
Length2    0
Length3    0
Height     0
Width      0
Weight     0
Species    0
dtype: int64

In [7]:
df.Species.value_counts()

Perch        56
Bream        35
Roach        20
Pike         17
Smelt        14
Parkki       11
Whitefish     6
Name: Species, dtype: int64

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df.Species = le.fit_transform(df.Species)

In [10]:
df.head(1)

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight,Species
0,23.2,25.4,30.0,11.52,4.02,242.0,0


In [11]:
df.Species.value_counts()

2    56
0    35
4    20
3    17
5    14
1    11
6     6
Name: Species, dtype: int64

In [13]:
mapping = {2:'Perch', 0:'Bream', 4:'Roach', 3:'Pike', 5:'Smelt', 1:'Parkki', 6:'Whitefish'}

In [18]:
x = df.iloc[:,:6].values
y = df.iloc[:,6].values

In [19]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state=0)

In [20]:
len(x_train), len(x_test), len(y_train), len(y_test)

(119, 40, 119, 40)

### logistic regression

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=100000)
lr.fit(x_train, y_train)

LogisticRegression(max_iter=100000)

In [25]:
y_pred_lr = lr.predict(x_test)

In [26]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [27]:
confusion_matrix(y_test, y_pred_lr)

array([[ 8,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0,  0,  0,  0],
       [ 0,  0, 13,  0,  0,  0,  0],
       [ 0,  0,  2,  5,  0,  0,  0],
       [ 0,  0,  0,  0,  3,  1,  1],
       [ 0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  2,  0,  1,  0,  0]], dtype=int64)

In [28]:
accuracy_score(y_test, y_pred_lr)

0.825

In [30]:
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      1.00      1.00         3
           2       0.76      1.00      0.87        13
           3       1.00      0.71      0.83         7
           4       0.75      0.60      0.67         5
           5       0.50      1.00      0.67         1
           6       0.00      0.00      0.00         3

    accuracy                           0.82        40
   macro avg       0.72      0.76      0.72        40
weighted avg       0.80      0.82      0.80        40



##### grid search on logistic regression

In [32]:
from sklearn.model_selection import GridSearchCV
param = [{'penalty':['l1', 'l2', 'elasticnet', 'none'], 'multi_class':['auto', 'ovr', 'multinomial'], 'verbose':list(range(6))}]

In [33]:
grid_search = GridSearchCV(estimator=lr, param_grid=param, n_jobs=-1, scoring='accuracy')
grid_search = grid_search.fit(x_train,y_train)



In [34]:
best_score = grid_search.best_score_
best_param = grid_search.best_params_

In [35]:
best_score, best_param

(0.9496376811594203, {'multi_class': 'ovr', 'penalty': 'l2', 'verbose': 0})

In [39]:
lr = LogisticRegression(multi_class='ovr', penalty='l2', verbose=0, max_iter=100000)

In [42]:
lr.fit(x_train, y_train)

LogisticRegression(max_iter=100000, multi_class='ovr')

In [43]:
y_pred_lr = lr.predict(x_test)

In [44]:
accuracy_score(y_test, y_pred_lr)

0.875

#### svm

In [45]:
from sklearn.svm import SVC

In [51]:
sv = SVC()
sv.fit(x_train, y_train)

SVC()

In [47]:
y_pred_sv = sv.predict(x_test)

In [48]:
confusion_matrix(y_test, y_pred_sv)

array([[ 7,  0,  1,  0,  0,  0,  0],
       [ 0,  0,  3,  0,  0,  0,  0],
       [ 3,  0, 10,  0,  0,  0,  0],
       [ 1,  0,  6,  0,  0,  0,  0],
       [ 1,  0,  4,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  0,  0],
       [ 1,  0,  2,  0,  0,  0,  0]], dtype=int64)

In [49]:
accuracy_score(y_test, y_pred_sv)

0.425

In [52]:
param = [{'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 'C':[1,2,3,4,5], 'degree':[2,3,4]}]
grid_search = GridSearchCV(estimator=sv, param_grid=param, n_jobs=-1, scoring='accuracy')
grid_search = grid_search.fit(x_train, y_train)



In [53]:
best_score = grid_search.best_score_
best_param = grid_search.best_params_

In [54]:
best_param, best_score

({'C': 3, 'degree': 2, 'kernel': 'linear'}, 0.9246376811594204)

In [56]:
sv = SVC(kernel='linear', C=3.0, degree=2)
sv.fit(x_train, y_train)

SVC(C=3.0, degree=2, kernel='linear')

In [58]:
y_pred_sv = sv.predict(x_test)

In [59]:
confusion_matrix(y_test, y_pred_sv)

array([[ 8,  0,  0,  0,  0,  0,  0],
       [ 0,  3,  0,  0,  0,  0,  0],
       [ 0,  0, 13,  0,  0,  0,  0],
       [ 0,  0,  2,  5,  0,  0,  0],
       [ 0,  0,  0,  0,  3,  1,  1],
       [ 0,  0,  0,  0,  0,  1,  0],
       [ 0,  0,  1,  0,  0,  0,  2]], dtype=int64)

In [60]:
accuracy_score(y_test, y_pred_sv)

0.875

#### cross validation applying on svc

In [62]:
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator=sv, X=x_train, y=y_train, cv=10)



In [63]:
accuracies

array([0.91666667, 0.91666667, 1.        , 1.        , 0.91666667,
       0.83333333, 0.91666667, 0.83333333, 0.91666667, 1.        ])

In [64]:
accuracies.mean()

0.925

In [65]:
accuracies.std()

0.05833333333333333

## Stochastic Gradient Descent

In [66]:
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()

In [67]:
sgd.fit(x_train, y_train)

SGDClassifier()

In [68]:
y_pred_sgd = sgd.predict(x_test)

In [69]:
confusion_matrix(y_test, y_pred_sgd)

array([[ 4,  0,  4,  0,  0,  0,  0],
       [ 0,  0,  1,  0,  0,  2,  0],
       [ 0,  0, 13,  0,  0,  0,  0],
       [ 2,  0,  5,  0,  0,  0,  0],
       [ 0,  0,  4,  0,  1,  0,  0],
       [ 0,  0,  0,  0,  1,  0,  0],
       [ 1,  0,  2,  0,  0,  0,  0]], dtype=int64)

In [70]:
accuracy_score(y_test, y_pred_sgd)

0.45

##### grid search over SGD

In [72]:
param = [{'loss':['hinge', 'log', 'modified_huber','squared_hinge', 'perceptron'], 'penalty':['l2', 'l1', 'elasticnet'], 'l1_ratio':[0.05,0.1,0.15,0.2]}]

grid_search = GridSearchCV(estimator=sgd, param_grid=param, n_jobs=-1, scoring='accuracy')
grid_search = grid_search.fit(x_train, y_train)



In [73]:
best_score = grid_search.best_score_
best_param = grid_search.best_params_

In [74]:
best_param, best_score

({'l1_ratio': 0.1, 'loss': 'log', 'penalty': 'l2'}, 0.3963768115942029)