# Source of this dataset

https://www.kaggle.com/aungpyaeap/fish-market

# About the data

The Fish Market dataset contains information about common fish species in market sales. It includes the species, weight, length, height, and width of each fish.

# Importing Modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading the data

In [1]:
# Read the Fish Market CSV into a DataFrame. The relative path looks like
# Kaggle's "../input/<dataset>/" layout -- adjust it when running elsewhere.
fish_data = pd.read_csv("../input/fish-market/Fish.csv")

In [1]:
# Preview the first two rows to check that the file loaded as expected.
fish_data.head(2)

In [1]:
# Summarise the columns: dtypes and non-null counts.
fish_data.info()

In [1]:
# Mean of every remaining column, computed separately for each species.
fish_data.groupby('Species').mean()

# Visualizing Data

In [1]:
# Weight against height for every fish, coloured by species.
# x and y are passed by keyword: seaborn deprecated positional data arguments
# in 0.11 and rejects them from 0.12 onwards.
sns.scatterplot(data=fish_data, x='Weight', y='Height', hue='Species');

# Splitting the data for training and testing

In [1]:
from sklearn.model_selection import train_test_split

# The label is the species name; the features are every other column.
y = fish_data['Species']
X = fish_data.drop(columns='Species')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling

In [1]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split so no test statistics enter the scaling.
ss = StandardScaler()
X_train_scaled = ss.fit_transform(X_train)
X_test_scaled = ss.transform(X_test)

# Model Evaluation

In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [1]:
def evaluate_model_performance(y_test, y_pred):
    """Print accuracy, the confusion matrix and the classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.
    """
    # Each metric takes (true, predicted) and is printed in this fixed order.
    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(y_test, y_pred))

# Decision Tree Classification

In [1]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the unscaled features.
# random_state pins the tree's internal randomness so that re-running the
# notebook reproduces the same predictions and scores.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

evaluate_model_performance(y_test, y_pred)

# Random Forest Classification

In [1]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the unscaled features.
# random_state pins the bootstrap sampling and feature selection so that
# re-running the notebook reproduces the same predictions and scores.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

evaluate_model_performance(y_test, y_pred)

# KNN Classifier

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Estimate the error rate of each candidate K by cross-validation on the
# training split only. Scoring candidates against X_test / y_test would tune K
# on the same data used for the final evaluation and bias the reported accuracy.
# NOTE: X_train_scaled was standardised on the full training split, so the
# folds share scaling statistics; wrap the scaler and classifier in a Pipeline
# if strictly fold-local scaling is required.
k_values = range(1, 40)
error_rate = []

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    # cross_val_score returns one accuracy per fold for a classifier.
    fold_accuracy = cross_val_score(model, X_train_scaled, y_train, cv=3)
    error_rate.append(1 - fold_accuracy.mean())

plt.figure(figsize=(10,6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

In [1]:
# Final KNN model with 5 neighbours, trained and evaluated on the scaled features.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)

evaluate_model_performance(y_test=y_test, y_pred=y_pred)

In [1]:
# Pair every true test label with the most recent prediction in y_pred.
# Building the frame in one step keeps y_test's index, so each row can be
# traced back to the original record in fish_data.
comparing_result = pd.DataFrame({
    'original species': y_test,
    'predicted species': y_pred,
})

# Keep only the misclassified rows using a boolean mask instead of a Python
# loop with chained indexing, and show them as a single table.
mismatch = comparing_result['original species'] != comparing_result['predicted species']
misclassified = comparing_result[mismatch]
misclassified

# Conclusion

We will use the KNN classifier for this data, since it achieved the highest accuracy of the models tried (87%).