# Classification

## Fish Data

---

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline 

from matplotlib.pylab import rcParams
# Notebook-wide matplotlib defaults, applied in a single update call.
rcParams.update({
    'figure.figsize': (12, 8),     # default figure size (width, height) in inches
    'font.family': 'New Gulim',    # presumably chosen to render Korean text; confirm it is installed
    'font.size': 20,
    'axes.unicode_minus': False,   # use the ASCII hyphen for minus signs
})


In [None]:
# Machine Learning Library

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split

# accuracy measure
from sklearn import metrics

#### 데이터 로드

In [None]:
# Load the fish data set from a CSV file (path is relative to the working directory).
# Later cells read the 'Length', 'Depth', 'Kg' and 'Type' columns of this frame.
df = pd.read_csv('data/fish.csv')
# Bare expression so the notebook renders the frame.
df

#### 길이의 제곱 컬럼 생성

In [None]:
# Squared length as an additional feature column.
df['L2'] = np.square(df['Length'])

#### 길이와 무게 비율 컬럼 생성

In [None]:
# Weight per unit of length (Kg divided by Length) as an additional feature column.
df['LKgRatio'] = df['Kg'].div(df['Length'])
# Bare expression so the notebook renders the updated frame.
df

#### 컬럼 추가(isTuna) - 참치: 1, 나머지: 0

In [None]:
# Binary target: 1 for rows whose Type is 'tuna', 0 for every other row.
df['isTuna'] = (df['Type'] == 'tuna').astype('int64')

#### 컬럼 추가(TypeNum) - 참치: 0, 연어: 1, 고등어: 2

In [None]:
# Multi-class target: 'tuna' -> 0, 'salmon' -> 1, any other value -> 2.
# Values missing from the mapping become NaN and are then filled with 2.
df['TypeNum'] = df['Type'].map({'tuna': 0, 'salmon': 1}).fillna(2).astype('int64')
# Bare expression so the notebook renders the updated frame.
df

In [None]:
# Scatter of weight against length, one marker per fish, coloured by TypeNum.
fig = plt.figure(figsize=(12, 8))
plt.scatter(x=df['Length'], y=df['Kg'], s=60, c=df['TypeNum'])
plt.gca().set(xlabel='Length', ylabel='Kg')
plt.show()

# 1. Logistic Regression

In [None]:
# Feature columns fed to the logistic-regression model.
# Alternative feature set (all engineered columns): ['Length','Depth','Kg','L2','LKgRatio']
col_list = ['Length','Depth']

In [None]:
# Split the data into a training set and a test set.
# Features are the columns in col_list; the target is the binary 'isTuna' column.
# No test_size is given, so scikit-learn's default split proportion applies;
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df[col_list], df['isTuna'], random_state=123)

In [None]:
# Create the model: logistic regression with scikit-learn's default settings.
model = LogisticRegression()

In [None]:
# Train the model; the target is handed over as a flat NumPy array.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the isTuna label for every row of the test set.
prediction1 = model.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
prediction1

In [None]:
# Accuracy on the test set. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground truth is passed first,
# matching the classification_report call used elsewhere in this notebook.
print('Accuracy - Logistic Regression:', metrics.accuracy_score(y_test, prediction1))

In [None]:
# Per-class precision, recall and f1-score on the test set
# (arguments are ground truth first, predictions second).
print(metrics.classification_report(y_test, prediction1))

In [None]:
# Confusion matrix with row/column totals. Rows are the predicted labels and
# columns are the actual labels; both axes are named explicitly so the table
# no longer shows the auto-generated 'row_0' label for the prediction array.
pd.crosstab(prediction1, y_test, rownames=['Predicted'], colnames=['Actual'], margins=True)

---

# 2. Support Vector Machine

In [None]:
# Feature columns fed to the SVM model.
# Alternative feature set (all engineered columns): ['Length','Depth','Kg','L2','LKgRatio']
col_list = ['Length','Depth']

In [None]:
# Split the data into a training set and a test set.
# Features are the columns in col_list; the target is the multi-class 'TypeNum' column.
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(df[col_list], df['TypeNum'], random_state=123)

In [None]:
# Create the model: support vector classifier with a linear kernel.
# `gamma` is not passed because it is the coefficient of the 'rbf', 'poly'
# and 'sigmoid' kernels only; a linear kernel ignores it.
model = SVC(kernel='linear', C=0.1)

In [None]:
# Train the model; the target is handed over as a flat NumPy array.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the TypeNum label for every row of the test set.
prediction2 = model.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
prediction2

In [None]:
# Accuracy on the test set. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground truth is passed first,
# matching the classification_report call used elsewhere in this notebook.
print('Accuracy - SVM:', metrics.accuracy_score(y_test, prediction2))

In [None]:
# Per-class precision, recall and f1-score on the test set
# (arguments are ground truth first, predictions second).
print(metrics.classification_report(y_test, prediction2))

In [None]:
# Confusion matrix with row/column totals. Rows are the predicted labels and
# columns are the actual labels; both axes are named explicitly so the table
# no longer shows the auto-generated 'row_0' label for the prediction array.
pd.crosstab(prediction2, y_test, rownames=['Predicted'], colnames=['Actual'], margins=True)

## 2.1 Support Vector Machine - PCA

In [None]:
# Full feature set (raw measurements plus the engineered columns) used as PCA input.
col_list = [
    'Length',
    'Depth',
    'Kg',
    'L2',
    'LKgRatio',
]

In [None]:
# Create a PCA model that keeps the first two principal components.
pca = PCA(n_components=2)

# Fit the PCA on the selected feature columns and project them onto the two components.
# The fit uses every row of df[col_list], i.e. it happens before any train/test split.
# NOTE(review): the columns are passed unscaled; columns with a larger numeric
# range will presumably dominate the components - confirm whether standardising
# first is wanted.
df_pca = pca.fit_transform(df[col_list])
# Bare expression so the notebook displays the projected array.
df_pca

In [None]:
# Split the raw feature columns first, then fit the PCA on the training rows
# only and apply that same projection to the test rows. Fitting the projection
# on all rows before splitting would let the test data influence the features
# the model is trained on (data leakage).
# With the same row count and random_state, the split selects the same rows
# as a split of the already-projected data would.
X_train, X_test, y_train, y_test = train_test_split(df[col_list], df['TypeNum'], random_state=123)

pca = PCA(n_components=2)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [None]:
# Create the model: support vector classifier with a linear kernel, trained on
# the PCA components. `gamma` is not passed because it is the coefficient of
# the 'rbf', 'poly' and 'sigmoid' kernels only; a linear kernel ignores it.
model = SVC(kernel='linear', C=0.1)

# Train the model on the training set.
model.fit(X_train, y_train.values.ravel())

# Predict the TypeNum label for every row of the test set.
# NOTE: this rebinds `prediction2`, replacing the predictions stored by the
# plain SVM section. The bare `prediction2` expression that used to follow was
# removed: only the last expression of a cell is displayed, so it had no effect.
prediction2 = model.predict(X_test)

# Accuracy on the test set (ground truth first, as in accuracy_score(y_true, y_pred)).
print('Accuracy - SVM:', metrics.accuracy_score(y_test, prediction2))

# Per-class precision, recall and f1-score on the test set.
print(metrics.classification_report(y_test, prediction2))

# Confusion matrix with row/column totals. Rows are the predicted labels and
# columns are the actual labels; both axes are named explicitly.
pd.crosstab(prediction2, y_test, rownames=['Predicted'], colnames=['Actual'], margins=True)

---

# 3. Decision Tree

In [None]:
# Full feature set (raw measurements plus the engineered columns) for the decision tree.
col_list = [
    'Length',
    'Depth',
    'Kg',
    'L2',
    'LKgRatio',
]

In [None]:
# Split the data into a training set and a test set.
# Features are the columns in col_list; the target is the multi-class 'TypeNum' column.
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(df[col_list], df['TypeNum'], random_state=123)

In [None]:
# Create the model: decision tree classifier.
# The estimator is seeded with the same value as the train/test split so that
# the fitted tree, and therefore the reported scores, are reproducible.
model = DecisionTreeClassifier(random_state=123)

In [None]:
# Train the model on the training set (the target Series is passed as-is).
model.fit(X_train, y_train)

In [None]:
# Predict the TypeNum label for every row of the test set.
prediction3 = model.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
prediction3

In [None]:
# Accuracy on the test set. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground truth is passed first,
# matching the classification_report call used elsewhere in this notebook.
print('Accuracy - Decision Tree:', metrics.accuracy_score(y_test, prediction3))

In [None]:
# Per-class precision, recall and f1-score on the test set
# (arguments are ground truth first, predictions second).
print(metrics.classification_report(y_test, prediction3))

In [None]:
# Confusion matrix with row/column totals. Rows are the predicted labels and
# columns are the actual labels; both axes are named explicitly so the table
# no longer shows the auto-generated 'row_0' label for the prediction array.
pd.crosstab(prediction3, y_test, rownames=['Predicted'], colnames=['Actual'], margins=True)

---

# 4. Random Forest

In [None]:
# Full feature set (raw measurements plus the engineered columns) for the random forest.
col_list = [
    'Length',
    'Depth',
    'Kg',
    'L2',
    'LKgRatio',
]

In [None]:
# Split the data into a training set and a test set.
# Features are the columns in col_list; the target is the multi-class 'TypeNum' column.
# random_state is fixed so the same rows are selected on every run.
X_train, X_test, y_train, y_test = train_test_split(df[col_list], df['TypeNum'], random_state=123)

In [None]:
# Create the model: random forest with 340 trees.
# The estimator is seeded with the same value as the train/test split so that
# the bootstrap samples and feature choices, and therefore the reported
# scores, are reproducible.
model = RandomForestClassifier(n_estimators=340, random_state=123)

In [None]:
# Train the model; the target is handed over as a flat NumPy array.
model.fit(X_train, np.ravel(y_train))

In [None]:
# Predict the TypeNum label for every row of the test set.
prediction4 = model.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
prediction4

In [None]:
# Accuracy on the test set. scikit-learn's signature is
# accuracy_score(y_true, y_pred), so the ground truth is passed first,
# matching the classification_report call used elsewhere in this notebook.
print('Accuracy - Random Forests:', metrics.accuracy_score(y_test, prediction4))

In [None]:
# Per-class precision, recall and f1-score on the test set
# (arguments are ground truth first, predictions second).
print(metrics.classification_report(y_test, prediction4))

In [None]:
# Confusion matrix with row/column totals. Rows are the predicted labels and
# columns are the actual labels; both axes are named explicitly so the table
# no longer shows the auto-generated 'row_0' label for the prediction array.
pd.crosstab(prediction4, y_test, rownames=['Predicted'], colnames=['Actual'], margins=True)

---

In [None]:
# end of file