In [7]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
%matplotlib inline

In [3]:
# Read the abalone table; the file is comma-delimited, which is read_csv's default.
data = pd.read_csv('./data/abalone.csv')
# Preview the first five rows.
data.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [None]:
# data.info() # info about the data

In [None]:
# data.isnull().sum()

In [16]:
# Goal for this dataset: predict the number of rings.

# data['Sex'].value_counts()

# Replace the categorical Sex labels with integer codes so the estimators can
# consume the column. `lb` keeps the label <-> code mapping in lb.classes_.
lb = LabelEncoder()
lb.fit(data['Sex'])
data['Sex'] = lb.transform(data['Sex'])
data.head()  # not rendered: Jupyter only displays a cell's last expression

# How many samples fall in each ring count, most frequent first.
data['Rings'].value_counts()

9     689
10    634
8     568
11    487
7     391
12    267
6     259
13    203
14    126
5     115
15    103
16     67
17     58
4      57
18     42
19     32
20     26
3      15
21     14
23      9
22      6
24      2
27      2
1       1
25      1
2       1
26      1
29      1
Name: Rings, dtype: int64

In [20]:
# sns.countplot(data['Sex']) plot number of things with each value

# Separate the data into the response (Rings) and the features.
# 'Unnamed: 0' appears to be the row index that was saved into the CSV (it
# counts 0, 1, 2, ... in the preview), not a measurement, so it is excluded
# from the features. errors='ignore' keeps the cell working if that column is
# absent; a missing 'Rings' still fails loudly on the y lookup below.
X = data.drop(columns=['Rings', 'Unnamed: 0'], errors='ignore')
y = data['Rings']

# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, identical on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42
)

# Standardise the features. The scaler is fitted on the training rows only and
# then reused on the test rows, so test-set statistics never influence training.
# Not every model needs scaling, but the same arrays are shared by all of them.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
# see data
#x_train[:10]



# Decision Tree Classifier

In [28]:
# Fit a decision tree on the training split. random_state is fixed because the
# tree permutes candidate features at each split, so an unseeded tree can give
# different predictions from one run to the next.
dtclf = DecisionTreeClassifier(random_state=42)
dtclf.fit(x_train, y_train)

# Predict ring counts for the held-out rows.
predict_dtclf = dtclf.predict(x_test)

# Per-class precision / recall / F1 of the predictions against y_test.
# zero_division=1 reports 1.00 (instead of warning) where a metric is
# undefined, e.g. precision for a class that was never predicted.
print(classification_report(y_test, predict_dtclf, zero_division=1))

# NOTE: despite its name, `cm` holds the overall accuracy (fraction of exactly
# matching predictions), not a confusion matrix.
cm = accuracy_score(y_test, predict_dtclf)
# A bare last expression is what Jupyter displays as the cell output.
cm


              precision    recall  f1-score   support

           1       1.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.43      0.23      0.30        13
           5       0.27      0.30      0.29        20
           6       0.25      0.26      0.26        53
           7       0.16      0.18      0.17        79
           8       0.22      0.24      0.23       117
           9       0.17      0.17      0.17       132
          10       0.21      0.18      0.19       136
          11       0.22      0.26      0.24        92
          12       0.09      0.11      0.10        56
          13       0.06      0.05      0.05        40
          14       0.00      0.00      0.00        27
          15       0.00      0.00      0.00        22
          16       0.10      0.14      0.12         7
          17       0.00      0.00      0.00        10
          18       0.00      0.00      0.00        13
          19       0.00    

0.173444976076555

In [10]:
# Confusion matrix for the decision tree: each row is a true ring count and
# each column a predicted one, so off-diagonal entries are the misclassifications.
dtclf_confusion = confusion_matrix(y_test, predict_dtclf)
print(dtclf_confusion)

[[ 2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 2  4  3  2  2  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  8  9  8  3  3  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  7 15 10 12  1  2  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  2 19 16 26 10  5  3  1  0  0  2  0  0  0  0  0  0  0  0  0  0]
 [ 0  1  3  7 13 28 21 10  5  6  1  2  1  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  2 12 23 38 25 18  6  7  3  3  3  0  0  2  0  0  0  0  0  0]
 [ 0  0  0  1  4 17 31 29 21 10  8  6  5  3  1  1  1  1  0  0  0  0  0]
 [ 0  0  0  1  3  4 22 18 22  6  5  4  3  1  3  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  1  4  5  9  8  5  7  2  0  4  1  0  0  3  0  0  1  1  0]
 [ 0  0  0  1  0  3  4  2  5  4  4  3  2  2  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  1  2  5  0  7  4  0  2  1  1  2  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  2  2  3  2  0  4  3  2  0  1  0  1  1  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  2  1  0  0  0  3  3  0  0  3  0  0  0

# SVM

In [27]:
# Support-vector classifier with scikit-learn's default settings.
# fit() returns the fitted estimator, so build and train it in one expression.
clf_svm = svm.SVC().fit(x_train, y_train)
predict_svm = clf_svm.predict(x_test)

# Per-class precision / recall / F1 of the SVM predictions against y_test.
svm_report = classification_report(y_test, predict_svm, zero_division=1)
print(svm_report)

# `cm` is reused for the overall accuracy of this model.
cm = accuracy_score(y_test, predict_svm)
# Leave the value as the final expression so Jupyter renders it.
cm

              precision    recall  f1-score   support

           1       1.00      0.00      0.00         1
           3       1.00      0.00      0.00         2
           4       0.44      0.31      0.36        13
           5       0.38      0.30      0.33        20
           6       0.41      0.26      0.32        53
           7       0.31      0.37      0.34        79
           8       0.30      0.30      0.30       117
           9       0.20      0.39      0.27       132
          10       0.23      0.33      0.27       136
          11       0.26      0.32      0.29        92
          12       1.00      0.00      0.00        56
          13       1.00      0.00      0.00        40
          14       1.00      0.00      0.00        27
          15       1.00      0.00      0.00        22
          16       0.00      0.00      0.00         7
          17       0.00      0.00      0.00        10
          18       1.00      0.00      0.00        13
          19       1.00    

0.25598086124401914

# Neural Network

In [25]:
# Multi-layer perceptron with three hidden layers of 11 units each, trained for
# at most 500 iterations. random_state pins the weight initialisation and the
# sample shuffling so the fitted network (and its score) is repeatable.
mlpc = MLPClassifier(hidden_layer_sizes=(11, 11, 11), max_iter=500, random_state=42)
mlpc.fit(x_train, y_train)

# Predict ring counts for the held-out rows.
predict_mlpc = mlpc.predict(x_test)


In [26]:
# Per-class precision / recall / F1 of the neural-network predictions against
# y_test. zero_division=1 reports 1.00 where a metric is undefined.
print(classification_report(y_test, predict_mlpc, zero_division=1))

# `cm` holds the overall accuracy of this model (not a confusion matrix).
cm = accuracy_score(y_test, predict_mlpc)
# A bare last expression is what Jupyter displays as the cell output.
cm

              precision    recall  f1-score   support

           1       1.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.46      0.46      0.46        13
           5       0.45      0.25      0.32        20
           6       0.45      0.36      0.40        53
           7       0.33      0.34      0.34        79
           8       0.36      0.43      0.39       117
           9       0.25      0.34      0.29       132
          10       0.25      0.31      0.28       136
          11       0.21      0.30      0.25        92
          12       0.18      0.04      0.06        56
          13       0.16      0.10      0.12        40
          14       0.00      0.00      0.00        27
          15       0.00      0.00      0.00        22
          16       0.12      0.29      0.17         7
          17       0.33      0.10      0.15        10
          18       1.00      0.00      0.00        13
          19       1.00    

0.27631578947368424