In [77]:
from sklearn import svm
import pandas as pd
import math


## Make a simple sklearn classifier
First, read the data in using `pandas.read_csv()`.
Note that the final column contains the `class_type` field that we are interested in.

In [78]:
data = pd.read_csv("class.csv")

data.head(8)

Unnamed: 0,Class_Number,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
0,1,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
1,2,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."
2,3,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
3,4,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."
4,5,4,Amphibian,"frog, frog, newt, toad"
5,6,8,Bug,"flea, gnat, honeybee, housefly, ladybird, moth..."
6,7,10,Invertebrate,"clam, crab, crayfish, lobster, octopus, scorpi..."


In [79]:
data.shape

(7, 4)

## Preprocess the data
Split the data up for training and evaluation.

In [80]:
def preprocess(data):
    X = data.iloc[:, 1:4]  # all rows, all the features and no labels
    y = data.iloc[:, 0:6]  # all rows, label only

    return X, y

In [81]:
# Shuffle and split the dataset
# We don't need to use this any more, thanks to scikit-learn!
# Why shuffle and split the data? -  The ideas of overfitting and underfitting
#In general, splits are random, (e.g. train_test_split) which is equivalent to shuffling and selecting the first X % of the data. 
#When the splitting is random, you don't have to shuffle it beforehand.If you don't split randomly, 
#your train and test splits might end up being biased. For example, 
#if you have 100 samples with two classes and your first 80 samples are 
#from class 1, and remaining are from class 0, a 80/20 split would 
#leave all class 0's in the training set and all class 1's in the test set. 
#This way, in the training, you wouldn't see any examples from class 1 and cannot learn.4
# Source - https://stats.stackexchange.com/questions/467816/what-is-the-advantage-of-shuffling-data-in-train-test-split
#Sometimes, it's even helpful to shuffle after the splits, e.g. in neural nets, to keep the parameters inside a reasonable subset.


data = data.sample(frac=1).reset_index(drop=True)
data_total_len = data[data.columns[0]].size

data_train_frac = 0.9
split_index = math.floor(data_total_len*data_train_frac)

train_data = data.iloc[:split_index]
eval_data = data.iloc[split_index:]

Split the data using scikit-learn instead, using fewer lines!

In [82]:
from sklearn.model_selection import train_test_split

all_X, all_y = preprocess(data)
X_train, X_test, y_train, y_test = train_test_split(all_X, all_y)

In [83]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
#Source What is the difference x-test-x-train-y-test-y-train - https://stackoverflow.com/questions/60636444/what-is-the-difference-between-x-test-x-train-y-test-y-train-in-sklearn

(5, 3) (2, 3) (5, 4) (2, 4)


In [84]:
X_train

Unnamed: 0,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
0,10,Invertebrate,"clam, crab, crayfish, lobster, octopus, scorpi..."
6,4,Amphibian,"frog, frog, newt, toad"
1,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
4,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
5,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."


In [85]:
y_train

Unnamed: 0,Class_Number,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
0,7,10,Invertebrate,"clam, crab, crayfish, lobster, octopus, scorpi..."
6,5,4,Amphibian,"frog, frog, newt, toad"
1,3,5,Reptile,"pitviper, seasnake, slowworm, tortoise, tuatara"
4,1,41,Mammal,"aardvark, antelope, bear, boar, buffalo, calf,..."
5,4,13,Fish,"bass, carp, catfish, chub, dogfish, haddock, h..."


In [86]:
X_test

Unnamed: 0,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
2,8,Bug,"flea, gnat, honeybee, housefly, ladybird, moth..."
3,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."


In [87]:
y_test

Unnamed: 0,Class_Number,Number_Of_Animal_Species_In_Class,Class_Type,Animal_Names
2,6,8,Bug,"flea, gnat, honeybee, housefly, ladybird, moth..."
3,2,20,Bird,"chicken, crow, dove, duck, flamingo, gull, haw..."


## Train and Evaluate the model
It's easy to swap in a different model of your choice.

In [None]:
clf = svm.SVC()
#
clf.fit(X_train, y_train)  

In [None]:
clf.score(X_test, y_test)
#https://stackoverflow.com/questions/55971187/what-does-clf-scorex-train-y-train-evaluate-in-decision-tree

## Predict on some new data
We can predict new values with a one line call.

In [None]:
clf.predict(X_test[15:25])

In [None]:
# Show what the correct answer is
y_test[10:15]