In [22]:
import pandas as pd
import numpy as np
import shutil
import tempfile
from urllib.request import urlretrieve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

import warnings
warnings.filterwarnings('ignore')
temp_data = tempfile.mkdtemp()
path = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
dataset = temp_data + '/abalone.data'
urlretrieve(path,dataset)

dataset_new = pd.read_csv(dataset,sep=',',header=None)
shutil.rmtree(temp_data)
dataset_new.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [23]:
dataset_new.describe()

Unnamed: 0,1,2,3,4,5,6,7,8
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [24]:
dataset_new.shape

(4177, 9)

In [25]:
X = dataset_new.values[:,1:8]
Y = dataset_new.values[:,0]
X_train,X_test,Y_train,Y_test = train_test_split(X,Y)
print('X:','\n',X,'\n')
print('Y:','\n',Y,'\n')
print('X train:','\n',X_train,'\n')
print('Y train:','\n',Y_train,'\n')
print('X test:','\n',X_test,'\n')
print('Y test:','\n',Y_test,'\n')

X: 
 [[0.455 0.365 0.095 ... 0.2245 0.10099999999999999 0.15]
 [0.35 0.265 0.09 ... 0.0995 0.0485 0.07]
 [0.53 0.42 0.135 ... 0.2565 0.1415 0.21]
 ...
 [0.6 0.475 0.205 ... 0.5255 0.2875 0.308]
 [0.625 0.485 0.15 ... 0.531 0.261 0.29600000000000004]
 [0.71 0.555 0.195 ... 0.9455 0.3765 0.495]] 

Y: 
 ['M' 'M' 'F' ... 'M' 'F' 'M'] 

X train: 
 [[0.625 0.5 0.18 ... 0.645 0.303 0.3705]
 [0.655 0.54 0.175 ... 0.7285 0.402 0.385]
 [0.49 0.37 0.105 ... 0.249 0.1005 0.14800000000000002]
 ...
 [0.395 0.295 0.095 ... 0.115 0.0625 0.085]
 [0.57 0.44 0.175 ... 0.3805 0.2285 0.28300000000000003]
 [0.66 0.53 0.185 ... 0.546 0.2705 0.47600000000000003]] 

Y train: 
 ['M' 'F' 'F' ... 'I' 'M' 'F'] 

X test: 
 [[0.7 0.545 0.185 ... 0.75 0.4035 0.3685]
 [0.725 0.565 0.215 ... 0.6975 0.4725 0.58]
 [0.5 0.385 0.115 ... 0.2945 0.138 0.195]
 ...
 [0.36 0.265 0.085 ... 0.0725 0.0515 0.055]
 [0.505 0.405 0.14 ... 0.2665 0.174 0.285]
 [0.545 0.435 0.135 ... 0.37200000000000005 0.14800000000000002
  0.226999999

In [26]:
#gini and entropy classifiers
gini = DecisionTreeClassifier(min_samples_leaf=5,random_state=100,max_depth=3)
print(gini.fit(X_train,Y_train))
entropy = DecisionTreeClassifier(criterion='entropy',min_samples_leaf=5,random_state=100,max_depth=3)
print(entropy.fit(X_train,Y_train))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')


In [27]:
#feature importance
print(gini.feature_importances_)
print(entropy.feature_importances_)

[0.         0.         0.12106273 0.07725233 0.01440962 0.78727533
 0.        ]
[0.         0.         0.13442557 0.82919493 0.0112251  0.02515441
 0.        ]


In [28]:
#score
print(gini.score(X_train,Y_train))
print(entropy.score(X_train,Y_train))

0.5440613026819924
0.545338441890166


In [29]:
Y_pred_gini = gini.predict(X_test)
Y_pred_entropy = entropy.predict(X_test)
print('Y prediction gini:',Y_pred_gini,'\n')
print('*'*50)
print('Y test:',Y_test,'\n')
print('*'*50)
print('Y prediction entropy:',Y_pred_entropy,'\n')

Y prediction gini: ['F' 'F' 'I' ... 'I' 'M' 'M'] 

**************************************************
Y test: ['M' 'M' 'F' ... 'I' 'M' 'I'] 

**************************************************
Y prediction entropy: ['F' 'F' 'M' ... 'I' 'M' 'M'] 



In [30]:
#confusion Matrix
print('confusion matrix gini',confusion_matrix(Y_test,Y_pred_gini),'\n')
print('confusion matrix entropy',confusion_matrix(Y_test,Y_pred_entropy),'\n')

confusion matrix gini [[173  75  88]
 [ 18 274  43]
 [184  92  98]] 

confusion matrix entropy [[146  49 141]
 [ 12 250  73]
 [146  79 149]] 



In [47]:
#accuracy
print('Accuracy score gini',accuracy_score(Y_test,Y_pred_gini)*100,'\n')
print('Accuracy score entropy',accuracy_score(Y_test,Y_pred_entropy)*100,'\n')
acc_decision = round(accuracy_score(Y_test,Y_pred_gini)*100,2)

Accuracy score gini 52.15311004784689 

Accuracy score entropy 52.15311004784689 



In [32]:
#Classification report
print('Classification report gini',classification_report(Y_test,Y_pred_gini))
print('Classification report entropy',classification_report(Y_test,Y_pred_entropy))

Classification report gini               precision    recall  f1-score   support

           F       0.46      0.51      0.49       336
           I       0.62      0.82      0.71       335
           M       0.43      0.26      0.33       374

   micro avg       0.52      0.52      0.52      1045
   macro avg       0.50      0.53      0.51      1045
weighted avg       0.50      0.52      0.50      1045

Classification report entropy               precision    recall  f1-score   support

           F       0.48      0.43      0.46       336
           I       0.66      0.75      0.70       335
           M       0.41      0.40      0.40       374

   micro avg       0.52      0.52      0.52      1045
   macro avg       0.52      0.53      0.52      1045
weighted avg       0.51      0.52      0.52      1045



In [37]:
#logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train,Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train,Y_train)*100,2)
acc_log

54.76

In [38]:
#support vector machine
svc = SVC()
svc.fit(X_train,Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc

53.99

In [39]:
#KNN
knn = KNeighborsClassifier(n_neighbors = 3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn

73.66

In [40]:
# Gaussian Naive Bayes

gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian

52.94

In [41]:
# Perceptron

perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron

45.31

In [42]:
# Linear SVC

linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc

54.85

In [43]:
# Stochastic Gradient Descent

sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd

47.67

In [44]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

100.0

In [48]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression', 
              'Random Forest', 'Naive Bayes', 'Perceptron', 
              'Stochastic Gradient Decent', 'Linear SVC', 
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log, 
              acc_random_forest, acc_gaussian, acc_perceptron, 
              acc_sgd, acc_linear_svc, acc_decision]})
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
3,Random Forest,100.0
1,KNN,73.66
7,Linear SVC,54.85
2,Logistic Regression,54.76
0,Support Vector Machines,53.99
4,Naive Bayes,52.94
8,Decision Tree,52.15
6,Stochastic Gradient Decent,47.67
5,Perceptron,45.31
