In [46]:
"""
GROUP MEMBERS:
  SEAN POSTON
  JONATHAN HARSY
"""
import pandas as pd

In [47]:
# I cleaned the data up to be able to use tabs as the seperator.
data = pd.read_csv('./seeds_dataset.txt', sep='\t', index_col=None, header=None)

# Set column headers
column_headers = ['Area', 'Perimeter', 'Compactness',
                  'Length of Kernel', 'Width of kernel',
                  'Asymmetry Coefficient', 'Length of Kernel Groove', 'Class (1, 2, 3)']

data.columns = column_headers

#pd.set_option('display.max_rows', 10)
data

Unnamed: 0,Area,Perimeter,Compactness,Length of Kernel,Width of kernel,Asymmetry Coefficient,Length of Kernel Groove,"Class (1, 2, 3)"
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


In [48]:
from sklearn.preprocessing import LabelEncoder

X, y = data.loc[:, :'Length of Kernel Groove'].values, data.loc[:, 'Class (1, 2, 3)'].values

le = LabelEncoder()
y = le.fit_transform(y)
le.classes_
le.transform([1, 2, 3])

array([0, 1, 2], dtype=int64)

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
  train_test_split(X, y, 
                    test_size = 0.20,
                    stratify = y,
                    random_state = 1)

In [50]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components = 2),
                        LogisticRegression(random_state = 1))

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
print(f'Test Accuracy: {pipe_lr.score(X_test, y_test)}')                        

Test Accuracy: 0.9523809523809523


In [51]:
from sklearn.model_selection import cross_val_score
import numpy as np
# CROSS VALIDATION
scores = cross_val_score(estimator = pipe_lr,
                         X = X_train,
                         y = y_train,
                         cv = 10,
                         n_jobs = 1)
print(f'CV accuracy scores: {scores}')                         
print(f'CV accuracy: {np.mean(scores)} +/- {np.std(scores)}')

CV accuracy scores: [0.82352941 0.94117647 0.88235294 0.94117647 0.94117647 0.82352941
 1.         1.         1.         0.875     ]
CV accuracy: 0.9227941176470587 +/- 0.0652094376700618


In [52]:
from sklearn.model_selection import learning_curve, validation_curve

# LEARNING CURVE
pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(penalty = 'l2', random_state = 1))

train_sizes, train_scores, test_scores = \
                          learning_curve(estimator = pipe_lr,
                                         X = X_train,
                                         y = y_train,
                                         train_sizes = np.linspace(0.1, 1.0, 10),
                                         cv = 10,
                                         n_jobs = 1)

train_mean = np.mean(train_scores, axis = 1)
train_std = np.std(train_scores, axis = 1)

print(train_mean)
print(train_std)

[0.99333333 0.91333333 0.9        0.92166667 0.91466667 0.92333333
 0.92380952 0.93083333 0.94       0.9397351 ]
[0.02       0.02211083 0.01791613 0.01067187 0.01359739 0.01444444
 0.00952381 0.00916667 0.01120944 0.00463576]


In [62]:
# VALIDATION CURVE
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = \
                      validation_curve(estimator = pipe_lr,
                            X = X_train,
                            y = y_train,
                            param_name = 'logisticregression__C',
                            param_range = param_range,
                            cv = 2)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1) 

print(train_mean)
print(train_std)

[0.85119048 0.88095238 0.92261905 0.94642857 0.98809524 0.99404762]
[0.01785714 0.03571429 0.0297619  0.00595238 0.01190476 0.00595238]


In [64]:
# GRID SEARCH
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

pipe_svc = make_pipeline(StandardScaler(), SVC(random_state = 1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range, 'svc__kernel': ['linear']},
              {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator = pipe_svc,
                  param_grid = param_grid,
                  scoring = 'accuracy',
                  cv = 2,
                  n_jobs = -1)

gs = gs.fit(X_train, y_train)
print(f'Best Score: {gs.best_score_}')
print(f'Best Params: {gs.best_params_}')

Best Score: 0.9226190476190477
Best Params: {'svc__C': 1.0, 'svc__gamma': 0.1, 'svc__kernel': 'rbf'}
