In [104]:
"""
GROUP MEMBERS:
  SEAN POSTON
  JONATHAN HARSY
  EVAN WILZBACH
"""
"""
  The assignment asked us to scrape a dataset in JSON format, but we could not find a
  suitable one. To show that it can be done anyway, we fetched the GME stock data as
  JSON and converted it to a workable DataFrame (see the commented-out cell below).
  The dataset we actually use, Dry Beans, is workable but is distributed as an Excel
  (.xlsx) file inside a zip archive rather than as JSON.
  Hopefully we can still get the points for scraping a JSON file.
"""
import pandas as pd
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [105]:
# Download the Dry Beans zip archive from the UCI server and load the Excel
# sheet it contains into a DataFrame.
# Context managers guarantee that the HTTP response, the archive and the
# extracted member are all closed, even if the download or the parse fails.
with urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00602/DryBeanDataset.zip') as resp:
    # The whole archive is buffered in memory so ZipFile gets a seekable file object.
    archive_bytes = BytesIO(resp.read())

# Open the downloaded archive, list its members, and parse the .xlsx member
with ZipFile(archive_bytes) as archive:
    print(archive.namelist())
    with archive.open('DryBeanDataset/Dry_Bean_Dataset.xlsx') as sheet:
        data = pd.read_excel(sheet)

data

['DryBeanDataset/', 'DryBeanDataset/Dry_Bean_Dataset.arff', 'DryBeanDataset/Dry_Bean_Dataset.txt', 'DryBeanDataset/Dry_Bean_Dataset.xlsx']


Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272750,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380,624.110,212.826130,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.333680,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385,DERMASON
13607,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219,DERMASON
13608,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
13609,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653248,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [106]:
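# This cell is intentionally unused, as explained in the first cell.
# It scrapes the GME data as JSON from the Nasdaq site. The API key has been redacted:
# never commit credentials; read the key from an environment variable instead.
# scraped_data = pd.read_json('https://data.nasdaq.com/api/v3/datasets/WIKI/GME.json?api_key=<YOUR_API_KEY>', orient='index') #🚀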

# Put NASDAQ JSON data into DataFrame
# table_data = scraped_data.data
# data = pd.DataFrame(table_data[0])
# data.columns = scraped_data.column_names[0]
# data

In [107]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column from 'Area' through 'ShapeFactor4'
# (.loc slices by label, so both end columns are included).
X = data.loc[:, 'Area':'ShapeFactor4'].values
# Target: the string class labels stored in the 'Class' column.
raw_labels = data.loc[:, 'Class'].values

# Encode the string labels as integers; le.classes_ keeps the original labels
# so predictions can be mapped back with le.inverse_transform.
le = LabelEncoder()
y = le.fit_transform(raw_labels)

# Display the encoded target. y already is the encoding of data['Class'],
# so a second le.transform call (and the discarded le.classes_ expression) is not needed.
y

array([5, 5, 5, ..., 3, 3, 3])

In [108]:
# Train 
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on y and fixing the
# seed are both passed explicitly so the split is class-balanced and repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=1,
)

In [109]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Pipeline: standardize -> keep 2 principal components -> logistic regression.
pipe_lr = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    LogisticRegression(random_state=1),
)

# Fit on the training split, then predict and score on the held-out split.
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
print(f'Test Accuracy: {pipe_lr.score(X_test, y_test)}')

Test Accuracy: 0.8736687477047375


In [114]:
# Cross Val
from sklearn.model_selection import cross_val_score
import numpy as np
# CROSS VALIDATION
# 10-fold cross-validation of pipe_lr, using the training split only.
# NOTE(review): the recorded mean (~0.92) is well above the 0.87 test accuracy
# printed for the PCA pipeline, and the execution counts jump from 109 to 114.
# Presumably this output was produced while pipe_lr was bound to the no-PCA
# pipeline built in the learning-curve cell -- re-run from a fresh kernel to confirm.
scores = cross_val_score(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=2,
)
print(f'CV accuracy scores: {scores}')
print(f'CV accuracy: {np.mean(scores)} +/- {np.std(scores)}')

CV accuracy scores: [0.92011019 0.92653811 0.91827365 0.9164371  0.93204775 0.92653811
 0.94123049 0.92929293 0.91084559 0.92003676]
CV accuracy: 0.9241350672500405 +/- 0.008344085407396562


In [115]:
# Learning and Validation Curve
from sklearn.model_selection import learning_curve, validation_curve

# LEARNING CURVE
# Rebuild pipe_lr as scaler + L2-penalised logistic regression (no PCA step).
# NOTE: this rebinds pipe_lr, so any cell executed after this one uses the
# no-PCA pipeline instead of the one defined earlier.
pipe_lr = make_pipeline(
    StandardScaler(),
    LogisticRegression(penalty='l2', random_state=1),
)

# Ten training-set fractions, evenly spaced from 10% to 100%, each scored with 10-fold CV.
size_fractions = np.linspace(0.1, 1.0, 10)
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=size_fractions,
    cv=10,
    n_jobs=2,
)

# Reduce along axis 1 to get one mean / std per training-set size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)

print(train_mean)
print(train_std)

[0.92911134 0.92052067 0.91816945 0.91798928 0.9213513  0.92437489
 0.92558682 0.92564103 0.92418642 0.92512501]
[0.00429009 0.00294171 0.00121017 0.00303681 0.00109182 0.00129587
 0.00152381 0.0013566  0.00133898 0.00088738]


In [121]:
# VALIDATION CURVE
# Sweep the C parameter of the pipeline's logistic-regression step over six
# decades and score each setting with 2-fold cross-validation.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    param_name='logisticregression__C',
    param_range=param_range,
    cv=2,
    n_jobs=2,
)

# Reduce along axis 1 to get one mean / std per value in param_range.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Only the training-side statistics are printed here.
print(train_mean)
print(train_std)

[0.86572373 0.9171565  0.92432035 0.9252388  0.92634093 0.92560617]
[0.00202057 0.00440852 0.0031227  0.00293902 0.0031227  0.00293902]


In [124]:
# GRID SEARCH
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Scaler + support-vector classifier; the grid below tunes the 'svc' step.
pipe_svc = make_pipeline(StandardScaler(), SVC(random_state=1))

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# Two sub-grids: the linear kernel only varies C, the RBF kernel varies C and gamma.
linear_grid = {'svc__C': param_range, 'svc__kernel': ['linear']}
rbf_grid = {'svc__C': param_range, 'svc__gamma': param_range, 'svc__kernel': ['rbf']}
param_grid = [linear_grid, rbf_grid]

# Exhaustive search scored by accuracy with 2-fold cross-validation on the training split.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    n_jobs=2,
)
gs = gs.fit(X_train, y_train)

print(f'Best Score: {gs.best_score_}')
print(f'Best Params: {gs.best_params_}')

Best Score: 0.9279941219691403
Best Params: {'svc__C': 100.0, 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
