In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory lazily and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade scikit-learn==0.22
!pip install mlrose
!pip install cairocffi
!pip install CairoSVG
!pip install pygal
!pip install sklearn

In [None]:
import logging
import pygal
import mlrose
import mlrose.decay
import numpy as np
import random
import time
import itertools
import sklearn.metrics
import sklearn.preprocessing
import sklearn.datasets

from collections import defaultdict

import argparse
import inspect
import logging
import sys

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob

print('done')

In [None]:
def partition(datasets, probs):
    """Randomly distributes aligned records over ``len(probs)`` buckets.

    Args:
        datasets: Any number of equally sized datasets. They are zipped
            together, so the i-th element of every dataset always lands in
            the same bucket.
        probs: One probability per bucket. A record goes to the first bucket
            whose cumulative probability exceeds a uniform draw from [0, 1];
            if no cumulative probability exceeds the draw the record is
            dropped.

    Returns:
        A list with one entry per bucket. A non-empty bucket is a list
        holding one tuple per dataset (the records "un-zipped" again); an
        empty bucket is an empty list.
    """
    buckets = [[] for _ in probs]
    for record in zip(*datasets):
        draw = random.uniform(0, 1)
        cumulative = 0
        chosen = None
        for index, prob in enumerate(probs):
            cumulative += prob
            if draw < cumulative:
                chosen = index
                break
        # No bucket matched (e.g. probabilities sum to less than the draw):
        # the record is silently skipped.
        if chosen is not None:
            buckets[chosen].append(record)
    return [list(zip(*bucket)) for bucket in buckets]


class MedianBinarizer:
    """Binarizes values around the median seen during ``fit``.

    Values strictly greater than the median map to 1, everything else
    (including values equal to the median) maps to 0.
    """
    def __init__(self):
        # Set by fit(); None means "not fitted yet".
        self._binarizer = None

    def fit(self, x):
        """Learns the median of ``x`` and returns ``self`` for chaining.

        Args:
            x: A flat sequence of numeric values.
        """
        median = np.median(x)
        # `threshold` is passed by keyword: newer scikit-learn releases only
        # accept it as a keyword argument, older ones accept both forms.
        self._binarizer = sklearn.preprocessing.Binarizer(threshold=median)\
            .fit([[i] for i in x])
        return self

    def transform(self, x):
        """Binarizes ``x`` with the fitted median.

        Args:
            x: A flat sequence of numeric values.

        Returns:
            The binarizer's output for ``x`` reshaped to one value per row.

        Raises:
            TypeError: If ``fit`` has not been called.
        """
        if self._binarizer is None:
            raise TypeError("fit has not been called")
        return self._binarizer.transform([[i] for i in x])


def make_boolean(data):
    """Maps numeric labels onto two classes relative to their mean.

    Args:
        data: A ``(samples, labels)`` pair.

    Returns:
        ``(samples, labels)`` where every label strictly above the mean of
        the original labels becomes 1 and every other label becomes -1.
        The samples are passed through untouched.
    """
    samples, raw_labels = data
    mean = sum(raw_labels) / len(raw_labels)
    signed = []
    for value in raw_labels:
        signed.append(1 if value > mean else -1)
    return samples, signed


# Registry of plot functions: "<prefix>_<x_title>" -> (plotfunc, x_title).
PLOTFUNCS = {}


def register_plotfunc(prefix, x_title, plotfunc):
    """Stores ``plotfunc`` and its x-axis title under ``"<prefix>_<x_title>"``."""
    key = str(prefix) + "_" + str(x_title)
    PLOTFUNCS[key] = (plotfunc, x_title)

def get_plotfuncs():
    """Returns the names of all registered plot functions."""
    return [name for name in PLOTFUNCS]


def get_plotfunc(plotfunc):
    """Returns the callable registered under the name ``plotfunc``."""
    func, _ = PLOTFUNCS[plotfunc]
    return func


def get_xtitle(plotfunc):
    """Returns the x-axis title registered under the name ``plotfunc``."""
    _, x_title = PLOTFUNCS[plotfunc]
    return x_title


def run_timer():
    """Creates a primed stopwatch generator.

    Returns:
        A generator; every ``next()`` call on it yields the number of seconds
        elapsed since the previous ``next()`` call (the first one measures
        from the moment ``run_timer`` returned).
    """
    def backend():
        # perf_counter is monotonic, so intervals can never come out negative
        # or be distorted by wall-clock adjustments.
        prev = time.perf_counter()
        while True:
            curr = time.perf_counter()
            yield curr - prev
            prev = curr
    timer = backend()
    # Prime the generator so the caller's first next() measures a real interval.
    next(timer)
    return timer

class Plotter:
    """Collects error and timing measurements and renders them as pygal charts.

    Every measurement is stored as an ``(xval, value)`` point:
        lerr: training error (1 - accuracy) per x value.
        terr: testing error (1 - accuracy) per x value.
        ftimes: time spent in ``classifier.fit`` per x value.
        stimes: time spent predicting and scoring both datasets per x value.
    """

    def __init__(self, xtitle):
        self.xtitle = xtitle
        self.lerr = []
        self.terr = []
        self.ftimes = []
        self.stimes = []

    def _timing_chart(self, y_title, points):
        """Builds a legend-less XY chart holding a single unnamed series."""
        chart = pygal.XY(
            show_legend=False,
            x_title=self.xtitle,
            y_title=y_title)
        chart.add("", points)
        return chart

    @property
    def learning_plot(self):
        """A fresh XY chart of training and testing error."""
        chart = pygal.XY(x_title=self.xtitle, y_title="error")
        for label, points in (("training", self.lerr), ("testing", self.terr)):
            chart.add(label, points)
        return chart

    @property
    def fit_timing_plot(self):
        """A fresh XY chart of the recorded fit times."""
        return self._timing_chart("fit_time", self.ftimes)

    @property
    def score_timing_plot(self):
        """A fresh XY chart of the recorded scoring times."""
        return self._timing_chart("score_time", self.stimes)

    def plot(self, classifier, xval, ldata, tdata):
        """Fits ``classifier`` and records one data point per series.

        Args:
            classifier: Object exposing ``fit(x, y)`` and ``predict(x)``.
            xval: The x coordinate under which the measurements are stored.
            ldata: ``(samples, labels)`` used for fitting and training error.
            tdata: ``(samples, labels)`` used for testing error.
        """
        logging.info("plotting data point %s...", xval)
        stopwatch = run_timer()
        classifier.fit(*ldata)
        self.ftimes.append((xval, next(stopwatch)))

        # Training error first, then testing error; both count towards the
        # scoring time taken below.
        for (inputs, expected), errors in ((ldata, self.lerr), (tdata, self.terr)):
            accuracy = sklearn.metrics.accuracy_score(
                expected, classifier.predict(inputs))
            errors.append((xval, 1 - accuracy))
        self.stimes.append((xval, next(stopwatch)))

    def write(self, outdir, name):
        """Renders the three charts to PNG files.

        The file names are ``outdir + name + suffix + ".png"`` (plain string
        concatenation, so ``outdir`` needs its own trailing separator) with
        the suffixes ``""``, ``"_ftime"`` and ``"_stime"``.
        """
        outputs = (
            ("", "learning_plot"),
            ("_ftime", "fit_timing_plot"),
            ("_stime", "score_timing_plot"),
        )
        for suffix, attribute in outputs:
            chart = getattr(self, attribute)
            stem = name + suffix
            chart.title = stem.replace('_', ' ')
            chart.render_to_png(outdir + (stem + ".png"))
        
print ('common defines done')

In [None]:
# Range of max_iters values swept by iterfunc (before dividing by `factor`).
ITER_MIN = 500
ITER_MAX = 10_000
ITER_STEP = 500


def iterfunc(algorithm, factor=1, nodes=None):
    """Builds a plot function that sweeps the iteration budget.

    Args:
        algorithm: Name of the mlrose optimisation algorithm.
        factor: Divisor applied to ITER_MIN, ITER_MAX and ITER_STEP, so
            slower algorithms can be swept over a smaller range.
        nodes: Hidden layer sizes; ``[10]`` when omitted or empty.

    Returns:
        ``plotfunc(ldata, tdata, plotter)``, which calls ``plotter.plot``
        once per iteration budget with a fresh ``mlrose.NeuralNetwork``.
    """
    def plotfunc(ldata, tdata, plotter):
        first = int(ITER_MIN / factor)
        stop = int(ITER_MAX / factor + 1)
        stride = int(ITER_STEP / factor)
        for max_iter in range(first, stop, stride):
            network = mlrose.NeuralNetwork(
                nodes or [10],
                algorithm=algorithm,
                max_iters=max_iter)
            plotter.plot(network, max_iter, ldata, tdata)
    return plotfunc


# Range of hidden-layer sizes swept by nodefunc.
NODE_MIN = 1
NODE_MAX = 20
NODE_STEP = 1

def nodefunc(algorithm, factor=1):
    """Builds a plot function that sweeps the hidden-layer size.

    Args:
        algorithm: Name of the mlrose optimisation algorithm.
        factor: Divisor applied to the base budget of 4000 iterations.

    Returns:
        ``plotfunc(ldata, tdata, plotter)``, which calls ``plotter.plot``
        once per node count with a fresh single-hidden-layer
        ``mlrose.NeuralNetwork``.
    """
    def plotfunc(ldata, tdata, plotter):
        # Convert to int exactly like iterfunc does: plain division would hand
        # the network a float, and a non-integer one whenever `factor` does
        # not divide 4000.
        max_iters = int(4000 / factor)
        for nodes in range(NODE_MIN, NODE_MAX + 1, NODE_STEP):
            classifier = mlrose.NeuralNetwork(
                [nodes],
                algorithm=algorithm,
                max_iters=max_iters)
            plotter.plot(classifier, nodes, ldata, tdata)
    return plotfunc


# Range of learning rates swept by deltafunc.
DELTA_MIN = .02
DELTA_MAX = .5
DELTA_STEP = .02


def deltafunc(algorithm):
    """Builds a plot function that sweeps the learning rate.

    Args:
        algorithm: Name of the mlrose optimisation algorithm.

    Returns:
        ``plotfunc(ldata, tdata, plotter)``, which calls ``plotter.plot``
        once per learning rate with a fresh 10-node ``mlrose.NeuralNetwork``
        trained for 4000 iterations.
    """
    def learning_rates():
        # Repeated addition starting at DELTA_MIN; the small epsilon keeps
        # DELTA_MAX itself in the sweep despite floating point drift.
        rate = DELTA_MIN
        while rate < DELTA_MAX + 1e-8:
            yield rate
            rate += DELTA_STEP

    def plotfunc(ldata, tdata, plotter):
        for rate in learning_rates():
            network = mlrose.NeuralNetwork(
                [10],
                algorithm=algorithm,
                max_iters=4000,
                learning_rate=rate)
            plotter.plot(network, rate, ldata, tdata)
    return plotfunc

# Range of initial temperatures swept by tempfunc.
TEMP_MIN = .5
TEMP_MAX = 10
TEMP_STEP = .5

def tempfunc():
    """Builds a plot function that sweeps the initial annealing temperature.

    Returns:
        ``plotfunc(ldata, tdata, plotter)``, which calls ``plotter.plot``
        once per temperature with a fresh 10-node simulated-annealing
        ``mlrose.NeuralNetwork`` (2000 iterations, ``GeomDecay`` schedule
        created from that temperature).
    """
    def temperatures():
        # Repeated addition starting at TEMP_MIN; the epsilon keeps TEMP_MAX
        # itself inside the sweep.
        current = TEMP_MIN
        while current < TEMP_MAX + 1e-8:
            yield current
            current += TEMP_STEP

    def plotfunc(ldata, tdata, plotter):
        for temp in temperatures():
            schedule = mlrose.decay.GeomDecay(temp)
            network = mlrose.NeuralNetwork(
                [10],
                algorithm="simulated_annealing",
                max_iters=2000,
                schedule=schedule)
            plotter.plot(network, temp, ldata, tdata)
    return plotfunc


# Iteration and hidden-node sweeps per algorithm. The genetic algorithm gets
# a 50x smaller iteration budget via the `factor` argument.
hillclimb_iter = iterfunc("random_hill_climb")
hillclimb_node = nodefunc("random_hill_climb")
annealing_iter = iterfunc("simulated_annealing")
annealing_node = nodefunc("simulated_annealing")
genetic_iter = iterfunc("genetic_alg", 50)
genetic_node = nodefunc("genetic_alg", 50)

# Simulated annealing iteration sweeps for three hidden-layer sizes.
annealing_iter_2 = iterfunc("simulated_annealing", nodes=[2])
annealing_iter_10 = iterfunc("simulated_annealing", nodes=[10])
annealing_iter_20 = iterfunc("simulated_annealing", nodes=[20])

# Learning-rate sweeps. The annealing variant previously reused
# "random_hill_climb", which made it a duplicate of hillclimb_delta.
hillclimb_delta = deltafunc("random_hill_climb")
annealing_delta = deltafunc("simulated_annealing")

annealing_temp = tempfunc()

print ('neural net defines done')

In [None]:
# Load the asteroid dataset, turn the Y/N flags into integers, drop the
# identifier-like and sparsely populated columns, label-encode every
# remaining column and expose the features / target as x1 / y1.
df_asteroid = pd.read_csv('../input/asteroid-dataset/dataset.csv')

inputs = df_asteroid

# Y/N -> 1/0, missing -> 0. The converted column is assigned back explicitly:
# calling replace(..., inplace=True) on an attribute-accessed column modifies
# a temporary under pandas Copy-on-Write and leaves the frame untouched.
for flag_column in ('pha', 'neo'):
    inputs[flag_column] = (
        inputs[flag_column]
        .replace(('Y', 'N'), (1, 0))
        .fillna(0)
        .astype(int))

inputs = inputs.drop(['id', 'spkid', 'full_name', 'name', 'prefix', 'orbit_id', 'pdes', 'equinox', 'diameter', 'albedo',
                      'diameter_sigma'], axis='columns')
inputs = inputs.dropna()
target = inputs['class']

# Encode into an explicit copy so `inputs` / `target` keep the raw values and
# the column assignments below never write into a slice of another frame.
df = inputs.copy()
inputs1 = inputs.drop(['class'], axis='columns')
df_x = inputs1.columns
df_y = 'class'
# NOTE(review): every column is label-encoded, including the numeric ones
# (their values become ordinal codes) -- confirm this is intended.
to_encode = inputs.columns

print(df)
le = sklearn.preprocessing.LabelEncoder
encoderDict = defaultdict(le)
for column in to_encode:
    print('Encoding: ' + column)
    # Rows with missing values were already removed by dropna() above, so no
    # per-column null filtering is needed here.
    df[column] = encoderDict[column].fit_transform(df[column])
    print(encoderDict[column].classes_)

print(df_x)
print(df_y)

print(df.columns)
print(df.dtypes)

# NOTE(review): only the first 5000 rows of the file are kept (the shuffle
# happens afterwards) -- confirm a head-of-file subset is what is wanted.
df = df.head(5000)

df = df.dropna()

df = df.sample(frac=1).reset_index(drop=True)
print('Dataset size: ' + str(df.size))
print('Features: ' + str(df_x))
print('Target Decision: ' + df_y)

x1 = df.loc[:, df_x]
y1 = df.loc[:, df_y]
print('ready with x1 and y1')

In [None]:
# Convert the pandas objects to NumPy arrays. The guards make the cell safe
# to re-run: once x1 / y1 are arrays they no longer have a to_numpy method.
if hasattr(x1, 'to_numpy'):
    x1 = x1.to_numpy()
if hasattr(y1, 'to_numpy'):
    y1 = y1.to_numpy()
print('dataframe to array')

In [None]:
# Random 80/20 split into learning (l*) and testing (t*) partitions.
learning_part, testing_part = partition((x1, y1), (.8, .2))
lx1, ly1 = learning_part
tx1, ty1 = testing_part

# Standardise the features with statistics taken from the learning set only.
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(lx1)
lx1, tx1 = scaler.transform(lx1), scaler.transform(tx1)

# Binarise the labels around the median of the learning labels.
binarizer = MedianBinarizer()
binarizer.fit(ly1)
ly1, ty1 = binarizer.transform(ly1), binarizer.transform(ty1)
print('ready with ly1 and ty1')

In [None]:
# Silence NumPy overflow warnings for the experiments that follow.
np.seterr(over="ignore")

# (prefix, x_title, plot function) -- registered in this order under the
# name "<prefix>_<x_title>".
for prefix, x_title, registered_func in (
        ("neuralnet_hillclimb", "max_iter", hillclimb_iter),
        ("neuralnet_hillclimb", "hidden_nodes", hillclimb_node),
        ("neuralnet_annealing", "max_iter", annealing_iter),
        ("neuralnet_annealing", "hidden_nodes", annealing_node),
        ("neuralnet_genetic", "max_iter", genetic_iter),
        ("neuralnet_genetic", "hidden_nodes", genetic_node),
        ("neuralnet_annealing_2node", "max_iter", annealing_iter_2),
        ("neuralnet_annealing_10node", "max_iter", annealing_iter_10),
        ("neuralnet_annealing_20node", "max_iter", annealing_iter_20),
        ("neuralnet_hillclimb", "delta", hillclimb_delta),
        ("neuralnet_annealing", "delta", annealing_delta),
        ("neuralnet_annealing", "init_temp", annealing_temp)):
    register_plotfunc(prefix, x_title, registered_func)
print('register complete')

In [None]:
# Bundle features and labels into the (samples, labels) pairs that the plot
# functions expect: learning data first, testing data second.
ldata, tdata = (lx1, ly1), (tx1, ty1)
print('ready for part1 and part2 from here')

In [None]:
# Names of the experiments to run, without their "neuralnet_" prefix.
plotfuncs = [
    'neuralnet_nodes',
    'hillclimb_max_iter',
    'hillclimb_hidden_nodes',
    'annealing_max_iter',
    'annealing_hidden_nodes',
    'genetic_max_iter',
    'genetic_hidden_nodes',
    'annealing_2node_max_iter',
    'annealing_10node_max_iter',
    'annealing_20node_max_iter',
    'hillclimb_delta',
    'annealing_delta',
    'annealing_init_temp',
]
print('ready with plotfuncs')

In [None]:
# "neuralnet_nodes" is not a registered plot function; take it out of the
# list (if it is there) before the list is iterated.
try:
    plotfuncs.remove("neuralnet_nodes")
except ValueError:
    pass
print('check for neuralnodes for the next step')

In [None]:
# Compare simulated annealing with 2, 10 and 20 hidden nodes across the
# max_iter sweep and render the combined error and fit-time charts.
funcs = [
    get_plotfunc("neuralnet_annealing_2node_max_iter"),
    get_plotfunc("neuralnet_annealing_10node_max_iter"),
    get_plotfunc("neuralnet_annealing_20node_max_iter")]
titles = ["2 nodes", "10 nodes", "20 nodes"]
plots = [Plotter("max_iter") for _ in range(3)]

for func, plot in zip(funcs, plots):
    func(ldata, tdata, plot)
# The series added below are the plotters' lerr / terr error rates, so the
# axis is labelled "error" (it used to say "score").
splot = pygal.XY(x_title="max_iter", y_title="error")
tplot = pygal.XY(x_title="max_iter", y_title="fit_time")
for plotter, title in zip(plots, titles):
    print('plotting for ' + title)
    splot.add(title + " training", plotter.lerr)
    splot.add(title + " testing", plotter.terr)
    tplot.add(title, plotter.ftimes)
splot.title = "neuralnet nodes"
tplot.title = "neuralnet nodes ftime"
# Both files are written to the current working directory.
splot.render_to_png("neuralnet_nodes.png")
tplot.render_to_png("neuralnet_nodes_ftime.png")
print('done with splot and tplot')

In [None]:
# Run every selected experiment and write its charts to the working directory.
for plotfunc in plotfuncs:
    plot = get_plotfunc('neuralnet_' + plotfunc)
    xtitle = get_xtitle('neuralnet_' + plotfunc)

    print("plotting " + plotfunc + "...")
    plotter = Plotter(xtitle)
    try:
        plot(ldata, tdata, plotter)
    except KeyboardInterrupt:
        print("caught keyboard interrupt. plotting and continuing.")
        # Write the partial results next to the complete ones. Passing the
        # experiment name as `outdir` (as before) doubled it in the file name.
        plotter.write('', plotfunc)
        continue
    except Exception as error:
        # Keep going with the next experiment, but say what went wrong
        # instead of silently discarding the exception.
        print("error in " + plotfunc + ": " + repr(error) + ". continuing...")
        continue
    plotter.write('', plotfunc)
print ('done with plotting')

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import glob

# Display every rendered chart. glob returns paths in arbitrary order, so they
# are sorted to make the display order the same on every run.
for image_path in sorted(glob.glob("/kaggle/working/*.png")):
    img = mpimg.imread(image_path)
    plt.ion()
    plt.figure()
    plt.axis('off')
    plt.imshow(img)
    plt.show()
    # Close the figure so the open figures do not pile up across the loop.
    plt.close()