In [1]:
import pandas as pd
import numpy as np
import os
import time
import random
import sympy as sp
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import train_test_split
plt.style.use('seaborn-whitegrid')
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
# Mount Google drive folder
# Colab-only step: makes the Drive contents reachable under /content/drive.
# force_remount = True asks for the mount to be redone even if Drive is
# already mounted, so re-running this cell is safe.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [3]:
# Setup working directory and data directory
# DIR is the project folder on the mounted Google Drive; DATA_DIR is its
# simulated-data subfolder (note that the string keeps a trailing slash).
DIR = '/content/drive/My Drive/Colab Notebooks/EvenSem2021MAHE/Classifiers'
DATA_DIR = DIR + '/Data/SimulatedData/'
# Make DIR the current working directory -- presumably so that the local
# LinearClassifiers package imported later in the notebook can be found.
os.chdir(DIR)

In [4]:
# Function to simulate data
def GenerateSimulatedData(num_samples = 9000, num_features = 2, num_labels = 3, seed = None):
    """Simulate a 2-D spiral data set with one noisy spiral arm per class.

    Samples are stored class after class: all samples of class 0 first, then
    class 1, and so on. Within a class the radius grows linearly from 0 to 1
    while the angle sweeps the interval [4*j, 4*(j+1)] plus Gaussian noise
    (standard deviation 0.2).

    Parameters
    ----------
    num_samples : int
        Total number of samples. If it is not a multiple of ``num_labels``
        the first ``num_samples % num_labels`` classes receive one extra
        sample each, so every column of ``X`` is a real sample.
    num_features : int
        Dimension of each sample. The spiral is two-dimensional, so only the
        value 2 is supported.
    num_labels : int
        Number of classes (spiral arms). Labels are stored as ``uint8``.
    seed : int or None
        If given, the angular noise is drawn from a private
        ``np.random.RandomState(seed)``, making the result reproducible.
        If None (default) NumPy's global random state is used.

    Returns
    -------
    X : ndarray of shape (num_features, num_samples)
        Data matrix, each column is a single sample.
    y : ndarray of shape (num_samples,), dtype uint8
        Class label of each column of ``X``.

    Raises
    ------
    ValueError
        If ``num_features`` is not 2 or ``num_labels`` is smaller than 1.
    """
    if num_features != 2:
        raise ValueError('GenerateSimulatedData generates 2-D spirals; '
                         'num_features must be 2, got %r' % (num_features,))
    if num_labels < 1:
        raise ValueError('num_labels must be at least 1, got %r' % (num_labels,))

    # Both the np.random module and a RandomState instance expose randn(), so
    # the unseeded path keeps drawing from the global stream.
    rng = np.random if seed is None else np.random.RandomState(seed)

    X = np.zeros((num_features, num_samples)) # data matrix (each column = single sample)
    y = np.zeros(num_samples, dtype = 'uint8') # class labels

    # Spread any remainder over the first classes instead of leaving the
    # trailing columns as all-zero samples labelled 0.
    base_count, remainder = divmod(num_samples, num_labels)
    start = 0
    for j in range(num_labels):
        count = base_count + (1 if j < remainder else 0)
        stop = start + count
        r = np.linspace(0.0, 1.0, count) # radius
        t = np.linspace(j*4, (j+1)*4, count) + rng.randn(count)*0.2 # theta
        X[:, start:stop] = np.vstack((r*np.sin(t), r*np.cos(t)))
        y[start:stop] = j
        start = stop

    return X, y

In [5]:
# Simulate data (this will later change to loading data from the DATA_DIR folder)
num_samples = 300
num_features = 2
num_labels = 3
# Seed NumPy's global generator right before drawing the noise so that a
# Restart & Run All reproduces exactly the same data set.
np.random.seed(0)
X, y =  GenerateSimulatedData(num_samples, num_features, num_labels)

In [6]:
# Bias trick: append the bias dimension of ones to the data matrix so that the
# classifier can deal with a single weight matrix W.
# The guard makes this cell idempotent: running it a second time would
# otherwise stack another row of ones and bump num_features again. A last row
# consisting only of ones is taken as the sign that the bias row is present.
if not np.all(X[-1, :] == 1):
    X = np.vstack([X, np.ones((1, X.shape[1]))])
    num_features += 1

In [7]:
# Split data into train-validation-test sets
train_proportion = 0.7
validate_proportion = 0.2

# Round the split points instead of truncating them: floating-point error can
# leave a product just below the intended integer (e.g. (0.7+0.2)*10 evaluates
# to 8.999...), and int() alone would then move one sample to the wrong set.
train_end = int(round(train_proportion*num_samples))
validate_end = int(round((train_proportion+validate_proportion)*num_samples))

# Shuffle the sample indices with a dedicated, seeded generator so the split is
# the same on every run and the global `random` state is left untouched.
idx = random.Random(0).sample(range(num_samples), num_samples)

# Columns [0, train_end) -> train, [train_end, validate_end) -> validation,
# the rest -> test. The labels are split at the same positions.
X_train, X_validate, X_test = \
  np.split(X[:, idx], [train_end, validate_end], axis = 1)

y_train, y_validate, y_test = \
  np.split(y[idx], [train_end, validate_end])

print('Training data size = %d'%(X_train.shape[1]))
print('Validation data size = %d'%(X_validate.shape[1]))
print('Test data size = %d'%(X_test.shape[1]))

Training data size = 210
Validation data size = 60
Test data size = 30


In [8]:
from LinearClassifiers.linear_classifier import *

In [9]:
# Settings for the training run, gathered in one place for easy tuning.
train_kwargs = dict(learning_rate = 1e-0, reg = 1e-3, num_iters = 300,
                    batch_size = 128, verbose = True)

# Classifier to train (the commented line keeps the author's alternative).
softmaxclassifierObject = Softmax()
#softmaxclassifierObject = LinearSVM()

# train() returns the value stored in loss_hist, which is plotted further down.
loss_hist = softmaxclassifierObject.train(X_train, y_train, **train_kwargs)

Iteration: 0 / 300, Loss: 1.101371
Iteration: 10 / 300, Loss: 0.893127
Iteration: 20 / 300, Loss: 0.806388
Iteration: 30 / 300, Loss: 0.792027
Iteration: 40 / 300, Loss: 0.782225
Iteration: 50 / 300, Loss: 0.770568
Iteration: 60 / 300, Loss: 0.724405
Iteration: 70 / 300, Loss: 0.756020
Iteration: 80 / 300, Loss: 0.730247
Iteration: 90 / 300, Loss: 0.693905
Iteration: 100 / 300, Loss: 0.763590
Iteration: 110 / 300, Loss: 0.740124
Iteration: 120 / 300, Loss: 0.779818
Iteration: 130 / 300, Loss: 0.688423
Iteration: 140 / 300, Loss: 0.762572
Iteration: 150 / 300, Loss: 0.745677
Iteration: 160 / 300, Loss: 0.738659
Iteration: 170 / 300, Loss: 0.723538
Iteration: 180 / 300, Loss: 0.764119
Iteration: 190 / 300, Loss: 0.806379
Iteration: 200 / 300, Loss: 0.761869
Iteration: 210 / 300, Loss: 0.795389
Iteration: 220 / 300, Loss: 0.704899
Iteration: 230 / 300, Loss: 0.719670
Iteration: 240 / 300, Loss: 0.753902
Iteration: 250 / 300, Loss: 0.712212
Iteration: 260 / 300, Loss: 0.720083
Iteration: 2

In [None]:
# Debugging aid: draw the loss recorded during training against the
# iteration number to see how the optimisation progressed.
loss_ax = plt.gca()  # current axes, i.e. the ones plt.plot() would draw on
loss_ax.plot(loss_hist)
loss_ax.set_xlabel('Iteration number')
loss_ax.set_ylabel('Loss value')
plt.show()

In [None]:
# Accuracy (fraction of correctly predicted labels) on the training set ...
y_train_pred = softmaxclassifierObject.predict(X_train)
train_accuracy = np.mean(y_train_pred == y_train)
print('training accuracy: %f' % (train_accuracy, ))

# ... and on the validation set
y_validate_pred = softmaxclassifierObject.predict(X_validate)
validate_accuracy = np.mean(y_validate_pred == y_validate)
print('validation accuracy: %f' % (validate_accuracy, ))

In [None]:
# Split the trained parameter matrix into weights (all but the last column)
# and bias (the last column, kept as a column vector with num_labels rows).
W_learned = softmaxclassifierObject.W[:, :-1]
print('Learned weights =')
print(W_learned)
b_learned = softmaxclassifierObject.W[:, -1].reshape(num_labels,-1)
print('Learned bias =')
print(b_learned)

def f(x1, x2):
    """Return, for each point (x1[i], x2[i]), the index of the highest class score."""
    scores = np.dot(W_learned, np.array([x1, x2])) + b_learned
    return scores.argmax(axis = 0)

# 1000 x 1000 grid that extends one unit beyond the data range on every side
x1_low, x1_high = X[0, :].min() - 1, X[0, :].max() + 1
x2_low, x2_high = X[1, :].min() - 1, X[1, :].max() + 1
x1, x2 = np.mgrid[x1_low:x1_high:1000j, x2_low:x2_high:1000j]

# Predicted category of every grid point, arranged back into the grid shape
S = f(x1.ravel(), x2.ravel()).reshape(x1.shape)

# Filled decision regions with all samples drawn on top, coloured by label
fig, ax = plt.subplots(figsize = (6, 6))
fig.tight_layout(pad = 4.0)
ax.contourf(x1, x2, S, cmap = plt.cm.Spectral, alpha = 0.8)
ax.scatter(X[0, :], X[1, :], c = y, s = 40, cmap = plt.cm.Spectral)
ax.set_xlim(x1.min(), x1.max())
ax.set_ylim(x2.min(), x2.max())
#fig.savefig('spiral_linear.png')
