In [None]:
import numpy as np
import pandas as pd
import struct
from array import array
from os.path  import join
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.special import logit, expit
import matplotlib.pyplot as plt 
from matplotlib.pyplot import figure

# Part A: MNIST Dataset

### Dataset Prep

In [None]:
class MnistDataloader(object):
    """Reader for an MNIST-style dataset stored as raw IDX files.

    The loader only stores the four file paths; nothing is read until
    :meth:`load_data` (or :meth:`read_images_labels`) is called.
    """

    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        """Remember where the training/test image and label files live."""
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Parse one image file together with its label file.

        Both files start with big-endian unsigned 32-bit header fields: the
        label file has ``(magic, count)`` and the image file has
        ``(magic, count, rows, cols)``. The remaining bytes are the payload.

        Parameters
        ----------
        images_filepath : str
            Path of the image file (magic number 2051).
        labels_filepath : str
            Path of the label file (magic number 2049).

        Returns
        -------
        images : list
            One entry per image; each entry is a list of ``rows`` uint8
            NumPy arrays of length ``cols``.
        labels : array.array
            Unsigned-byte labels (typecode ``"B"``), one per image.

        Raises
        ------
        ValueError
            If a magic number is wrong, the pixel payload is shorter than the
            header announces, or the label and image counts disagree.
        """
        with open(labels_filepath, 'rb') as file:
            magic, label_count = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())

        expected_pixels = size * rows * cols
        if len(image_data) < expected_pixels:
            raise ValueError('Image file is truncated, expected {} pixel bytes, got {}'.format(
                expected_pixels, len(image_data)))
        if len(labels) != size:
            raise ValueError('Label/image count mismatch, got {} labels for {} images'.format(
                len(labels), size))

        # Use the dimensions announced by the header instead of assuming 28x28,
        # so files with any image size are decoded correctly.
        pixels = np.array(image_data[:expected_pixels]).reshape(size, rows, cols)
        # Keep the historical return structure: a list of images, each a list of rows.
        images = [list(frame) for frame in pixels]

        return images, labels

    def load_data(self):
        """Read both splits and return ``(x_train, y_train), (x_test, y_test)``."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

In [None]:
# Resolve the four raw MNIST files below the dataset directory, then read both
# splits into notebook-level variables.
input_path = '../input/mnist-dataset'
(
    training_images_filepath,
    training_labels_filepath,
    test_images_filepath,
    test_labels_filepath,
) = (
    join(input_path, relative_name)
    for relative_name in (
        'train-images-idx3-ubyte/train-images-idx3-ubyte',
        'train-labels-idx1-ubyte/train-labels-idx1-ubyte',
        't10k-images-idx3-ubyte/t10k-images-idx3-ubyte',
        't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte',
    )
)

mnist_dataloader = MnistDataloader(
    training_images_filepath=training_images_filepath,
    training_labels_filepath=training_labels_filepath,
    test_images_filepath=test_images_filepath,
    test_labels_filepath=test_labels_filepath,
)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()

In [None]:
import random
import matplotlib.pyplot as plt
def show_images(images, title_texts, cols=5):
    """Draw ``images`` on a grid with ``cols`` images per row.

    Parameters
    ----------
    images : sequence
        Images accepted by ``plt.imshow``; drawn with the gray colormap.
    title_texts : sequence of str
        One title per image; an empty string leaves that subplot untitled.
        Images and titles are paired with ``zip``, so extra items in the
        longer sequence are ignored.
    cols : int, optional
        Number of grid columns (default 5).
    """
    # Ceiling division: the previous "+ 1" produced an empty trailing row
    # whenever the image count was an exact multiple of ``cols``.
    rows = max(1, -(-len(images) // cols))
    plt.figure(figsize=(30,20))
    for index, (image, title_text) in enumerate(zip(images, title_texts), start=1):
        plt.subplot(rows, cols, index)
        plt.imshow(image, cmap=plt.cm.gray)
        if title_text != '':
            plt.title(title_text, fontsize = 15)
        
# Preview 10 random training images and 5 random test images with their labels.
images_2_show = []
titles_2_show = []
for i in range(0, 10):
    # randrange yields a valid index in [0, len); randint(1, 60000) included
    # the out-of-range index 60000 and could never select index 0.
    r = random.randrange(len(x_train))
    images_2_show.append(x_train[r])
    titles_2_show.append('training image [' + str(r) + '] = ' + str(y_train[r]))

for i in range(0, 5):
    r = random.randrange(len(x_test))
    images_2_show.append(x_test[r])
    titles_2_show.append('test image [' + str(r) + '] = ' + str(y_test[r]))

show_images(images_2_show, titles_2_show)

In [None]:
# Show the distinct labels and how often each occurs, first for the training
# labels and then for the test labels.
for _split_labels in (y_train, y_test):
    print(np.unique(_split_labels, return_counts=True))

In [None]:
class MnistDataProcessor:
    """Two-class view of MNIST for binary classification.

    Loads the dataset through ``MnistDataloader`` and remembers which sample
    indices belong to the positive and to the negative digit. The data
    accessors return flattened images with labels ``+1`` (positive digit)
    and ``-1`` (negative digit).
    """

    def __init__(self, pos_class, neg_class, training_samples=2000,
                 input_path='../input/mnist-dataset'):
        """Load MNIST and index the two requested classes.

        Parameters
        ----------
        pos_class : int
            Digit that is labelled ``+1``.
        neg_class : int
            Digit that is labelled ``-1``.
        training_samples : int, optional
            Maximum number of training images drawn per class by
            :meth:`train_data` (default 2000).
        input_path : str, optional
            Directory containing the four MNIST sub-directories; defaults to
            the location that used to be hard-coded here.
        """
        training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
        training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
        test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
        test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')
        mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
        (self.__x_train, self.__y_train), (self.__x_test, self.__y_test) = mnist_dataloader.load_data()
        self.__train_pos_class = self._class_indices(self.__y_train, pos_class)
        self.__train_neg_class = self._class_indices(self.__y_train, neg_class)
        self.__test_pos_class = self._class_indices(self.__y_test, pos_class)
        self.__test_neg_class = self._class_indices(self.__y_test, neg_class)
        self.__train_samples = training_samples

    @staticmethod
    def _class_indices(labels, digit):
        """Return the positions in ``labels`` whose value equals ``digit``."""
        # Convert explicitly so the element-wise comparison does not depend on
        # how the label container implements ``==``.
        return np.where(np.asarray(labels) == np.uint8(digit))[0]

    @staticmethod
    def _assemble(images, pos_idx, neg_idx):
        """Stack the selected images (positives first) and flatten each one."""
        stacked = np.array([images[i] for i in pos_idx] + [images[i] for i in neg_idx])
        nsamples, nx, ny = stacked.shape
        vectors = stacked.reshape((nsamples, nx * ny))
        labels = [1] * len(pos_idx) + [-1] * len(neg_idx)
        return vectors, labels

    def train_data(self):
        """Draw a fresh random training subset.

        Both index arrays are shuffled in place with the global ``random``
        generator and the first ``training_samples`` entries of each are
        used, so successive calls generally return different subsets.

        Returns
        -------
        train_vector : numpy.ndarray
            One flattened image per row, positives first.
        train_label : list of int
            ``1`` for every positive row followed by ``-1`` for every
            negative row.
        """
        random.shuffle(self.__train_pos_class)
        random.shuffle(self.__train_neg_class)
        train_positive_class_idx = self.__train_pos_class[:self.__train_samples]
        train_negative_class_idx = self.__train_neg_class[:self.__train_samples]
        return self._assemble(self.__x_train, train_positive_class_idx, train_negative_class_idx)

    def test_data(self):
        """Return every test image of the two classes, flattened.

        The per-class order is shuffled on each call; the set of samples is
        always the same.

        Returns
        -------
        test_vector : numpy.ndarray
            One flattened image per row, positives first.
        test_label : list of int
            ``1`` for every positive row followed by ``-1`` for every
            negative row.
        """
        random.shuffle(self.__test_pos_class)
        random.shuffle(self.__test_neg_class)
        return self._assemble(self.__x_test, self.__test_pos_class, self.__test_neg_class)

### Classification using linear regression

In [None]:
# Five runs of "3 vs 8" classification with ordinary least squares: each run
# draws a fresh training subset, fits on the +1/-1 labels and classifies by
# the sign of the prediction.
accuracy_arr = []
dataset = MnistDataProcessor(3, 8)
for run in range(5):
    X, y = dataset.train_data()
    reg = LinearRegression().fit(X, y)
    # Separate names keep the notebook-level MNIST x_test / y_test intact, so
    # earlier cells can still be re-run after this one.
    X_eval, y_eval = dataset.test_data()
    y_pred = reg.predict(X_eval)
    # A non-negative regression output is read as the positive class.
    y_pred_1 = np.where(y_pred >= 0, 1, -1)
    # accuracy_score expects the ground truth first, then the predictions.
    accuracy_arr.append(accuracy_score(y_eval, y_pred_1))
list(zip(range(1,6), accuracy_arr))

In [None]:
# Report the mean and the standard deviation of the collected accuracies.
for _template, _statistic in (
    ("Average accuracy: {}", np.mean),
    ("Standard deviation: {}", np.std),
):
    print(_template.format(_statistic(accuracy_arr)))

### Classification using logistic regression

In [None]:
# Five runs of "3 vs 8" classification with logistic regression: each run
# draws a fresh training subset and is scored on the two-class test set.
accuracy_arr = []
dataset = MnistDataProcessor(3, 8)
for run in range(5):
    X, y = dataset.train_data()
    reg = LogisticRegression(random_state=0, max_iter = 1000).fit(X, y)
    # Separate names keep the notebook-level MNIST x_test / y_test intact, so
    # earlier cells can still be re-run after this one.
    X_eval, y_eval = dataset.test_data()
    y_pred = reg.predict(X_eval)
    # accuracy_score expects the ground truth first, then the predictions.
    accuracy_arr.append(accuracy_score(y_eval, y_pred))
list(zip(range(1,6), accuracy_arr))

In [None]:
# Report the mean and the standard deviation of the collected accuracies.
for _template, _statistic in (
    ("Average accuracy: {}", np.mean),
    ("Standard deviation: {}", np.std),
):
    print(_template.format(_statistic(accuracy_arr)))

# Part B: California housing prices

### Dataset Prep

In [None]:
class HousingDataLoader:
    """Loads the housing CSV once and hands out shuffled train/test splits."""

    def __init__(self, csv_path='../input/california-housing-prices/housing.csv'):
        """Read and pre-clean the housing table.

        Parameters
        ----------
        csv_path : str, optional
            Location of the CSV file; defaults to the path that used to be
            hard-coded here.
        """
        raw_data = pd.read_csv(csv_path)
        # Every missing cell is replaced by the constant 1.
        # NOTE(review): a constant fill is a crude imputation; kept as-is so
        # results stay comparable - confirm whether e.g. a median fill is wanted.
        raw_data = raw_data.fillna(1)
        self.raw_data = raw_data.drop(columns=['longitude', 'latitude'])

    def get_data(self, test_size=0.25):
        """Shuffle the table and split it into train and test parts.

        The rows are reshuffled without a seed on every call, so each call
        yields a different split even though ``train_test_split`` itself is
        given a fixed ``random_state``.

        Parameters
        ----------
        test_size : float, optional
            Value forwarded to ``train_test_split`` (default 0.25).

        Returns
        -------
        The result of ``train_test_split``: ``X_train, X_test, y_train,
        y_test`` with the six numeric feature columns as ``X`` and
        ``median_house_value`` as ``y``.
        """
        X_cols = ['housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']
        Y_col = 'median_house_value'
        data = self.raw_data.sample(frac=1).reset_index(drop=True)
        return train_test_split(data[X_cols], data[Y_col], test_size=test_size, random_state=42)

### Median house price prediction using Linear regression

In [None]:
# Five independent runs: draw a new shuffled split, fit ordinary least squares
# on the raw prices, plot actual against predicted test values and record the
# test mean squared error.
sq_error_arr = []
dataset = HousingDataLoader()
for iteration in range(1, 6):
    X_train, X_test, y_train, y_test = dataset.get_data()
    reg = LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    df = pd.DataFrame({'actual': y_test.to_numpy(), 'predicted': y_pred})
    df.plot(
        kind='line',
        y=['actual','predicted'],
        figsize=(20,5),
        title='Iteration {}'.format(iteration),
    )
    run_error = mean_squared_error(y_test, y_pred)
    sq_error_arr.append(run_error)

In [None]:
# Per-run errors, then their mean and standard deviation.
print([pair for pair in zip(range(1, 6), sq_error_arr)])
for _template, _statistic in (
    ("Average mean squared error: {}", np.mean),
    ("Standard deviation: {}", np.std),
):
    print(_template.format(_statistic(sq_error_arr)))

### Median house price prediction using Logistic regression (linear regression fitted on logit-transformed prices)

In [None]:
def transform_label(arr):
    """Divide the values by 1,000,000 and return the logit of the result."""
    return logit(np.true_divide(arr, 1_000_000))

def extract_label(arr):
    """Apply the logistic sigmoid to the values and multiply by 1,000,000."""
    squashed = expit(arr)
    return squashed * 1_000_000

# Five independent runs: fit a linear model on the logit-transformed prices,
# map its predictions back to prices, plot them against the actual values and
# record the test mean squared error in the original price units.
sq_error_arr = []
dataset = HousingDataLoader()
for iteration in range(1, 6):
    X_train, X_test, y_train, y_test = dataset.get_data()
    reg = LinearRegression()
    reg.fit(X_train, transform_label(y_train))
    y_pred = extract_label(reg.predict(X_test))
    df = pd.DataFrame({'actual': y_test.to_numpy(), 'predicted': y_pred})
    df.plot(
        kind='line',
        y=['actual','predicted'],
        figsize=(20,5),
        title='Iteration {}'.format(iteration),
    )
    run_error = mean_squared_error(y_test, y_pred)
    sq_error_arr.append(run_error)

In [None]:
# Per-run errors, then their mean and standard deviation.
print([pair for pair in zip(range(1, 6), sq_error_arr)])
for _template, _statistic in (
    ("Average mean squared error: {}", np.mean),
    ("Standard deviation: {}", np.std),
):
    print(_template.format(_statistic(sq_error_arr)))