In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor

# Location of the housing CSV. Set the HOUSING_CSV environment variable to
# point at your own copy; when it is unset the original local download path is
# used, so existing setups keep working unchanged.
HOUSING_CSV_PATH = os.environ.get(
    "HOUSING_CSV", r'C:\Users\sbsla\Downloads\archive (9)\housing.csv'
)
housing = pd.read_csv(HOUSING_CSV_PATH)

In [2]:
# Preview the first five rows to sanity-check the loaded columns.
housing.head(n=5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [91]:
# Number of rows in the loaded frame.
housing.shape[0]

20640

In [3]:
# Features: every column except the regression target.
X = housing.drop(columns="median_house_value")
# Target as a standalone NumPy array (copied, so it does not alias the frame).
y = housing["median_house_value"].to_numpy(copy=True)

In [4]:
# One-hot encode the categorical ocean_proximity column.
X_enc = pd.get_dummies(X, columns=["ocean_proximity"])

# Fill missing values from the 5 nearest neighbours; fit_transform fits the
# imputer and transforms the same frame in one call. After this line X_enc is
# whatever the imputer returns rather than the DataFrame built above.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# in a later cell - confirm that using held-out rows here is acceptable.
imputer = KNNImputer(n_neighbors=5)
X_enc = imputer.fit_transform(X_enc)

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split
# itself reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=0.2, random_state=42
)

# Two regressors with the same architecture: one hidden layer of 5 units.
# `(5)` is just the int 5, not a tuple; `(5,)` spells out the one-layer shape.
# Neither model is given a random_state, so their weight initialisation is
# still drawn independently.
mlp = MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000)
mlp2 = MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000)

In [17]:
# Train both regressors on the same training split.
mlp.fit(X=X_train, y=y_train)
mlp2.fit(X=X_train, y=y_train)



MLPRegressor(hidden_layer_sizes=10, max_iter=1000)

In [18]:
# Mean squared error of each regressor on the held-out test split.
print(mean_squared_error(y_true=y_test, y_pred=mlp.predict(X_test)))
print(mean_squared_error(y_true=y_test, y_pred=mlp2.predict(X_test)))

6982301636.153901
6609839758.07687


In [39]:
# Mean squared error of the first regressor on the data it was trained on.
mean_squared_error(y_true=y_train, y_pred=mlp.predict(X_train))

4220246032.0314903

In [40]:
# Baseline: predict the training-set mean for every test row.
# MSE is symmetric in its two arguments, so passing the targets as y_true and
# the constant prediction as y_pred yields the same number as the reverse order.
mean_squared_error(y_true=y_test, y_pred=np.full(len(y_test), np.mean(y_train)))

13412603787.759983

In [41]:
# Predictions of the first regressor for the test rows.
mlp.predict(X=X_test)

array([159932.25245589, 111513.4017089 , 375868.59874881, ...,
       145611.38776664, 281493.3777842 ,  66674.60962033])

In [6]:
from torch.utils.data import random_split, SubsetRandomSampler, Subset
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import ttest_ind, wilcoxon, mannwhitneyu
import pandas as pd
from tqdm import tqdm
from deepsig import aso, bootstrap_test, permutation_test
from random import sample
import random
# Each helper takes two score samples (a, b) and returns a single number from
# one significance test. Where the test exposes an `alternative` argument it is
# set to "greater".


def _aso_score(a, b):
    # NOTE(review): deepsig's aso presumably returns its eps_min violation
    # ratio rather than a p-value - confirm before comparing it with the others.
    return aso(a, b, show_progress=False)


def _welch_t_pvalue(a, b):
    # equal_var=False selects the unequal-variance form; element 1 of the
    # result is the p-value.
    result = ttest_ind(a, b, equal_var=False, alternative="greater")
    return result[1]


def _bootstrap_pvalue(a, b):
    return bootstrap_test(a, b)


def _permutation_pvalue(a, b):
    return permutation_test(a, b)


def _wilcoxon_pvalue(a, b):
    return wilcoxon(a, b, alternative="greater").pvalue


def _mann_whitney_pvalue(a, b):
    return mannwhitneyu(a, b, alternative="greater").pvalue


# Test name -> callable(a, b). Insertion order matters: results are stored by
# position in this mapping.
CONSIDERED_TESTS = {
    "ASO": _aso_score,
    "Student's t": _welch_t_pvalue,
    "Bootstrap": _bootstrap_pvalue,
    "Permutation": _permutation_pvalue,
    "Wilcoxon": _wilcoxon_pvalue,
    "Mann-Whitney U": _mann_whitney_pvalue,
}

# Number of scores drawn per model when running the tests.
SAMPLE_SIZES = [5, 10, 15, 20]

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
def get_subsample(full_data_X, full_data_y, subsample_frac=0.1, valid_frac=0.2):
    """Draw a random subsample of the data and split it into train/validation parts.

    Parameters
    ----------
    full_data_X : array-like
        Feature rows. Converted with ``np.asarray`` so rows can be selected by
        a list of positions.
    full_data_y : array-like
        Targets aligned row-for-row with ``full_data_X``.
    subsample_frac : float, default 0.1
        Fraction of rows drawn, without replacement, for the subsample.
    valid_frac : float, default 0.2
        Fraction of the subsample held out as the validation set.

    Returns
    -------
    tuple
        ``(train_X, train_y, valid_X, valid_y)``. Note this order differs from
        the order ``train_test_split`` itself returns.
    """
    full_data_X = np.asarray(full_data_X)
    full_data_y = np.asarray(full_data_y)

    n_rows = len(full_data_y)
    subsample_indices = random.sample(range(n_rows), k=round(subsample_frac * n_rows))
    subsample_X = full_data_X[subsample_indices]
    subsample_y = full_data_y[subsample_indices]

    subsample_train_X, subsample_valid_X, subsample_train_y, subsample_valid_y = train_test_split(
        subsample_X, subsample_y, test_size=valid_frac
    )
    return subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y

In [8]:
def _bootstrap_model_scores(model, train_X, train_y, valid_X, valid_y, n_boot):
    """Refit ``model`` on ``n_boot`` bootstrap resamples and return its validation MSEs."""
    n_train = len(train_X)
    scores = []
    for _ in range(n_boot):
        # Resample training rows with replacement, keeping the original size.
        boot_idx = random.choices(range(n_train), k=n_train)
        model.fit(train_X[boot_idx], train_y[boot_idx])
        scores.append(mean_squared_error(valid_y, model.predict(valid_X)))
    return scores


def bootstrap_scores(model1, model2, subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y, n_boot=20):
    """Score two models on bootstrap resamples of the same training data.

    Each model is refitted ``n_boot`` times, every time on a fresh resample
    (with replacement) of the training rows, and evaluated by mean squared
    error on the fixed validation set. All of ``model1``'s resamples are drawn
    before any of ``model2``'s.

    Parameters
    ----------
    model1, model2 : estimators exposing ``fit(X, y)`` and ``predict(X)``
        Both are refitted in place.
    subsample_train_X, subsample_train_y : arrays indexable by a list of row positions
        Training data that is resampled.
    subsample_valid_X, subsample_valid_y : arrays
        Validation data used for every score.
    n_boot : int, default 20
        Bootstrap rounds per model. ``compute_stats`` draws up to
        ``max(SAMPLE_SIZES)`` scores per model without replacement, so this
        must be at least that large.

    Returns
    -------
    list of list of float
        ``[scores_model1, scores_model2]``, each of length ``n_boot``.
    """
    return [
        _bootstrap_model_scores(
            model1, subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y, n_boot
        ),
        _bootstrap_model_scores(
            model2, subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y, n_boot
        ),
    ]

In [9]:
def compute_stats(scores, stats_dict):
    """Run every considered test on random draws of the two score lists.

    For each size in ``SAMPLE_SIZES``, that many scores are drawn without
    replacement from each model's list, every test in ``CONSIDERED_TESTS`` is
    applied to the two draws, and the value it returns is appended to
    ``stats_dict[size][k]``, where ``k`` is the test's position in
    ``CONSIDERED_TESTS``.

    Parameters
    ----------
    scores : sequence of two lists
        ``scores[0]`` and ``scores[1]`` hold the scores of the two models.
    stats_dict : dict
        Maps each sample size to a list with one result list per test.
        Mutated in place.

    Returns
    -------
    dict
        The same ``stats_dict`` object, after appending.
    """
    for samp_size in SAMPLE_SIZES:
        scores1 = sample(scores[0], samp_size)
        scores2 = sample(scores[1], samp_size)
        # enumerate gives each test its slot by dict order, without a manual
        # counter that shadows the builtin ``iter``.
        for test_idx, test_fn in enumerate(CONSIDERED_TESTS.values()):
            stats_dict[samp_size][test_idx].append(test_fn(scores1, scores2))

    return stats_dict

In [10]:
def make_statistics(nloops, model1, model2, full_data_X, full_data_y):
    """Repeat the subsample -> bootstrap -> test pipeline and collect the results.

    Parameters
    ----------
    nloops : int
        Number of independent subsamples to draw and evaluate.
    model1, model2 : estimators
        Passed through to ``bootstrap_scores``.
    full_data_X, full_data_y : numpy arrays
        Full feature matrix and target vector; both must be NumPy arrays.

    Returns
    -------
    dict
        Maps each value in ``SAMPLE_SIZES`` to a list with one entry per test
        in ``CONSIDERED_TESTS`` (same order); each entry is a list holding
        ``nloops`` results.
    """
    # Build the accumulator from the constants so it always matches the keys
    # and test count that compute_stats indexes into.
    stats_dict = {
        samp_size: [[] for _ in CONSIDERED_TESTS]
        for samp_size in SAMPLE_SIZES
    }
    for _ in range(nloops):
        subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y = get_subsample(full_data_X, full_data_y)
        scores = bootstrap_scores(model1, model2, subsample_train_X, subsample_train_y, subsample_valid_X, subsample_valid_y)
        stats_dict = compute_stats(scores, stats_dict)
    return stats_dict

In [39]:
# Two architectures: two hidden layers of 5 units vs. a single hidden layer of 5 units.
# `(5)` is just the int 5, not a tuple; `(5,)` states the one-layer shape explicitly.
model1 = MLPRegressor(hidden_layer_sizes=(5, 5), max_iter=1000)
model2 = MLPRegressor(hidden_layer_sizes=(5,), max_iter=1000)

In [20]:
# Smoke test: run the whole pipeline for just two outer loops.
make_statistics(nloops=2, model1=mlp, model2=mlp2, full_data_X=X_enc, full_data_y=y)



{5: [[0.4309709516015063, 0.814826001978206],
  [0.14095869856040108, 0.4188020278810875],
  [0.081, 0.442],
  [0.14885114885114886, 0.3876123876123876],
  [0.15625, 0.5],
  [0.1111111111111111, 0.5]],
 10: [[1.0, 1.0],
  [0.7469653333015063, 0.7409439226626517],
  [0.773, 0.759],
  [0.7152847152847153, 0.7272727272727273],
  [0.7216796875, 0.65234375],
  [0.6612075210237622, 0.7397385583621137]],
 15: [[1.0, 1.0],
  [0.833022252687924, 0.7275018884230222],
  [0.844, 0.724],
  [0.7952047952047953, 0.7442557442557443],
  [0.755645751953125, 0.737762451171875],
  [0.8192518169645508, 0.8854855166876527]],
 20: [[1.0, 1.0],
  [0.8588626725684418, 0.5232730455826506],
  [0.855, 0.527],
  [0.913086913086913, 0.4965034965034965],
  [0.8919162750244141, 0.4491586685180664],
  [0.8175791726956705, 0.6014012903654127]]}