# Georgia Tech ISYE6420 Course Project

**Fall 2023  
Spencer Vore**

In [1]:
import pymc
import pandas as pd
from pandas.api.types import is_numeric_dtype

import arviz as az
from collections.abc import Sequence
from typing import Optional

from sklearn.model_selection import train_test_split

### Define adjustable Constants / Settings  
Use these to adjust the script settings and the MCMC run. They are not loaded from a .env file or anything fancier, but that works for the purposes of a demo / tinkering.

In [2]:
# Location of the raw abalone data file that gets loaded below.
path_to_datafile = "abalone/abalone.data"

# Down-sampling: number of points randomly drawn from the full dataset for the
# MCMC run (covers train and test sample size together), and the random seed
# used for that draw.
n_all = 500
n_seed = 42

# Train / test split: number of points sampled from the n_all points to form
# the test set, and the random seed used for the split.
n_test = 50
split_seed = 43

### Prepare Dataset

**Load Dataset**

In [3]:
# header=None: the first line of the file is read as data, not as column names,
# so pandas labels the columns with integers (0-8 in the output below).
data = pd.read_csv(path_to_datafile, header=None)

In [4]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
data.shape

(4177, 9)

**Downsample dataset**  
This helps the MCMC algorithm run faster: the sample size can be adjusted upward for better prediction accuracy, or downward for faster computation. The subset is a random sample of the full dataset.

In [6]:
# Uncomment this line to use full datasize
# n = data.shape[0]

In [7]:
# Draw a reproducible random subset of the data (train + test combined).
# Use the n_all setting instead of a second hard-coded 500, so that changing
# the setting actually changes the sample size.
n = n_all
assert n <= data.shape[0]  # Can't choose more samples than the size of the data
data = data.sample(n, random_state=n_seed)

**Split data into training and test sets**

In [8]:
data.shape

(500, 9)

In [9]:
# Check the test size against the rows actually present in `data` (which may
# differ from the configured n_all, e.g. when the full dataset is used), so the
# training set is guaranteed to be non-empty.
assert n_test < len(data)  # Test size should be smaller than full size
train_data, test_data = train_test_split(data, test_size=n_test, random_state=split_seed)

In [10]:
train_data.shape

(450, 9)

In [11]:
test_data.shape

(50, 9)

**Standardize train dataset**  
Not sure if this is necessary, but it doesn't hurt and could help.

In [12]:
class DataPrep:
    '''
    Standardize numeric columns and one-hot encode categorical columns of a DataFrame.

    By writing this as an object, the statistics calculated for the standardization
    (and the one-hot column layout) are saved on the instance, so exactly the same
    transformation can be applied later to new datapoints, e.g. a held-out test set.
    '''
    def __init__(self):
        # Column name -> mean / standard deviation of the fitted numeric column.
        # One-hot columns get a None entry, because they are not rescaled.
        self.means = dict()
        self.stdevs = dict()
        # Categorical column name -> ordered list of the one-hot column names
        # that were produced when the column was fitted.
        self.onehot_columns = dict()

    def prep(self, X_df: pd.DataFrame, use_prev_standardization=False) -> pd.DataFrame:
        """
        Prepare a dataset. Numerical columns are standardized. Categorical columns
        are one hot encoded so we can fit a numerical regression with a coefficient
        for each category.

        Numerical standardization follows the TA's example github:
        center on the mean and divide by 2 standard deviations.

        For standardization, see:
        https://stat.columbia.edu/~gelman/research/published/standardizing7.pdf
        https://areding.github.io/6420-pymc/unit7/Unit7-arrhythmia.html

        Parameters
        ----------
        X_df : pd.DataFrame
            Data to transform. Columns are processed in order, and the output
            keeps that order (a categorical column is replaced in place by its
            one-hot columns).
        use_prev_standardization : bool, default False
            If False, compute the statistics / one-hot layout from ``X_df`` and
            store them on the instance. If True, reuse what was stored by an
            earlier call, so new data is transformed exactly like the data the
            instance was fitted on: one-hot columns are aligned to the fitted
            layout (categories absent from ``X_df`` become all-False columns,
            categories never seen during fitting are dropped).

        Returns
        -------
        pd.DataFrame
            Transformed data, indexed like ``X_df``.

        Raises
        ------
        KeyError
            If ``use_prev_standardization`` is True and a column of ``X_df``
            was not present when the instance was fitted.
        """
        # Collect the transformed pieces and concatenate once at the end. This
        # keeps every column even for a zero-row input, where an
        # ``X_standardized.empty`` check would wrongly discard earlier columns.
        pieces = []

        for colname in X_df.columns:
            col = X_df[colname]
            if is_numeric_dtype(col):
                if not use_prev_standardization:
                    self.means[colname] = col.mean()
                    self.stdevs[colname] = col.std()
                elif colname not in self.means:
                    raise KeyError(
                        f"No stored standardization for numeric column {colname!r}"
                    )
                pieces.append((col - self.means[colname]) / (2 * self.stdevs[colname]))
            else:
                X_onehot = pd.get_dummies(col, prefix=colname)
                if use_prev_standardization:
                    if colname not in self.onehot_columns:
                        raise KeyError(
                            f"No stored one-hot encoding for categorical column {colname!r}"
                        )
                    # Align to the fitted layout so the result always has the
                    # columns the model was trained on, whatever categories
                    # happen to occur in this particular dataset.
                    dummy_dtype = X_onehot.dtypes.iloc[0] if X_onehot.shape[1] else bool
                    X_onehot = X_onehot.reindex(
                        columns=self.onehot_columns[colname], fill_value=False
                    ).astype(dummy_dtype)
                else:
                    self.onehot_columns[colname] = list(X_onehot.columns)
                    for o_colname in X_onehot.columns:
                        self.means[o_colname] = None
                        self.stdevs[o_colname] = None
                pieces.append(X_onehot)

        if not pieces:
            return pd.DataFrame()
        return pd.concat(pieces, axis=1)


In [13]:
# Fit the preparation on the training split only: with the default
# use_prev_standardization=False, prep() computes the per-column statistics
# from train_data and stores them on data_preper for later reuse.
data_preper = DataPrep()
train_data_std = data_preper.prep(train_data)

In [14]:
train_data_std.head()

Unnamed: 0,0_F,0_I,0_M,1,2,3,4,5,6,7,8
3027,False,True,False,-0.076052,-0.115511,-0.033604,-0.303016,-0.298127,-0.259225,-0.08966,-0.278476
2509,False,True,False,-0.423394,-0.436831,-0.625319,-0.480531,-0.42129,-0.442054,-0.528727,-0.587513
3186,True,False,False,-0.198644,-0.214378,-0.362334,-0.337236,-0.271562,-0.240711,-0.388679,-0.587513
166,True,False,False,0.843384,0.848451,0.492365,1.403161,1.003537,1.261258,2.336564,1.575742
2855,True,False,False,0.557337,0.551847,0.229381,0.531626,0.421531,0.772945,0.576511,0.185078


In [15]:
train_data_std.shape

(450, 11)

In [16]:
data_preper.means

{'0_F': None,
 '0_I': None,
 '0_M': None,
 1: 0.5186111111111111,
 2: 0.4033666666666667,
 3: 0.13755555555555554,
 4: 0.81186,
 5: 0.34945,
 6: 0.17900555555555556,
 7: 0.23268777777777777,
 8: 9.802222222222222}

In [17]:
data_preper.stdevs

{'0_F': None,
 '0_I': None,
 '0_M': None,
 1: 0.12235765735250845,
 2: 0.10114516349285094,
 3: 0.03802505067832697,
 4: 0.4675656967761661,
 5: 0.2070427375106454,
 6: 0.10802485934416967,
 7: 0.1320983164834733,
 8: 3.2358645064622427}

**Apply standardization to test dataset**

In [18]:
# Reuse the means / standard deviations stored on data_preper from the
# training data, rather than recomputing them from the test split.
test_data_std = data_preper.prep(test_data, use_prev_standardization=True)

In [19]:
test_data_std.head()

Unnamed: 0,0_F,0_I,0_M,1,2,3,4,5,6,7,8
308,False,False,True,0.128267,0.032791,-0.165096,-0.054923,-0.23896,-0.071768,-0.14265,0.648633
1034,True,False,False,0.557337,0.279961,0.295127,0.729459,0.821207,0.703516,-0.189964,-0.123958
3637,False,True,False,-0.341667,-0.387397,-0.493827,-0.502453,-0.521511,-0.442054,-0.483306,-0.587513
4139,True,False,False,0.475609,0.452979,0.492365,0.453027,0.413079,0.599373,0.432679,0.03056
865,True,False,False,0.353018,0.354112,0.492365,0.60969,0.626803,0.247602,0.627609,0.03056


In [20]:
test_data_std.shape

(50, 11)

### Fit linear regression model

In [23]:
%load_ext watermark
%watermark -n -u -v -iv -p pytensor

Last updated: Sat Nov 25 2023

Python implementation: CPython
Python version       : 3.11.6
IPython version      : 8.15.0

pytensor: 2.13.1

pymc  : 5.9.1
pandas: 2.1.2
arviz : 0.16.1

