# Georgia Tech ISYE6420 Course Project

Fall 2023
Spencer Vore

In [1]:
import pymc
import pandas as pd
from pandas.api.types import is_numeric_dtype

import arviz as az
from collections.abc import Sequence
from typing import Optional

### Prepare Dataset

Load Dataset

In [2]:
# header=None makes pandas treat the first row as data and label the columns 0..8.
data = pd.read_csv("abalone/abalone.data", header=None)

In [3]:
# Preview the first rows: column 0 holds a category letter, columns 1-8 are numeric.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


Standardize the dataset. This may not be strictly necessary, but it does no harm and could help.

In [18]:
class DataPrep:
    '''
    Standardize the data. By writing this as an object, we can save the statistics calculated for the standardization
    so we can use them later to predict new datapoints in our model.
    '''
    def __init__(self):
        self.means = dict()
        self.stdevs = dict()
        
    def prep(self, X_df: pd.DataFrame) -> pd.DataFrame:
        """
        This takes a dataset, and prepares it. Numerical columns are standardized. Categorical columns 
        are one hot encoded so we can fit a numerical regression with a coefficient for each category.
        
        From TA's example github for numerical standardization
        Standardize input variables by 2 std dev.

        For standardization, see:
        https://stat.columbia.edu/~gelman/research/published/standardizing7.pdf
        https://areding.github.io/6420-pymc/unit7/Unit7-arrhythmia.html
        """
        
        X_standardized = pd.DataFrame()
            
        for colname in X_df.columns:
            col = X_df[colname]
            if is_numeric_dtype(col):
                self.means[colname] = col.mean()
                self.stdevs[colname] = col.std()
                X_standardized[colname] = (col - self.means[colname]) / (2 * self.stdevs[colname])
            else:
                X_onehot = pd.get_dummies(X_df[colname], prefix=colname)
                print(X_onehot)
                if X_standardized.empty:
                    X_standardized = X_onehot
                else:
                    X_standardized = X_standardized.join(X_onehot)
                print(X_standardized)
                onehot_colnames = X_onehot.columns
                for o_colname in onehot_colnames:
                    self.means[o_colname] = None
                    self.stdevs[o_colname] = None
                # X_standardized[colname] = X_df[colname]
            
        # # find and store means and std, then standardize
        # self.means = X_df.mean(axis=0, numeric_only=True)
        # self.stdevs = X_df.std(axis=0, numeric_only=True)
        # X_standardized = (X_df - self.means) / (2 * self.stdevs)

        return X_standardized
    
    def prep_point(self, x: float) -> float:
        # TODO: Get this working for categorical data and differences in prep function
        return (x - self.means) / (2 * self.stdevs)

In [14]:
# Prepare the whole dataset. data_preper keeps the per-column means and standard
# deviations computed here so they can be inspected and reused afterwards.
data_preper = DataPrep()
data_std = data_preper.prep(data)

0
        0_F    0_I    0_M
0     False  False   True
1     False  False   True
2      True  False  False
3     False  False   True
4     False   True  False
...     ...    ...    ...
4172   True  False  False
4173  False  False   True
4174  False  False   True
4175   True  False  False
4176  False  False   True

[4177 rows x 3 columns]
        0_F    0_I    0_M
0     False  False   True
1     False  False   True
2      True  False  False
3     False  False   True
4     False   True  False
...     ...    ...    ...
4172   True  False  False
4173  False  False   True
4174  False  False   True
4175   True  False  False
4176  False  False   True

[4177 rows x 3 columns]
1
num
2
num
3
num
4
num
5
num
6
num
7
num
8
num


In [15]:
# Check the prepared frame: column 0 became one-hot columns, columns 1-8 are rescaled.
data_std.head()

Unnamed: 0,0_F,0_I,0_M,1,2,3,4,5,6,7,8
0,False,False,True,-0.287245,-0.216049,-0.532148,-0.320911,-0.303806,-0.363062,-0.31907,0.785678
1,False,False,True,-0.724406,-0.719878,-0.591918,-0.615065,-0.585385,-0.602538,-0.606421,-0.454952
2,True,False,False,0.025014,0.061058,-0.053989,-0.154716,-0.231722,-0.178324,-0.103557,-0.144795
3,False,False,True,-0.349696,-0.216049,-0.173529,-0.318871,-0.32408,-0.303763,-0.301111,0.010284
4,False,True,False,-0.807675,-0.770261,-0.711458,-0.635967,-0.607911,-0.643592,-0.660299,-0.454952


In [16]:
# Saved column means; one-hot columns have None because they are not standardized.
data_preper.means

{'0_F': None,
 '0_I': None,
 '0_M': None,
 1: 0.5239920995930094,
 2: 0.40788125448886764,
 3: 0.13951639932966242,
 4: 0.8287421594445774,
 5: 0.35936748862820206,
 6: 0.1805936078525257,
 7: 0.23883085946851804,
 8: 9.933684462532918}

In [17]:
# Saved column standard deviations; one-hot columns have None here as well.
data_preper.stdevs

{'0_F': None,
 '0_I': None,
 '0_M': None,
 1: 0.12009291256479956,
 2: 0.09923986613365945,
 3: 0.041827056607257274,
 4: 0.4903890182309977,
 5: 0.22196294903322014,
 6: 0.10961425025968446,
 7: 0.1392026695223861,
 8: 3.2241690320681284}

### Fit linear regression model

In [None]:
# Load the watermark IPython extension and print environment/version information
# (explicitly including pytensor) so the run can be reproduced.
%load_ext watermark
%watermark -n -u -v -iv -p pytensor