### Import

In [55]:
import pandas as pd
from tqdm import tqdm
from itertools import combinations
from sklearn.metrics import mean_squared_error, r2_score

In [56]:
from read import read_datasets, get_subsets
datasets = read_datasets()
list(datasets.keys())

['oversampled_normalized',
 'undersampled_normalized',
 'undersampled',
 'oversampled',
 'normalClass',
 'normalClass_normalized']

### Linear Regression

#### Define functions for running linear regression tests

In [57]:
from sklearn.linear_model import LinearRegression
def run_linearRegression(dataset_name, subset = []):
    assert dataset_name in datasets

    X_train, X_test, y_train, y_test = get_subsets(datasets[dataset_name])
    
    if subset:
        assert any(c in X_train.columns for c in subset)
        X_train = X_train[[c for c in subset if c in X_train.columns]]
        X_test = X_test[[c for c in subset if c in X_test.columns]]
    
    model = LinearRegression()
    model.fit(X_train, y_train)

    coefficients = pd.DataFrame({
        "variable":["INTERCEPT"] + list(X_train.columns),
        "coefficient":[model.intercept_] + list(model.coef_)
    })


    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return coefficients, y_test, y_pred, mse, r2

In [66]:
def test_diff_datasets(subset = []):
    results = {
        "dataName":[name for name in datasets],
        "featureSubset":[subset for name in datasets],
        "mse":[],
        "r2":[]
    }
    for name in datasets:
        _, _, _, mse, r2 = run_linearRegression(name, subset)
        results["mse"].append(mse)
        results["r2"].append(r2)
    
    return pd.DataFrame(results).sort_values(by = "r2").reset_index(drop = True)

In [69]:
def getCombos(dataName, drop = []):
    X_train, _, _, _ = get_subsets(datasets[dataName])

    cols = list(X_train.columns)
    if drop:
        cols = [c for c in cols if c not in drop]
    assert cols

    combos = []
    for i in range(1, len(cols) + 1):
        combos.extend(combinations(cols, i))
    
    return combos

#### Test All Feature Subsets

In [63]:
dfOut = pd.DataFrame
for combo in tqdm(getCombos("normalClass"), desc = "Subset Testing"):
    result = test_diff_datasets(combo)
    if dfOut.empty:
        dfOut = result
    else:
        dfOut = pd.concat([dfOut, result])

Subset Testing: 100%|██████████| 2047/2047 [02:19<00:00, 14.71it/s]


In [64]:
bestPerDataName = dfOut.groupby('dataName', as_index=False).apply(lambda x: x.nlargest(1, columns='r2')).reset_index(drop = True)
bestPerDataName

Unnamed: 0,dataName,featureSubset,mse,r2
0,normalClass,"(fixed acidity, volatile acidity, residual sug...",0.375114,0.340628
1,normalClass_normalized,"(fixed acidity, volatile acidity, residual sug...",0.379095,0.333631
2,oversampled,"(fixed acidity, volatile acidity, citric acid,...",0.948022,0.670136
3,oversampled_normalized,"(fixed acidity, volatile acidity, citric acid,...",0.955661,0.667478
4,undersampled,"(fixed acidity, volatile acidity, residual sug...",0.386782,0.791398
5,undersampled_normalized,"(volatile acidity, residual sugar, free sulfur...",1.31085,0.293025


#### Manually Remove Collinearity

In [72]:
dropCols = ["free sulfure dioxide", "ph", "sulfur", "citric acid", "volatile acidity"]

dfOut_subset = pd.DataFrame
for combo in tqdm(getCombos("normalClass", dropCols), desc = "Subset Testing"):
    result = test_diff_datasets(combo)
    if dfOut_subset.empty:
        dfOut_subset = result
    else:
        dfOut_subset = pd.concat([dfOut_subset, result])

Subset Testing: 100%|██████████| 511/511 [00:35<00:00, 14.60it/s]


In [73]:
bestPerDataName_subset = dfOut_subset.groupby('dataName', as_index=False).apply(lambda x: x.nlargest(1, columns='r2')).reset_index(drop = True)
bestPerDataName_subset

Unnamed: 0,dataName,featureSubset,mse,r2
0,normalClass,"(fixed acidity, residual sugar, chlorides, tot...",0.400304,0.296351
1,normalClass_normalized,"(fixed acidity, residual sugar, chlorides, tot...",0.40705,0.284492
2,oversampled,"(fixed acidity, residual sugar, chlorides, fre...",1.095498,0.618822
3,oversampled_normalized,"(fixed acidity, residual sugar, chlorides, den...",1.106794,0.614892
4,undersampled,"(fixed acidity, chlorides, free sulfur dioxide...",1.013596,0.453342
5,undersampled_normalized,"(fixed acidity, chlorides, free sulfur dioxide...",1.621071,0.125715


### Read in R Data

In [2]:
library(ISLR)
library(gam)
library(splines)
library(tidyverse)
library(caret)

In [28]:
normalClass <- read.csv("data/normalClass.csv", header = T)
oversamp <- read.csv("data/oversampled.csv", header = T)
undersamp <- read.csv("data/undersampled.csv", header = T)

normalClass_norm <- read.csv("data/normalClass_normalized.csv", header = T)
oversamp_norm <- read.csv("data/oversampled_normalized.csv", header = T)
undersamp_norm <- read.csv("data/undersampled_normalized.csv", header = T)

In [30]:
names(normalClass$train)

In [39]:
getSplit <- function(df, train_or_test){
    retDf <- subset(df, train_test == train_or_test)
    retDf$train_test <- NULL
    return (retDf)
}
normalClass.train <- getSplit(normalClass, "train")
normalClass.test <- getSplit(normalClass, "test")
oversamp.train <- getSplit(oversamp, "train")
oversamp.test <- getSplit(oversamp, "test")
undersamp.train <- getSplit(undersamp, "train")
undersamp.test <- getSplit(undersamp, "test")

normalClass_norm.train <- getSplit(normalClass_norm, "train")
normalClass_norm.test <- getSplit(normalClass_norm, "test")
oversamp_norm.train <- getSplit(oversamp_norm, "train")
oversamp_norm.test <- getSplit(oversamp_norm, "test")
undersamp_norm.train <- getSplit(undersamp_norm, "train")
undersamp_norm.test <- getSplit(undersamp_norm, "test")

### Non-Linear Models

In [42]:
normalClass.train.gam <- gam(quality ~ ., data = normalClass.train)
normalClass.preds <- normalClass.train.gam %>% predict(normalClass.test)

print(data.frame(
  RMSE = RMSE(normalClass.preds, normalClass.test$quality),
  R2 = R2(normalClass.preds, normalClass.test$quality)
))

       RMSE        R2
1 0.6189281 0.3356125


### ANN Pseudo-Regression