### Import

In [40]:
import pandas as pd
from tqdm import tqdm
from itertools import combinations
from sklearn.metrics import mean_squared_error, r2_score

In [41]:
from read import read_datasets, get_subsets
# read_datasets is project-local; the output below shows it yields a keyed
# collection with one entry per dataset variant (presumably a dict — confirm).
datasets = read_datasets()
# Iterating the collection yields its keys, i.e. the available variant names.
list(datasets)

['oversampled_normalized',
 'undersampled_normalized',
 'undersampled',
 'oversampled',
 'normalClass',
 'normalClass_normalized']

### Linear Regression

#### Define functions for running linear regression tests

In [42]:
from sklearn.linear_model import LinearRegression
def run_linearRegression(dataset_name, subset = []):
    """Fit a LinearRegression on one dataset variant and score it on its test split.

    Parameters
    ----------
    dataset_name
        Key into the module-level ``datasets`` collection.
    subset
        Optional iterable of feature names. Names that are not columns of the
        split are silently ignored, but at least one must exist in the
        training frame. A falsy value (the default) keeps every feature.

    Returns
    -------
    tuple
        ``(coefficients, y_test, y_pred, mse, r2)`` where ``coefficients`` is a
        DataFrame holding an "INTERCEPT" row followed by one row per feature.

    Raises
    ------
    AssertionError
        If ``dataset_name`` is unknown, or ``subset`` is given and none of its
        names is a training column.
    """
    assert dataset_name in datasets

    X_train, X_test, y_train, y_test = get_subsets(datasets[dataset_name])

    if subset:
        # Keep the caller's ordering; drop names the training frame lacks.
        train_features = [name for name in subset if name in X_train.columns]
        assert train_features
        X_train = X_train[train_features]
        # The test frame is filtered against its own columns, independently.
        test_features = [name for name in subset if name in X_test.columns]
        X_test = X_test[test_features]

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # One row for the intercept, then one per fitted feature, in column order.
    coefficients = pd.DataFrame({
        "variable": ["INTERCEPT", *X_train.columns],
        "coefficient": [regressor.intercept_, *regressor.coef_],
    })

    y_pred = regressor.predict(X_test)

    return (
        coefficients,
        y_test,
        y_pred,
        mean_squared_error(y_test, y_pred),
        r2_score(y_test, y_pred),
    )

In [54]:
def test_diff_datasets(subset = []):
    results = {
        "dataName":[name for name in datasets],
        "featureSubset":[subset for name in datasets],
        "mse":[],
        "r2":[]
    }
    for name in datasets:
        _, _, _, mse, r2 = run_linearRegression(name, subset)
        results["mse"].append(mse)
        results["r2"].append(r2)
    
    return pd.DataFrame(results).sort_values(by = "r2").reset_index(drop = True)

In [None]:
def getCombos(dataName, subset = []):
    """List every non-empty combination of a dataset's feature columns.

    Parameters
    ----------
    dataName
        Key into the module-level ``datasets`` collection; the training frame
        returned by ``get_subsets`` supplies the candidate column names.
    subset
        Optional container of names; when truthy, only columns that appear in
        it are considered (column order of the frame is kept).

    Returns
    -------
    list of tuple
        Combinations ordered by size (1 up to all columns), and within a size
        in the order ``itertools.combinations`` emits them.

    Raises
    ------
    AssertionError
        If no candidate column remains.
    """
    train_frame, _, _, _ = get_subsets(datasets[dataName])

    candidates = list(train_frame.columns)
    if subset:
        candidates = [name for name in candidates if name in subset]
    assert candidates

    return [
        combo
        for size in range(1, len(candidates) + 1)
        for combo in combinations(candidates, size)
    ]


#### Test All Feature Subsets

In [51]:



# Enumerate every non-empty feature combination through the shared helper
# rather than reading a global X_train, which no cell defines at module level
# (the cell would fail on a fresh kernel).
# NOTE(review): the first dataset variant is used as the column reference;
# this assumes all variants expose the same feature columns — confirm.
combos = getCombos(next(iter(datasets)))
# The final combination is the only one of full length, i.e. every feature.
cols = list(combos[-1])

# Score each subset on all dataset variants, then concatenate once. The
# per-subset frames keep their own 0..n-1 index, so dfOut has repeated labels.
subsetResults = [
    test_diff_datasets(combo)
    for combo in tqdm(combos, desc = "Subset Testing")
]
dfOut = pd.concat(subsetResults)

Subset Testing: 100%|██████████| 2047/2047 [01:25<00:00, 23.91it/s]


In [53]:
# For every dataset variant keep the single row with the highest r2
# (the first such row on ties), ordered by dataName.
# dfOut is a concat of many small frames, so its index labels repeat; renumber
# first because idxmax reports labels and .loc needs them to be unique.
# This avoids groupby.apply acting on the grouping column, which recent pandas
# releases deprecate.
bestPerDataName = (
    dfOut.reset_index(drop = True)
    .pipe(lambda scores: scores.loc[scores.groupby('dataName')['r2'].idxmax()])
    .reset_index(drop = True)
)
bestPerDataName

Unnamed: 0,dataName,featureSubset,mse,r2
0,normalClass,"(fixed acidity, volatile acidity, residual sug...",0.375114,0.340628
1,normalClass_normalized,"(fixed acidity, volatile acidity, residual sug...",0.379095,0.333631
2,oversampled,"(fixed acidity, volatile acidity, citric acid,...",0.948022,0.670136
3,oversampled_normalized,"(fixed acidity, volatile acidity, citric acid,...",0.955661,0.667478
4,undersampled,"(fixed acidity, volatile acidity, residual sug...",0.386782,0.791398
5,undersampled_normalized,"(volatile acidity, residual sugar, free sulfur...",1.31085,0.293025


#### Manually Remove Collinearity

In [None]:
# Features excluded by hand in this collinearity experiment.
# NOTE(review): matching below is exact (case- and spelling-sensitive); "ph"
# and "sulfur" may not correspond to real column names — confirm against the
# training frame's columns.
dropCols = ["free sulfur dioxide", "ph", "sulfur", "citric acid", "volatile acidity"]
# NOTE(review): the original assignment to cols was left unfinished. This keeps
# every feature not listed in dropCols, using the first dataset variant as the
# column reference (assumes all variants share the same columns) — confirm intent.
referenceFeatures, _, _, _ = get_subsets(datasets[next(iter(datasets))])
cols = [name for name in referenceFeatures.columns if name not in dropCols]

### Non-Linear Models

### ANN Pseudo-Regression