# Model

Plan:

- Develop a model to predict property value
- Use drivers identified in explore to build predictive regression models
- Create and run a baseline model with sklearn's `DummyRegressor` to compare our results to
- Create and run `Linear Regression`, `LassoLars`, and Polynomial regression models
- Use the insights from the highest-performing model (with highest test RMSE) to confirm our initial hypotheses and insights on the features that are the biggest drivers of property value

In [65]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LassoLars
from sklearn.cluster import KMeans


from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score

from sklearn.preprocessing import MinMaxScaler

from wrangle import split_data


import warnings
warnings.filterwarnings("ignore")


## Preprocessing before Clustering

Features: `['alcohol', 'volatile acidity', 'chlorides']`

Scale features:
- MinMax

Before scaling, split data

In [2]:
df = pd.read_csv('wine_data.csv') 
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,1


#

In [3]:
train, validate, test = split_data(df, random_state=123)

In [4]:
scaler = MinMaxScaler()
train_sc = pd.concat([pd.DataFrame(data=scaler.fit_transform(train.drop(columns=['quality'])),
                                   columns=train.drop(columns=['quality']).columns),
                      train[['quality']].reset_index().iloc[:,1]],
                      axis=1)
validate_sc = pd.concat([pd.DataFrame(data=scaler.transform(validate.drop(columns=['quality'])),
                                   columns=validate.drop(columns=['quality']).columns),
                         validate[['quality']].reset_index().iloc[:,1]],
                         axis=1)
test_sc = pd.concat([pd.DataFrame(data=scaler.transform(test.drop(columns=['quality'])),
                                   columns=test.drop(columns=['quality']).columns),
                     test[['quality']].reset_index().iloc[:,1]],
                     axis=1)

In [5]:
def x_y_split(train, validate, test, target):
    '''
    Takes in train, validate, test, and target data and returns x-y split versions of train, validate, and test
    '''
    # remove target
    X_train = train.drop(columns=target)
    X_validate = validate.drop(columns=target)
    X_test = test.drop(columns=target)

    # only add target
    y_train = train[target]
    y_validate = validate[target]
    y_test = test[target]
    
    return X_train, y_train, X_validate, y_validate, X_train, y_train

In [6]:
X_train, y_train, X_validate, y_validate, X_train, y_train = x_y_split(train_sc, validate_sc, test_sc, 'quality')

In [16]:
y_train.shape

(3724,)

In [7]:
X_validate.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,red
0,0.429752,0.306667,0.295181,0.074194,0.136895,0.138889,0.239631,0.478223,0.27907,0.205714,0.173913,1.0
1,0.413223,0.106667,0.325301,0.06129,0.118531,0.083333,0.117512,0.482536,0.51938,0.165714,0.173913,1.0
2,0.338843,0.153333,0.13253,0.409677,0.0601,0.15625,0.474654,0.530832,0.581395,0.194286,0.289855,0.0
3,0.22314,0.146667,0.192771,0.045161,0.053422,0.114583,0.193548,0.204398,0.503876,0.245714,0.434783,0.0
4,0.256198,0.146667,0.216867,0.125806,0.070117,0.104167,0.453917,0.345839,0.527132,0.182857,0.347826,0.0


In [8]:
best_relationships = [
['fixed acidity', 'chlorides', 'alcohol'],
['fixed acidity', 'density', 'alcohol'],
['citric acid', 'chlorides', 'alcohol'],
['citric acid', 'pH', 'alcohol'],
['residual sugar', 'free sulfur dioxide', 'alcohol'],
['volatile acidity', 'chlorides', 'alcohol'],
['fixed acidity', 'free sulfur dioxide', 'alcohol'],
['volatile acidity', 'free sulfur dioxide', 'alcohol'],
['alcohol', 'pH', 'volatile acidity'],
['alcohol', 'sulphates', 'free sulfur dioxide'],
['alcohol', 'fixed acidity', 'pH'],
['alcohol', 'fixed acidity', 'residual sugar'],
['fixed acidity', 'citric acid', 'sulphates'],
['alcohol', 'fixed acidity', 'volatile acidity'],
['alcohol', 'citric acid', 'volatile acidity']
]

In [143]:
def build_regression_models(X_train, y_train, X_validate, y_validate, target):
    """
    Builds an OLS model, 5 LassoLars models, and 3 polynomial models and outputs them in a dataframe that highlights the good and bad performers with a legend to understand what each color means
    """
    # Ad hoc fixes
    y_train = pd.DataFrame(y_train)
    y_train['baseline'] = y_train['quality'].mean()
    X_train.columns = X_train.columns.astype(str)
    
#     y_validate = pd.DataFrame(y_validate)
#     y_validate['baseline'] = y_validate['quality'].mean()
#     X_validate = X_validate.columns.astype(str)
    
#     df['volatile acidity'] = pd.to_numeric(df['volatile acidity'], errors='coerce')
#     X_validate['volatile acidity'] = X_validate['volatile acidity'].astype(float)


    
    # Calculate the RMSE for the baseline model
    rmse_train_mu = mean_squared_error(y_train[target], y_train.baseline) ** 0.5
    rmse_validate_mu = mean_squared_error(y_validate[target], y_validate.baseline) ** 0.5

    # Initialize a dataframe to store the evaluation metrics and add the baseline RMSE
    results_df = pd.DataFrame(data=[{
    'model': 'Mean Baseline',
    'rmse_train': round(rmse_train_mu,2),
    'rmse_validate': round(rmse_validate_mu),
    'r_validate': round(explained_variance_score(y_validate[target], y_validate.baseline),2),
    'model_difference': round(abs(rmse_validate_mu - rmse_train_mu),2)
    }])
    
    # Build, evaluate, and append the OLS model to the results dataframe
    
    OLSmodel = LinearRegression()
    OLSmodel.fit(X_train, y_train[target])
    y_train['value_pred_ols'] = OLSmodel.predict(X_train)
    rmse_train = mean_squared_error(y_train[target], y_train.value_pred_ols) ** 0.5
    y_validate['value_pred_ols'] = OLSmodel.predict(X_validate)
    rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_ols) ** 0.5
    results_df = results_df.append({
        'model': 'OLS',
        'rmse_train': round(rmse_train,2),
        'rmse_validate': round(rmse_validate,2),
        'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_ols),2),
        'model_difference': round(abs(rmse_validate - rmse_train),2)
    }, ignore_index=True)
    
    # Build, evaluate, and append 5 LassoLars models to the results dataframe
    for i in range(1, 6):
        #Make the thing
        lassomodel = LassoLars(alpha=i/100)
        #Fit the thing
        lassomodel.fit(X_train, y_train[target])
        #Use the thing
        y_train['value_pred_lasso'] = lassomodel.predict(X_train)
        #Calculate performance metrics
        rmse_train = mean_squared_error(y_train[target], y_train.value_pred_lasso) ** 0.5
        #Repeat steps on the validate set
        y_validate['value_pred_lasso'] = lassomodel.predict(X_validate)
        rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_lasso) ** 0.5
        results_df = results_df.append({
            'model': 'LassoLars_a' + str(i/100),
            'rmse_train': round(rmse_train,2),
            'rmse_validate': round(rmse_validate,2),
            'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_lasso),2),
            'model_difference': abs(rmse_validate - rmse_train)
        }, ignore_index=True)
    
    # Build, evaluate, and append 3 Polynomial models to the results dataframe
    for i in range(2, 5):
        poly = PolynomialFeatures(degree=i, include_bias=False)
        X_train_poly = poly.fit_transform(X_train)
        X_validate_poly = poly.transform(X_validate)
        poly_reg = LinearRegression()
        poly_reg.fit(X_train_poly, y_train[target])
        y_train['value_pred_poly'] = poly_reg.predict(X_train_poly)
        rmse_train = mean_squared_error(y_train[target], y_train.value_pred_poly) ** 0.5
        y_validate['value_pred_poly'] = poly_reg.predict(X_validate_poly)
        rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_poly) ** 0.5
        results_df = results_df.append({
            'model': 'Polynomial_deg' + str(i),
            'rmse_train': round(rmse_train,2),
            'rmse_validate': round(rmse_validate,2),
            'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_poly),2),
            'model_difference': round(abs(rmse_validate - rmse_train),2)
        }, ignore_index=True)
    
    
    # create styled_df
    lowest_differences = results_df.model_difference.sort_values()[:3].tolist()
    styled_df = apply_highlights(results_df, 'model_difference', lowest_differences)


    # styling variables for legend string
    HIGHLIGHT_YELLOW = '\033[1;33;93m'  
    HIGHLIGHT_RED = '\033[1;33;91m'
    HIGHLIGHT_GREEN = '\033[1;33;92m'
    HIGHLIGHT_END = '\033[0m;'


#     print(f"""\n
# 10 regression models were successfully generated.
    
# {HIGHLIGHT_YELLOW}Yellow: the three models with the lowest train-validate difference (green may overlap with these and leave only two yellow rows){HIGHLIGHT_END}\n{HIGHLIGHT_RED}Red: the baseline plus all models within one percent of the baseline rmse{HIGHLIGHT_END}\n{HIGHLIGHT_GREEN}Green: either the model with the lowest train-validate difference or a model with a very low train-validate difference and the lowest validate rmse{HIGHLIGHT_END}""")
#     display(styled_df)
#     print("\nExamine the above models and determine the best one for your purposes.\nGenerally, a green model will perform the best.\n")
    
    return results_df#, styled_df

In [39]:
############ APPLY HIGHLIGHTS FUNCTION ###############

def apply_highlights(df, column_name, value_list):
    """
    A helper function used in the build_regression_models function to stylize the results_df dataframe to highlight poor performing models in red, good models in yellow, and great models in green.
    """
    
    # Create a Styler object for the DataFrame
    styled_df = df.style

    # Create a boolean mask to identify rows with values in the specified list
    mask = df[column_name].isin(value_list)
    
    # variables to calculate red rows
    baseline = df.rmse_validate[0]
    one_percent = df.rmse_validate[0] * .01
    
    # variables to calculate green rows
    lowest_difference_val = df.model_difference.sort_values().head(1).tolist()[0]
    rmse_of_lowest_diff_val = df.groupby('model_difference').min().head(1).rmse_validate.tolist()[0]
    
    # Apply the desired styling to the rows in the mask
    styled_df = styled_df.apply(lambda row: ['background-color: yellow'] * len(row) if row[column_name] in value_list else [''] * len(row), axis=1)
    
    styled_df = styled_df.apply(lambda row: ['background-color: limegreen'] * len(row) if (row.model_difference <= value_list[0] + (value_list[0] * .000001)) and (row.rmse_validate <= rmse_of_lowest_diff_val) else [''] * len(row), axis=1)
    
    styled_df = styled_df.apply(lambda row: ['background-color: red'] * len(row) if (row.rmse_validate > baseline - one_percent) and (row.rmse_validate < baseline + one_percent) else [''] * len(row), axis=1)


    return styled_df



In [200]:
def build_regression_models(X_train, y_train, X_validate, y_validate, target):
    """
    Builds an OLS model, 5 LassoLars models, and 3 polynomial models and outputs them in a dataframe that highlights the good and bad performers with a legend to understand what each color means
    """
    # Ad hoc fixes
    y_train = pd.DataFrame(y_train)
    y_train['baseline'] = y_train[target].mean()
    
    y_validate = pd.DataFrame(y_validate)
    y_validate['baseline'] = y_validate[target].mean()
   
    y_validate = pd.DataFrame(y_validate)
    
    
    # Calculate the RMSE for the baseline model
    rmse_train_mu = mean_squared_error(y_train[target], y_train.baseline) ** 0.5
    rmse_validate_mu = mean_squared_error(y_validate[target], y_validate.baseline) ** 0.5

    # Initialize a dataframe to store the evaluation metrics and add the baseline RMSE
    results_df = pd.DataFrame(data=[{
    'model': 'Mean Baseline',
    'rmse_train': round(rmse_train_mu,2),
    'rmse_validate': round(rmse_validate_mu),
    'r_validate': round(explained_variance_score(y_validate[target], y_validate.baseline),2),
    'model_difference': round(abs(rmse_validate_mu - rmse_train_mu),2)
    }])
    
    # Build, evaluate, and append the OLS model to the results dataframe
    OLSmodel = LinearRegression()
    OLSmodel.fit(X_train, y_train[target])
    y_train['value_pred_ols'] = OLSmodel.predict(X_train)
    rmse_train = mean_squared_error(y_train[target], y_train.value_pred_ols) ** 0.5
    y_validate['value_pred_ols'] = OLSmodel.predict(X_validate)
    rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_ols) ** 0.5
    results_df = results_df.append({
        'model': 'OLS',
        'rmse_train': round(rmse_train,2),
        'rmse_validate': round(rmse_validate,2),
        'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_ols),2),
        'model_difference': round(abs(rmse_validate - rmse_train),2)
    }, ignore_index=True)
    
    # Build, evaluate, and append 5 LassoLars models to the results dataframe
    for i in range(1, 6):
        #Make the thing
        lassomodel = LassoLars(alpha=i/100)
        #Fit the thing
        lassomodel.fit(X_train, y_train[target])
        #Use the thing
        y_train['value_pred_lasso'] = lassomodel.predict(X_train)
        #Calculate performance metrics
        rmse_train = mean_squared_error(y_train[target], y_train.value_pred_lasso) ** 0.5
        #Repeat steps on the validate set
        y_validate['value_pred_lasso'] = lassomodel.predict(X_validate)
        rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_lasso) ** 0.5
        results_df = results_df.append({
            'model': 'LassoLars_a' + str(i/100),
            'rmse_train': round(rmse_train,2),
            'rmse_validate': round(rmse_validate,2),
            'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_lasso),2),
            'model_difference': abs(rmse_validate - rmse_train)
        }, ignore_index=True)
    
    # Build, evaluate, and append 3 Polynomial models to the results dataframe
    for i in range(2, 5):
        poly = PolynomialFeatures(degree=i, include_bias=False)
        X_train_poly = poly.fit_transform(X_train)
        X_validate_poly = poly.transform(X_validate)
        # Make the thing
        poly_reg = LinearRegression()
        # Fit the thing
        poly_reg.fit(X_train_poly, y_train[target])
        # Use the thing
        y_train['value_pred_poly'] = poly_reg.predict(X_train_poly)
        rmse_train = mean_squared_error(y_train[target], y_train.value_pred_poly) ** 0.5
        y_validate['value_pred_poly'] = poly_reg.predict(X_validate_poly)
        rmse_validate = mean_squared_error(y_validate[target], y_validate.value_pred_poly) ** 0.5
        results_df = results_df.append({
            'model': 'Polynomial_deg' + str(i),
            'rmse_train': round(rmse_train,2),
            'rmse_validate': round(rmse_validate,2),
            'r_validate': round(explained_variance_score(y_validate[target], y_validate.value_pred_poly),2),
            'model_difference': round(abs(rmse_validate - rmse_train),2)
        }, ignore_index=True)
    
    
    # create styled_df
    lowest_differences = results_df.model_difference.sort_values()[:3].tolist()
    styled_df = apply_highlights(results_df, 'model_difference', lowest_differences)


    # styling variables for legend string
    HIGHLIGHT_YELLOW = '\033[1;33;93m'  
    HIGHLIGHT_RED = '\033[1;33;91m'
    HIGHLIGHT_GREEN = '\033[1;33;92m'
    HIGHLIGHT_END = '\033[0m;'


#     print(f"""\n
# 10 regression models were successfully generated.
    
# {HIGHLIGHT_YELLOW}Yellow: the three models with the lowest train-validate difference (green may overlap with these and leave only two yellow rows){HIGHLIGHT_END}\n{HIGHLIGHT_RED}Red: the baseline plus all models within one percent of the baseline rmse{HIGHLIGHT_END}\n{HIGHLIGHT_GREEN}Green: either the model with the lowest train-validate difference or a model with a very low train-validate difference and the lowest validate rmse{HIGHLIGHT_END}""")
#     display(styled_df)
#     print("\nExamine the above models and determine the best one for your purposes.\nGenerally, a green model will perform the best.\n")
    
    return results_df#, styled_df

In [209]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

def train_validate_regression_model(X_train, y_train, X_validate, y_validate, best_relationships, i):
    '''
    Takes in train and validate data with a list of features and evaluates an OLS model.
    '''
    #
    predictive_features = ['volatile acidity', 'alcohol', 'density', 'chlorides', 'cluster']
    X_train = X_train[predictive_features]
    X_validate = X_validate[predictive_features]
    
    dummies = pd.get_dummies(X_train.cluster, drop_first=True)
    dummies = dummies.rename(columns={1: '1', 2:'2', 3:'3'})
    X_train = pd.concat([dummies, X_train], axis=1)
    X_train = X_train.drop(columns='cluster')

    dummies = pd.get_dummies(X_validate.cluster, drop_first=True)
    dummies = dummies.rename(columns={1: '1', 2:'2', 3:'3'})
    X_validate = pd.concat([dummies, X_validate], axis=1)
    X_validate = X_validate.drop(columns='cluster')
    
    # Initialize a results DataFrame to store model performance metrics
    results = pd.DataFrame()

    # Define the model name, each model name corresponds to the index of the set of features in the best_relationships list it was clustered off of
    model = 'OLS' + str(i)


    no_cluster_performance = build_regression_models(X_train.drop(columns=['1','2', '3']), y_train, X_validate.drop(columns=['1','2','3']), y_validate, 'quality')
    cluster_performance = build_regression_models(X_train, y_train, X_validate, y_validate, 'quality')
    
    results = pd.concat([no_cluster_performance, cluster_performance], axis=0)
    
#     ########## TRAIN RUN ############
#     # Initialize and fit the Linear Regression model on the training data with cluster information
#     lm = LinearRegression().fit(X_train, y_train)
#     # Initialize and fit the Linear Regression model on the training data without cluster information
#     lm_1 = LinearRegression().fit(X_train.drop(columns=['1', '2']), y_train)
    
#     # Evaluate cluster performance metrics on the training data
#     train_cluster_rmse = mean_squared_error(y_train, lm.predict(X_train), squared=False)
#     train_cluster_r2 = r2_score(y_train, lm.predict(X_train))
#     # Evaluate non-cluster performance metrics on the training data
#     train_rmse = mean_squared_error(y_train, lm.predict(X_train), squared=False)
#     train_r2 = r2_score(y_train, lm_1.predict(X_train.drop(columns=['1', '2'])))
                                
#     ########## VALIDATE RUN ############
#     # Evaluate performance metrics of the dataframe with cluster
#     validate_cluster_rmse = mean_squared_error(y_validate, lm.predict(X_validate), squared=False)
#     validate_cluster_r2 = r2_score(y_validate, lm.predict(X_validate))

#     validate_rmse = mean_squared_error(y_validate, lm_1.predict(X_validate.drop(columns=['1', '2'])), squared=False)
#     validate_r2 = r2_score(y_validate, lm_1.predict(X_validate.drop(columns=['1', '2'])))
    
#     ######## APPEND RESULTS ############
#     # Append the model performance metrics to the results DataFrame
#     results = results.append({
#         'model': model,
#         'train_rmse': train_rmse,
#         'train_r2': train_r2,
#         'validate_rmse': validate_rmse,
#         'validate_r2': validate_r2
#     }, ignore_index=True)

#     results = results.append({
#         'model': model + '_cluster',
#         'train_rmse': train_cluster_rmse,
#         'train_r2': train_cluster_r2,
#         'validate_rmse': validate_cluster_rmse,
#         'validate_r2': validate_cluster_r2
#     }, ignore_index=True)
    return results


In [210]:
def cluster_then_regress(X_train, y_train, X_validate, y_validate, best_relationships):
    '''
    Builds and evaluates a series of cluster models, then evaluates their predictive power in a series of 
    regression models with tuned hyperparameters. Outputs a dataframe of the results.
    '''
    results_df = pd.DataFrame()
    for i in range(len(best_relationships)):
        # Initiate the dataframe
        X = X_train[best_relationships[i]]
        X_v = X_validate[best_relationships[i]]
        
        model = 'model_' + str(i)
        
        # Make, fit the model
        kmeans = KMeans(n_clusters=4)
        kmeans.fit(X)

        # Make predictions and append to X_train
        X_train['cluster'] = kmeans.predict(X)
        X_validate['cluster'] = kmeans.predict(X_v)
        
        # Run the regression model
        results_df = results_df.append(train_validate_regression_model(X_train, y_train, X_validate, y_validate, best_relationships[i], i))
        print(f'Cluster {i + 1} Performance')
        display(results_df)
        print(results_df.sort_values('rmse_validate').iloc[0])
    # Return the results
    return results_df

In [211]:
cluster_then_regress(X_train, y_train, X_validate, y_validate, best_relationships)

Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,1,0,0.060000,0.420290,0.266925,0.035058
2,0,0,0,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,1,0,0.033333,0.463768,0.143596,0.035058
3720,0,1,0,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.306667,0.173913,0.478223,0.136895
1,0,0,0,0.106667,0.173913,0.482536,0.118531
2,0,0,0,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,1,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,1,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,0,0,0.100000,0.246377,0.409228,0.043406
796,0,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 1 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.0,0.0,0.01
1,OLS,0.74,0.74,0.27,0.01
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
5,LassoLars_a0.04,0.8,0.8,0.16,0.003466
6,LassoLars_a0.05,0.82,0.82,0.12,0.004358
7,Polynomial_deg2,0.74,0.75,0.27,0.01
8,Polynomial_deg3,0.73,2.19,-5.32,1.46
9,Polynomial_deg4,0.72,2.99,-10.84,2.27


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.160000,0.405797,0.252264,0.043406
1,0,0,0,0.060000,0.420290,0.266925,0.035058
2,0,1,0,0.146667,0.173913,0.366106,0.070117
3,0,0,1,0.153333,0.565217,0.206123,0.043406
4,0,0,1,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,0,0.033333,0.463768,0.143596,0.035058
3720,0,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,0,1,0.200000,0.869565,0.160845,0.060100
3722,0,0,1,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,1,0,0.153333,0.289855,0.530832,0.060100
3,0,0,0,0.146667,0.434783,0.204398,0.053422
4,0,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,1,0,0,0.313333,0.333333,0.529970,0.131886
794,0,0,1,0.260000,0.806763,0.203105,0.060100
795,0,1,0,0.100000,0.246377,0.409228,0.043406
796,0,1,0,0.106667,0.202899,0.271238,0.066778


Cluster 2 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.0,0.0,0.01
1,OLS,0.74,0.74,0.27,0.01
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
5,LassoLars_a0.04,0.8,0.8,0.16,0.003466
6,LassoLars_a0.05,0.82,0.82,0.12,0.004358
7,Polynomial_deg2,0.74,0.75,0.27,0.01
8,Polynomial_deg3,0.73,2.19,-5.32,1.46
9,Polynomial_deg4,0.72,2.99,-10.84,2.27


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.160000,0.405797,0.252264,0.043406
1,0,0,0,0.060000,0.420290,0.266925,0.035058
2,1,0,0,0.146667,0.173913,0.366106,0.070117
3,0,1,0,0.153333,0.565217,0.206123,0.043406
4,0,1,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,0,0.033333,0.463768,0.143596,0.035058
3720,0,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,1,0,0.200000,0.869565,0.160845,0.060100
3722,0,1,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,1,0,0,0.153333,0.289855,0.530832,0.060100
3,0,0,0,0.146667,0.434783,0.204398,0.053422
4,0,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,1,0,0,0.313333,0.333333,0.529970,0.131886
794,0,1,0,0.260000,0.806763,0.203105,0.060100
795,1,0,0,0.100000,0.246377,0.409228,0.043406
796,1,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 3 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.0,0.0,0.01
1,OLS,0.74,0.74,0.27,0.01
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
5,LassoLars_a0.04,0.8,0.8,0.16,0.003466
6,LassoLars_a0.05,0.82,0.82,0.12,0.004358
7,Polynomial_deg2,0.74,0.75,0.27,0.01
8,Polynomial_deg3,0.73,2.19,-5.32,1.46
9,Polynomial_deg4,0.72,2.99,-10.84,2.27


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.160000,0.405797,0.252264,0.043406
1,0,0,0,0.060000,0.420290,0.266925,0.035058
2,0,1,0,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,1,0.033333,0.463768,0.143596,0.035058
3720,0,0,1,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,0,0,0.153333,0.289855,0.530832,0.060100
3,0,0,0,0.146667,0.434783,0.204398,0.053422
4,0,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,0,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,1,0,0.100000,0.246377,0.409228,0.043406
796,0,1,0,0.106667,0.202899,0.271238,0.066778


Cluster 4 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.80,0.79,0.17,0.004660
6,LassoLars_a0.05,0.80,0.80,0.16,0.005052
7,Polynomial_deg2,0.73,0.74,0.29,0.000000
8,Polynomial_deg3,0.72,2.39,-6.55,1.670000


model               Polynomial_deg2
rmse_train                     0.73
rmse_validate                  0.74
r_validate                     0.27
model_difference               0.01
Name: 7, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,1,0,0.060000,0.420290,0.266925,0.035058
2,0,0,1,0.146667,0.173913,0.366106,0.070117
3,0,0,0,0.153333,0.565217,0.206123,0.043406
4,0,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,1,0,0.033333,0.463768,0.143596,0.035058
3720,0,1,0,0.046667,0.347826,0.245364,0.058431
3721,0,0,0,0.200000,0.869565,0.160845,0.060100
3722,0,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,1,0,0,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,1,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,1,0.313333,0.333333,0.529970,0.131886
794,0,0,0,0.260000,0.806763,0.203105,0.060100
795,1,0,0,0.100000,0.246377,0.409228,0.043406
796,0,0,1,0.106667,0.202899,0.271238,0.066778


Cluster 5 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.80,0.79,0.17,0.009134
6,LassoLars_a0.05,0.82,0.81,0.13,0.010434
7,Polynomial_deg2,0.73,0.75,0.26,0.020000
8,Polynomial_deg3,0.72,2.89,-10.00,2.170000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.28
model_difference     0.0
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,1,0,0.060000,0.420290,0.266925,0.035058
2,0,0,1,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,1,0,0.033333,0.463768,0.143596,0.035058
3720,0,1,0,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,0,0,1,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,1,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,0,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,0,1,0.100000,0.246377,0.409228,0.043406
796,0,0,1,0.106667,0.202899,0.271238,0.066778


Cluster 6 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.79,0.78,0.19,0.002201
6,LassoLars_a0.05,0.79,0.79,0.17,0.001904
7,Polynomial_deg2,0.73,0.75,0.26,0.020000
8,Polynomial_deg3,0.72,2.58,-7.81,1.860000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.28
model_difference     0.0
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.160000,0.405797,0.252264,0.043406
1,1,0,0,0.060000,0.420290,0.266925,0.035058
2,0,0,1,0.146667,0.173913,0.366106,0.070117
3,0,1,0,0.153333,0.565217,0.206123,0.043406
4,0,1,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,1,0,0,0.033333,0.463768,0.143596,0.035058
3720,1,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,1,0,0.200000,0.869565,0.160845,0.060100
3722,0,1,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,0,0,1,0.153333,0.289855,0.530832,0.060100
3,1,0,0,0.146667,0.434783,0.204398,0.053422
4,1,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,0,0.313333,0.333333,0.529970,0.131886
794,0,1,0,0.260000,0.806763,0.203105,0.060100
795,0,0,1,0.100000,0.246377,0.409228,0.043406
796,0,0,1,0.106667,0.202899,0.271238,0.066778


Cluster 7 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.80,0.80,0.16,0.000713
6,LassoLars_a0.05,0.80,0.80,0.16,0.001257
7,Polynomial_deg2,0.73,0.79,0.18,0.060000
8,Polynomial_deg3,0.72,2.40,-6.61,1.680000


model               Polynomial_deg2
rmse_train                     0.73
rmse_validate                  0.74
r_validate                     0.29
model_difference                0.0
Name: 7, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.160000,0.405797,0.252264,0.043406
1,0,0,1,0.060000,0.420290,0.266925,0.035058
2,0,1,0,0.146667,0.173913,0.366106,0.070117
3,0,0,0,0.153333,0.565217,0.206123,0.043406
4,0,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,1,0.033333,0.463768,0.143596,0.035058
3720,0,0,1,0.046667,0.347826,0.245364,0.058431
3721,0,0,0,0.200000,0.869565,0.160845,0.060100
3722,0,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,1,0,0.153333,0.289855,0.530832,0.060100
3,0,0,1,0.146667,0.434783,0.204398,0.053422
4,0,0,1,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,1,0,0,0.313333,0.333333,0.529970,0.131886
794,0,0,0,0.260000,0.806763,0.203105,0.060100
795,0,1,0,0.100000,0.246377,0.409228,0.043406
796,0,1,0,0.106667,0.202899,0.271238,0.066778


Cluster 8 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.79,0.79,0.19,0.007117
6,LassoLars_a0.05,0.81,0.80,0.15,0.008301
7,Polynomial_deg2,0.73,0.75,0.26,0.020000
8,Polynomial_deg3,0.72,2.05,-4.57,1.330000


model               Polynomial_deg2
rmse_train                     0.73
rmse_validate                  0.74
r_validate                     0.29
model_difference                0.0
Name: 7, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.160000,0.405797,0.252264,0.043406
1,1,0,0,0.060000,0.420290,0.266925,0.035058
2,0,0,0,0.146667,0.173913,0.366106,0.070117
3,0,0,1,0.153333,0.565217,0.206123,0.043406
4,0,0,1,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,1,0.033333,0.463768,0.143596,0.035058
3720,0,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,0,1,0.200000,0.869565,0.160845,0.060100
3722,0,0,1,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.306667,0.173913,0.478223,0.136895
1,1,0,0,0.106667,0.173913,0.482536,0.118531
2,1,0,0,0.153333,0.289855,0.530832,0.060100
3,1,0,0,0.146667,0.434783,0.204398,0.053422
4,1,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,1,0,0.313333,0.333333,0.529970,0.131886
794,0,0,1,0.260000,0.806763,0.203105,0.060100
795,0,0,0,0.100000,0.246377,0.409228,0.043406
796,0,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 9 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.79,0.79,0.17,0.002378
6,LassoLars_a0.05,0.80,0.80,0.15,0.001869
7,Polynomial_deg2,0.73,0.75,0.26,0.020000
8,Polynomial_deg3,0.72,2.73,-8.80,2.010000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,1,0,0.060000,0.420290,0.266925,0.035058
2,0,0,0,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,1,0,0.033333,0.463768,0.143596,0.035058
3720,0,1,0,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.306667,0.173913,0.478223,0.136895
1,0,0,0,0.106667,0.173913,0.482536,0.118531
2,0,0,0,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,1,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,1,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,0,0,0.100000,0.246377,0.409228,0.043406
796,0,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 10 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.80,0.80,0.16,0.001824
6,LassoLars_a0.05,0.81,0.81,0.14,0.000536
7,Polynomial_deg2,0.73,0.74,0.28,0.010000
8,Polynomial_deg3,0.72,1.77,-3.12,1.050000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.28
model_difference     0.0
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,1,0,0.060000,0.420290,0.266925,0.035058
2,1,0,0,0.146667,0.173913,0.366106,0.070117
3,0,0,0,0.153333,0.565217,0.206123,0.043406
4,0,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,1,0.033333,0.463768,0.143596,0.035058
3720,1,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,0,0,0.200000,0.869565,0.160845,0.060100
3722,0,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,1,0,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,1,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,1,0.313333,0.333333,0.529970,0.131886
794,0,0,0,0.260000,0.806763,0.203105,0.060100
795,1,0,0,0.100000,0.246377,0.409228,0.043406
796,1,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 11 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.81,0.80,0.15,0.002084
6,LassoLars_a0.05,0.83,0.82,0.10,0.002296
7,Polynomial_deg2,0.73,0.76,0.23,0.040000
8,Polynomial_deg3,0.72,2.86,-9.76,2.140000


model               Polynomial_deg2
rmse_train                     0.73
rmse_validate                  0.74
r_validate                     0.28
model_difference               0.01
Name: 7, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.160000,0.405797,0.252264,0.043406
1,0,0,1,0.060000,0.420290,0.266925,0.035058
2,0,0,1,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,1,0,0,0.033333,0.463768,0.143596,0.035058
3720,0,0,1,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,0,0,0.153333,0.289855,0.530832,0.060100
3,1,0,0,0.146667,0.434783,0.204398,0.053422
4,0,0,1,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,1,0,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,0,0,0.100000,0.246377,0.409228,0.043406
796,0,0,1,0.106667,0.202899,0.271238,0.066778


Cluster 12 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.81,0.80,0.16,0.006760
6,LassoLars_a0.05,0.81,0.80,0.15,0.006758
7,Polynomial_deg2,0.73,0.86,0.01,0.130000
8,Polynomial_deg3,0.72,0.77,0.22,0.050000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,1,0,0.160000,0.405797,0.252264,0.043406
1,0,0,0,0.060000,0.420290,0.266925,0.035058
2,0,1,0,0.146667,0.173913,0.366106,0.070117
3,0,0,1,0.153333,0.565217,0.206123,0.043406
4,0,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,1,0.033333,0.463768,0.143596,0.035058
3720,0,0,1,0.046667,0.347826,0.245364,0.058431
3721,0,1,0,0.200000,0.869565,0.160845,0.060100
3722,0,0,1,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,0,1,0,0.153333,0.289855,0.530832,0.060100
3,0,1,0,0.146667,0.434783,0.204398,0.053422
4,0,0,1,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,1,0,0,0.313333,0.333333,0.529970,0.131886
794,0,1,0,0.260000,0.806763,0.203105,0.060100
795,0,0,1,0.100000,0.246377,0.409228,0.043406
796,0,0,0,0.106667,0.202899,0.271238,0.066778


Cluster 13 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.80,0.80,0.16,0.003466
6,LassoLars_a0.05,0.82,0.82,0.12,0.004358
7,Polynomial_deg2,0.73,0.92,-0.11,0.190000
8,Polynomial_deg3,0.72,2.46,-6.99,1.740000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,1,0,0,0.160000,0.405797,0.252264,0.043406
1,1,0,0,0.060000,0.420290,0.266925,0.035058
2,0,1,0,0.146667,0.173913,0.366106,0.070117
3,0,0,1,0.153333,0.565217,0.206123,0.043406
4,0,0,1,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,1,0,0,0.033333,0.463768,0.143596,0.035058
3720,1,0,0,0.046667,0.347826,0.245364,0.058431
3721,0,0,1,0.200000,0.869565,0.160845,0.060100
3722,0,0,1,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.306667,0.173913,0.478223,0.136895
1,0,1,0,0.106667,0.173913,0.482536,0.118531
2,0,1,0,0.153333,0.289855,0.530832,0.060100
3,1,0,0,0.146667,0.434783,0.204398,0.053422
4,1,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,0,0,0.313333,0.333333,0.529970,0.131886
794,0,0,1,0.260000,0.806763,0.203105,0.060100
795,0,1,0,0.100000,0.246377,0.409228,0.043406
796,0,1,0,0.106667,0.202899,0.271238,0.066778


Cluster 14 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.79,0.79,0.19,0.004627
6,LassoLars_a0.05,0.80,0.80,0.17,0.004011
7,Polynomial_deg2,0.73,0.76,0.24,0.030000
8,Polynomial_deg3,0.72,1.82,-3.37,1.100000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.160000,0.405797,0.252264,0.043406
1,0.060000,0.420290,0.266925,0.035058
2,0.146667,0.173913,0.366106,0.070117
3,0.153333,0.565217,0.206123,0.043406
4,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...
3719,0.033333,0.463768,0.143596,0.035058
3720,0.046667,0.347826,0.245364,0.058431
3721,0.200000,0.869565,0.160845,0.060100
3722,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,volatile acidity,alcohol,density,chlorides
0,0.306667,0.173913,0.478223,0.136895
1,0.106667,0.173913,0.482536,0.118531
2,0.153333,0.289855,0.530832,0.060100
3,0.146667,0.434783,0.204398,0.053422
4,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...
793,0.313333,0.333333,0.529970,0.131886
794,0.260000,0.806763,0.203105,0.060100
795,0.100000,0.246377,0.409228,0.043406
796,0.106667,0.202899,0.271238,0.066778


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,0,0.160000,0.405797,0.252264,0.043406
1,0,0,0,0.060000,0.420290,0.266925,0.035058
2,0,0,1,0.146667,0.173913,0.366106,0.070117
3,1,0,0,0.153333,0.565217,0.206123,0.043406
4,1,0,0,0.120000,0.637681,0.260457,0.043406
...,...,...,...,...,...,...,...
3719,0,0,0,0.033333,0.463768,0.143596,0.035058
3720,0,0,0,0.046667,0.347826,0.245364,0.058431
3721,1,0,0,0.200000,0.869565,0.160845,0.060100
3722,1,0,0,0.126667,0.637681,0.153083,0.055092


Unnamed: 0,1,2,3,volatile acidity,alcohol,density,chlorides
0,0,0,1,0.306667,0.173913,0.478223,0.136895
1,0,0,1,0.106667,0.173913,0.482536,0.118531
2,0,0,1,0.153333,0.289855,0.530832,0.060100
3,0,0,0,0.146667,0.434783,0.204398,0.053422
4,0,0,0,0.146667,0.347826,0.345839,0.070117
...,...,...,...,...,...,...,...
793,0,1,0,0.313333,0.333333,0.529970,0.131886
794,1,0,0,0.260000,0.806763,0.203105,0.060100
795,0,0,1,0.100000,0.246377,0.409228,0.043406
796,0,0,1,0.106667,0.202899,0.271238,0.066778


Cluster 15 Performance


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.78,0.78,0.19,0.002030
6,LassoLars_a0.05,0.79,0.79,0.17,0.001675
7,Polynomial_deg2,0.73,0.75,0.27,0.020000
8,Polynomial_deg3,0.72,3.36,-13.91,2.640000


model                OLS
rmse_train          0.74
rmse_validate       0.74
r_validate          0.27
model_difference    0.01
Name: 1, dtype: object


Unnamed: 0,model,rmse_train,rmse_validate,r_validate,model_difference
0,Mean Baseline,0.88,1.00,0.00,0.010000
1,OLS,0.74,0.74,0.27,0.010000
2,LassoLars_a0.01,0.75,0.75,0.26,0.002832
3,LassoLars_a0.02,0.77,0.77,0.22,0.000701
4,LassoLars_a0.03,0.79,0.78,0.19,0.002552
...,...,...,...,...,...
5,LassoLars_a0.04,0.78,0.78,0.19,0.002030
6,LassoLars_a0.05,0.79,0.79,0.17,0.001675
7,Polynomial_deg2,0.73,0.75,0.27,0.020000
8,Polynomial_deg3,0.72,3.36,-13.91,2.640000


In [219]:
print('Results:')
for i in [4,7,8,11]:
    print(f'''
k=4 clustering off of {best_relationships[i]} 
fed into a Polynomial_deg2 model produced a .01 percent boost in model performance
''')

Results:

k=4 clustering off of ['residual sugar', 'free sulfur dioxide', 'alcohol'] 
fed into a Polynomial_deg2 model produced a .01 percent boost in model performance


k=4 clustering off of ['volatile acidity', 'free sulfur dioxide', 'alcohol'] 
fed into a Polynomial_deg2 model produced a .01 percent boost in model performance


k=4 clustering off of ['alcohol', 'pH', 'volatile acidity'] 
fed into a Polynomial_deg2 model produced a .01 percent boost in model performance


k=4 clustering off of ['alcohol', 'fixed acidity', 'residual sugar'] 
fed into a Polynomial_deg2 model produced a .01 percent boost in model performance



In [None]:
# convert target columns to dataframes
y_train = pd.DataFrame(y_train)
y_validate = pd.DataFrame(y_validate)

# Assign the baseline
y_train['baseline'] = y_train.quality.mean()
y_validate['baseline']= y_validate.quality.mean()

#

In [None]:
features = ['alcohol', 'volatile acidity', 'chlorides']
target = ['quality']

train, validate, test = split_data(df[features + target + ['red']],
                                   validate_size=.15, test_size=.15, 
                                   stratify_col='red', random_state=123)

# drop color column
train = train.iloc[:,:-1]
validate = validate.iloc[:,:-1]
test = test.iloc[:,:-1]

In [None]:
print(len(train), len(validate), len(test))
train.head()

In [None]:
# remove target
X_train = train[features]
X_validate = validate[features]
X_test = test[features]

# only add target
y_train = train[target]
y_vaildate = validate[target]
y_test = test[target]

In [None]:
scaler = MinMaxScaler()

In [None]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

## Modeling before Clustering

**Baseline Model**

In [None]:
dummy = DummyRegressor().fit(X_train_scaled, y_train)

In [None]:
train['baseline_pred'] = dummy.predict(X_train_scaled)

In [None]:
train.head()

Evaluate

In [None]:
# RMSE
mean_squared_error(train['quality'],
                   train['baseline_pred'],
                   squared=False)

In [None]:
# R2
r2_score(train['quality'],
                   train['baseline_pred'])

**Linear Regression Model**

In [None]:
def run_lm_model(X, y, features):
    
    # run model
    lm = LinearRegression().fit(X, y)
    
    # RMSE
    rmse = mean_squared_error(y, lm.predict(X), squared=False)
    # R2
    r2 = r2_score(y, lm.predict(X))
    
    
    print(f'RMSE = {rmse}\nR2 = {r2}')
    display(pd.DataFrame(index=features + ['intercept'],
             columns=['coefficients'],
             data=np.append(lm.coef_ * scaler.scale_, lm.intercept_)))
    
    return rmse, r2

In [None]:
run_lm_model(X_train, y_train)

Our model starts its prediction at 5.26 and:
- adds .33 for every 1 unit of alcohol
- subtracts -1.25 for every .1 units of volatile acidity
- adds .03 for every .1 for every .1 units of chorides

## Preprocessing after Clustering

Features: `['alcohol', 'volatile acidity', 'chlorides', 'clusters_1']`


Encode clusters

In [None]:
df = pd.concat([df, 
                pd.get_dummies(df[['clusters_1','clusters_2','clusters_3']].astype(str))],
                axis=1)
df.head()

Split data

In [None]:
features = ['alcohol', 'volatile acidity', 'chlorides',
            'clusters_1_0', 'clusters_1_1',
            'clusters_1_2', 'clusters_1_3']
target = ['quality']

train, validate, test = split_data(df[features + target + ['red']],
                                   validate_size=.15, test_size=.15, 
                                   stratify_col='red', random_state=123)

# drop color column
train = train.iloc[:,:-1]
validate = validate.iloc[:,:-1]
test = test.iloc[:,:-1]

In [None]:
print(len(train), len(validate), len(test))
train.head()

In [None]:
# remove target
X_train = train[features]
X_validate = validate[features]
X_test = test[features]

# only add target
y_train = train[target]
y_vaildate = validate[target]
y_test = test[target]

In [None]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_validate_scaled = scaler.transform(X_validate)
X_test_scaled = scaler.transform(X_test)

## Modeling on first group of clusters

**Linear Regression Model**

In [None]:
run_lm_model(X_train, y_train)