# Model

Plan:

- Develop a model to predict property value
- Use drivers identified in explore to build predictive regression models
- Create and run a baseline model with sklearn's `DummyRegressor` to compare our results to
- Create and run `Linear Regression`, `LassoLars`, and Polynomial regression models
- Use the insights from the highest-performing model (with highest test RMSE) to confirm our initial hypotheses and insights on the features that are the biggest drivers of property value

In [64]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import PolynomialFeatures

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LassoLars

from sklearn.metrics import mean_squared_error, r2_score

from sklearn.preprocessing import MinMaxScaler

from wrangle import split_data


## Preprocessing before Clustering

Features: `['alcohol', 'volatile acidity', 'chlorides']`

Encode Clusters

Scale features:
- MinMax

Before scaling, split data

In [65]:
df = pd.read_csv('wine_data_model.csv') 
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,red,clusters_1,clusters_2,clusters_3
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1,0,1,3
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1,0,1,3
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1,0,1,3
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1,1,3,3
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5,1,0,1,3


Encode Clusters

In [94]:
df = pd.concat([df, 
                pd.get_dummies(df[['clusters_1','clusters_2','clusters_3']].astype(str))],
                axis=1)
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,...,clusters_1_2,clusters_1_3,clusters_2_0,clusters_2_1,clusters_2_2,clusters_2_3,clusters_3_0,clusters_3_1,clusters_3_2,clusters_3_3
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,...,0,0,0,1,0,0,0,0,0,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,...,0,0,0,1,0,0,0,0,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,...,0,0,0,1,0,0,0,0,0,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,...,0,0,0,0,0,1,0,0,0,1
4,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,...,0,0,0,1,0,0,0,0,0,1


Split data

In [95]:
train, validate, test = split_data(df,
                                   validate_size=.15, test_size=.15, 
                                   stratify_col='red', random_state=123)

Scale data

In [96]:
scaler = MinMaxScaler()

train_sc = pd.concat([pd.DataFrame(data=scaler.fit_transform(train.drop(columns=['quality'])),
                                   columns=train.drop(columns=['quality']).columns),
                      train[['quality']].reset_index().iloc[:,1]],
                      axis=1)

validate_sc = pd.concat([pd.DataFrame(data=scaler.transform(validate.drop(columns=['quality'])),
                                   columns=validate.drop(columns=['quality']).columns),
                         validate[['quality']].reset_index().iloc[:,1]],
                         axis=1)

test_sc = pd.concat([pd.DataFrame(data=scaler.transform(test.drop(columns=['quality'])),
                                   columns=test.drop(columns=['quality']).columns),
                     test[['quality']].reset_index().iloc[:,1]],
                     axis=1)

## Modeling before Clustering

In [98]:
features = ['alcohol', 'volatile acidity', 'chlorides']
target = ['quality']

In [70]:
# pick features
X_train = train_sc[features]
X_validate = validate_sc[features]
X_test = test_sc[features]

# only add target
y_train = train_sc[target]
y_vaildate = validate_sc[target]
y_test = test_sc[target]

**Baseline Model**

In [71]:
dummy = DummyRegressor().fit(X_train, y_train)

Evaluate

In [74]:
# RMSE
mean_squared_error(y_train, dummy.predict(X_train),
                   squared=False)

0.8813204007103415

In [75]:
# R2
r2_score(y_train, dummy.predict(X_train))

0.0

**Linear Regression Model**

In [76]:
def run_lm_model(X, y):
    
    # run model
    lm = LinearRegression().fit(X, y)
    
    # RMSE
    rmse = mean_squared_error(y, lm.predict(X), squared=False)
    # R2
    r2 = r2_score(y, lm.predict(X))
    
    
    print(f'RMSE = {rmse}\nR2 = {r2}')
    
    return rmse, r2

In [77]:
run_lm_model(X_train, y_train)

RMSE = 0.7453522878997778
R2 = 0.2847538458381518


(0.7453522878997778, 0.2847538458381518)

In [78]:
run_lm_model(X_validate, y_vaildate)

RMSE = 0.7438028614227141
R2 = 0.28949845982961275


(0.7438028614227141, 0.28949845982961275)

## Modeling on first group of clusters

In [79]:
features = ['alcohol', 'volatile acidity', 'chlorides', 'clusters_1_0', 
            'clusters_1_1', 'clusters_1_2', 'clusters_1_3']
target = ['quality']

In [80]:
# pick features
X_train = train_sc[features]
X_validate = validate_sc[features]
X_test = test_sc[features]

# only add target
y_train = train_sc[target]
y_vaildate = validate_sc[target]
y_test = test_sc[target]

**Linear Regression Model**

In [81]:
run_lm_model(X_train, y_train)

RMSE = 0.7440900098919532
R2 = 0.28717437908506205


(0.7440900098919532, 0.28717437908506205)

In [82]:
run_lm_model(X_validate, y_vaildate)

RMSE = 0.7412573376185873
R2 = 0.2943532510241843


(0.7412573376185873, 0.2943532510241843)

## Modeling on second group of clusters

In [83]:
features = ['alcohol', 'volatile acidity', 'chlorides', 'clusters_2_0', 
            'clusters_2_1', 'clusters_2_2', 'clusters_2_3']
target = ['quality']

In [84]:
# pick features
X_train = train_sc[features]
X_validate = validate_sc[features]
X_test = test_sc[features]

# only add target
y_train = train_sc[target]
y_vaildate = validate_sc[target]
y_test = test_sc[target]

**Linear Regression Model**

In [85]:
run_lm_model(X_train, y_train)

RMSE = 0.7440498929526359
R2 = 0.28725123970924427


(0.7440498929526359, 0.28725123970924427)

In [86]:
run_lm_model(X_validate, y_vaildate)

RMSE = 0.7419938569885327
R2 = 0.29295028157527736


(0.7419938569885327, 0.29295028157527736)

## Modeling on third group of clusters

In [87]:
features = ['alcohol', 'volatile acidity', 'chlorides', 'clusters_3_0', 
            'clusters_3_1', 'clusters_3_2', 'clusters_3_3']
target = ['quality']

In [88]:
# pick features
X_train = train_sc[features]
X_validate = validate_sc[features]
X_test = test_sc[features]

# only add target
y_train = train_sc[target]
y_vaildate = validate_sc[target]
y_test = test_sc[target]

**Linear Regression Model**

In [89]:
run_lm_model(X_train, y_train)

RMSE = 0.7443081897965984
R2 = 0.2867562924957725


(0.7443081897965984, 0.2867562924957725)

In [90]:
run_lm_model(X_validate, y_vaildate)

RMSE = 0.7419954346029419
R2 = 0.2929472749384997


(0.7419954346029419, 0.2929472749384997)