In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the white-wine quality CSV; a comma is already read_csv's default separator.
df = pd.read_csv('../input/wine-quality/wineQualityWhites.csv')

In [None]:
# Preview the first rows to check how the file was parsed.
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably the CSV's saved row index — verify).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run after the column has already been removed.
df = df.drop(columns='Unnamed: 0', errors='ignore')

---

### 1. Separate the target feature ('quality'), split the data in a 7:3 proportion (30% forms the holdout set; use random_state=17), and preprocess the data with StandardScaler

In [None]:
# Target vector first, then every remaining column as the feature matrix.
y = df['quality']
X = df.drop(columns=['quality'])

In [None]:
from sklearn.model_selection import train_test_split

#### Splitting the Data with 30% as Holdout Set

In [None]:
# Hold out 30% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

#### Scaling the Data

In [None]:
from sklearn.preprocessing import StandardScaler
# Unfitted scaler; it is fitted on the training split in the scaling cell below.
scaler = StandardScaler()

#### Creating variables for Scaled X_train and X_test

In [None]:
# Learn the scaling statistics (mean / std) from the training split only.
X_train_scale = scaler.fit_transform(X_train)
# Reuse the training statistics on the holdout split. Calling fit_transform here
# would re-fit the scaler on test data, leaking holdout information and putting
# the two splits on different scales.
X_test_scale = scaler.transform(X_test)

---

### 2. Train a simple linear regression model using scikit-learn.

In [None]:
from sklearn.linear_model import LinearRegression

#### Initializing LinearRegression() object and fitting it

In [None]:
# Ordinary least squares on the standardized training features.
linreg = LinearRegression()
linreg.fit(X=X_train_scale, y=y_train)

---

### 3. What are the mean squared errors of the model's predictions on the train and holdout sets?

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predict on both splits with the fitted linear model (train first, then holdout).
y_train_pred, y_test_pred = (
    linreg.predict(features) for features in (X_train_scale, X_test_scale)
)

#### Mean Squared Error of Model Predictions on Train Set

In [None]:
# Training-set MSE of the linear model, rounded to two decimals.
round(mean_squared_error(y_true=y_train, y_pred=y_train_pred), ndigits=2)

#### Mean Squared Error of Model Predictions on Holdout Set

In [None]:
# Holdout-set MSE of the linear model, rounded to two decimals.
round(mean_squared_error(y_true=y_test, y_pred=y_test_pred), ndigits=2)

---

### 4. Create a data frame to display the coefficient of each feature.

In [None]:
# One row per feature with its fitted linear-regression weight, rounded to 3 decimals.
coef_df = pd.DataFrame(
    {'Features': X.columns, 'Coeff.': linreg.coef_}
).round({'Coeff.': 3})

In [None]:
# Display the feature/coefficient table in its original column order.
coef_df

---

### 5. Which feature does this linear regression model treat as the most influential on wine quality?

In [None]:
# "Most influential" is judged by coefficient magnitude (the inputs were
# standardized), so rank rows by |Coeff.| instead of by signed value; a large
# negative weight would otherwise sink to the bottom. The signed coefficient is
# still what gets displayed.
magnitude_order = coef_df['Coeff.'].abs().sort_values(ascending=False).index
coef_df.loc[magnitude_order]

#### Thus, 'residual sugar' has the most POSITIVE influence on quality (coefficient 0.538), while 'density' has the most NEGATIVE influence (coefficient -0.666). By absolute magnitude, 'density' is therefore the most influential feature.

---

### 6. Train a LASSO model with α=0.01 and scaled data. Again, set random_state=17.

In [None]:
from sklearn.linear_model import Lasso

#### Initializing Lasso() object and fitting it (random_state = 17 and max_iter = 250)

In [None]:
# L1-regularized linear model with the alpha prescribed by the exercise.
lasso = Lasso(
    alpha=0.01,
    random_state=17,
    max_iter=250,
)

In [None]:
# Fit the Lasso model on the standardized training split.
lasso.fit(X=X_train_scale, y=y_train)

---

### 7. Which feature is the least informative in predicting wine quality, according to this LASSO model?

#### Building a dataframe with Features and their Coefficient

In [None]:
# Feature/coefficient table for the alpha=0.01 Lasso model, rounded to 3 decimals.
coef_lasso_df = pd.DataFrame(
    {'Features': X.columns, 'Coeff.': lasso.coef_}
).round({'Coeff.': 3})

In [None]:
# "Least informative" means the smallest |coefficient|, so list features from
# weakest to strongest magnitude. A signed sort would bury the zeroed-out
# features in the middle of the table.
weakest_first = coef_lasso_df['Coeff.'].abs().sort_values().index
coef_lasso_df.loc[weakest_first]

#### Thus, 'fixed acidity', 'citric acid', 'total sulfur dioxide' are the least informative according to Lasso Model with coefficient of 0.00

---

### 8. Train LassoCV with random_state=17 to choose the best value of α (alpha) using 5-fold cross-validation (use LassoCV instead of GridSearchCV). The list of alphas to pass is [0.01, 0.001, 0.1, 0.2, 0.02, 0.002].

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
# Cross-validated Lasso over the candidate alphas given in the exercise (5 folds).
best_lasso = LassoCV(
    alphas=[0.01, 0.001, 0.1, 0.2, 0.02, 0.002],
    cv=5,
    random_state=17,
)

#### Initializing LassoCV() object and fitting it, with alpha = [0.01,0.001,0.1,0.2,0.02,0.002], cv = 5 and random_state 17

In [None]:
# Run the alpha search on the standardized training split.
best_lasso.fit(X=X_train_scale, y=y_train)

In [None]:
# Alpha chosen by the cross-validated search.
best_lasso.alpha_

#### Thus, from LassoCV we get a best alpha value of 0.001

---

### 9. Which feature is the least informative in predicting wine quality, according to the tuned LASSO model?

In [None]:
# Feature/coefficient table for the cross-validated Lasso model, rounded to 3 decimals.
coef_best_lasso_df = pd.DataFrame(
    {'Features': X.columns, 'Coeff.': best_lasso.coef_}
).round({'Coeff.': 3})

In [None]:
# "Least informative" means the smallest |coefficient|, so list features from
# weakest to strongest magnitude rather than by signed value.
weakest_first = coef_best_lasso_df['Coeff.'].abs().sort_values().index
coef_best_lasso_df.loc[weakest_first]

#### Thus, 'citric acid' is least informative according to the Tuned Lasso Model with a coefficient of 0.00

---

### 10. What are mean squared errors of tuned LASSO predictions on train and holdout sets?

In [None]:
# Building Lasso with best params

In [None]:
# Take alpha from the fitted LassoCV search instead of hard-coding it, so this
# model stays in sync if the data, the split, or the alpha grid changes.
tuned_lasso = Lasso(alpha=best_lasso.alpha_, max_iter=250, random_state=17)

#### Initializing tuned Lasso() object and fitting it, with alpha = 0.001 and random_state 17

In [None]:
# Fit the tuned Lasso model on the standardized training split.
tuned_lasso.fit(X=X_train_scale, y=y_train)

In [None]:
# Predict on both splits with the tuned Lasso model (train first, then holdout).
y_train_pred_tuned_lasso, y_test_pred_tuned_lasso = (
    tuned_lasso.predict(features) for features in (X_train_scale, X_test_scale)
)

#### Mean Squared Error of Model Predictions on Train Set

In [None]:
# Training-set MSE of the tuned Lasso model, rounded to two decimals.
round(mean_squared_error(y_true=y_train, y_pred=y_train_pred_tuned_lasso), ndigits=2)

#### Mean Squared Error of Model Predictions on Holdout Set

In [None]:
# Holdout-set MSE of the tuned Lasso model, rounded to two decimals.
round(mean_squared_error(y_true=y_test, y_pred=y_test_pred_tuned_lasso), ndigits=2)

---

### 11. Train a Random Forest, setting only random_state to be 17

In [None]:
from sklearn.ensemble import RandomForestRegressor

#### Initializing RandomForestRegressor() object and fitting it, random_state 17

In [None]:
# Random forest with library defaults; only the seed is fixed, for reproducibility.
random_reg = RandomForestRegressor(random_state = 17)

In [None]:
# Fit the default random forest on the standardized training split.
random_reg.fit(X=X_train_scale, y=y_train)

---

### 12. What are mean squared errors of tuned randomforest predictions on train and holdout sets?

In [None]:
# Predict on both splits with the default random forest (train first, then holdout).
y_train_pred_random_reg, y_test_pred_random_reg = (
    random_reg.predict(features) for features in (X_train_scale, X_test_scale)
)

#### Mean Squared Error of Model Predictions on Train Set

In [None]:
# Training-set MSE of the default random forest, rounded to two decimals.
round(mean_squared_error(y_true=y_train, y_pred=y_train_pred_random_reg), ndigits=2)

#### Mean Squared Error of Model Predictions on Holdout Set

In [None]:
# Holdout-set MSE of the default random forest, rounded to two decimals.
round(mean_squared_error(y_true=y_test, y_pred=y_test_pred_random_reg), ndigits=2)

---

### 13. Tune the max_features and max_depth hyperparameters with GridSearchCV and again check mean cross-validation MSE and MSE on holdout set. Parameters to tune 

### forest_params = {'max_depth': list(range(10, 25)),'max_features': list(range(6,12))}


In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search grid: max_depth 10..24 crossed with max_features 6..11.
forest_params = [
    {
        'max_depth': [*range(10, 25)],
        'max_features': [*range(6, 12)],
    }
]

#### Initializing GridSearchCV() object and fitting it with forest_params, and cv = 10

In [None]:
# Exhaustive 10-fold search over forest_params for a seeded random forest.
rf_gcv = GridSearchCV(
    estimator=RandomForestRegressor(random_state=17),
    param_grid=forest_params,
    cv=10,
)

In [None]:
# Run the grid search on the standardized training split.
rf_gcv.fit(X=X_train_scale, y=y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
rf_gcv.best_params_

#### Thus, we got best params as max_depth = 22 and max_features = 6

#### Building again RandomForestRegressor Model with Best Params from Grid Search CV

In [None]:
# Build the tuned forest from whatever the grid search selected instead of
# copying the values by hand, so it cannot drift from rf_gcv.best_params_.
rf_tuned = RandomForestRegressor(**rf_gcv.best_params_, random_state=17)

In [None]:
# Fit the tuned random forest on the standardized training split.
rf_tuned.fit(X=X_train_scale, y=y_train)

In [None]:
# Holdout predictions from the tuned random forest.
y_test_pred_rf_tuned = rf_tuned.predict(X=X_test_scale)

#### Finding the MSE of Holdout Set

In [None]:
# Holdout-set MSE of the tuned random forest, rounded to three decimals.
round(mean_squared_error(y_true=y_test, y_pred=y_test_pred_rf_tuned), ndigits=3)

In [None]:
from sklearn.model_selection import cross_val_score

#### Cross Validation Score for each run of Cross Validation

In [None]:
# Report cross-validated MSE, as the exercise asks. Without `scoring`, a
# regressor is scored with R^2, not MSE. scikit-learn exposes MSE as the negated
# 'neg_mean_squared_error' scorer, so flip the sign back.
# cross_val_score clones rf_tuned, so the already-fitted model is untouched.
# cv=10 matches the fold count used in the grid search.
fold_mse = -cross_val_score(
    rf_tuned,
    X_train_scale,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
)
# Per-fold MSE plus the mean across folds, in one table.
cv_mse_report = pd.Series(
    fold_mse,
    index=[f'fold {i}' for i in range(1, len(fold_mse) + 1)],
    name='MSE',
)
cv_mse_report['mean'] = fold_mse.mean()
cv_mse_report.round(3)

---

### 14. Output RF's feature importance. Again, it's nice to present it as a DataFrame. What is the most important feature, according to the Random Forest model?

In [None]:
# Feature-importance table for the tuned random forest, rounded to 3 decimals.
coef_rf_tuned_df = pd.DataFrame(
    {'Features': X.columns, 'Importance': rf_tuned.feature_importances_}
).round({'Importance': 3})

In [None]:
# Most important features first.
coef_rf_tuned_df.sort_values('Importance', ascending=False)

#### Thus, **'alcohol'** is the most important factor according to the Random Forest Regressor with Tuned Parameters

---