In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file shipped under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1.0 Importing the dataset directly from Kaggle 




## 1.1 Importing packages and dataset



In [None]:
import warnings
# Ignore every FutureWarning raised from here on.
# NOTE(review): this presumably also hides deprecation notices emitted by the
# plotting/modeling libraries used below -- confirm that is intended.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import pandas as pd 
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

from scipy.stats import pearsonr, skew, boxcox, chi2
from scipy.stats.stats import pearsonr
from scipy import stats

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import plot_confusion_matrix,classification_report,mean_absolute_error
from sklearn.svm import SVC

from xgboost import XGBClassifier

from pprint import pprint

from imblearn.over_sampling import SMOTE

In [None]:
# Location of the red-wine quality CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Global matplotlib defaults used by every figure in the notebook.
for option, value in (
    ('font.size', 12),
    ('axes.titlesize', 20),
    ('figure.figsize', (24, 15)),
):
    plt.rcParams[option] = value

## 1.2 Custom functions

In [None]:
def confusion_matrix(model,X,y):
  """Plot a confusion matrix, normalised over the true labels, for a fitted model.

  Parameters
  ----------
  model : fitted classifier handed to scikit-learn's confusion-matrix plotter.
  X : features the model predicts on.
  y : true labels matching ``X``.

  The figure is resized to 15x8 inches and shown with ``plt.show()``;
  nothing is returned.
  """
  # `plot_confusion_matrix` was deprecated in scikit-learn 1.0 in favour of
  # `ConfusionMatrixDisplay.from_estimator`. Use the new API when this
  # scikit-learn version provides it, otherwise fall back to the legacy function.
  try:
    from sklearn.metrics import ConfusionMatrixDisplay
    draw_matrix = ConfusionMatrixDisplay.from_estimator
  except (ImportError, AttributeError):
    draw_matrix = plot_confusion_matrix

  matrix = draw_matrix(model, X, y, normalize='true', cmap=plt.cm.Reds)
  # Label through the display's own axes/figure instead of pyplot's implicit
  # "current axes", which may not be the matrix axes once a colorbar was added.
  matrix.ax_.set_title('Confusion Matrix')
  matrix.ax_.set_xlabel('Predicted Label')
  matrix.ax_.set_ylabel('True Label')
  matrix.figure_.set_size_inches(15, 8)
  plt.show()

# 2.0 Exploratory Data Analysis (EDA) 


## 2.1 Missing data and data info


In [None]:
# Preview the first rows of the freshly loaded dataset.
df.head()

In [None]:
# Visual check for missing values: any null cell would show up as a
# contrasting stripe in the heatmap.
is_missing = df.isna()
sns.heatmap(data=is_missing)

In [None]:
# Column dtypes and non-null counts (complements the missing-data heatmap above).
df.info() 

In [None]:
# Check whether the target 'quality' is evenly distributed across its classes.
# The series is passed as `x=` explicitly: newer seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['quality'])

Classes 3, 4 and 8 have far fewer samples than classes 5, 6 and 7. Let's see how many samples each class has.



In [None]:
# Number of samples per quality score.
df['quality'].value_counts()

We will have to correct class imbalance later

## 2.2 Plotting the variables 

In [None]:
# Separate the target from the features and remember the feature names.
X = df.drop(columns=['quality'])
y = df['quality']
X_cols = X.columns

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=0
)

Let's draw a boxplot of every variable to see whether the variables are skewed and contain many outliers.

In [None]:
# One box plot per feature (first 11 columns), grouped by quality, on a 3x4 grid.
for position, column in enumerate(X.columns[:11], start=1):
  plt.subplot(3, 4, position)
  sns.boxplot(x=y, y=X[column])
plt.tight_layout()

The most weird outliers due to their large range are given by the following variables:

- chlorides
- sulphates
- residual sugar 

Is there a reason to eliminate them from our prediction model? I don't know, since I'm not an expert in the wine industry and cannot tell whether these values are bizarre. However, we can compare the model accuracy with and without the outliers and decide based on the result.

Another option is to transform the data to treat these outliers




In [None]:
# No data is missing, so inspect the summary statistics, the shape and the
# per-column skewness for anything unusual.
summary_stats = df.describe()
display(summary_stats)
print(df.shape)
column_skew = df.skew()
print(column_skew)

It seems that most of our data is skewed, which is bad for the ML algorithm. Let's plot the distributions and see how close each one is to a Gaussian.


In [None]:
# Distribution plot for each of the first 11 columns of df on a 3x4 grid.
for position, column in enumerate(df.columns[:11], start=1):
  plt.subplot(3, 4, position)
  sns.distplot(df[column])
plt.tight_layout()

It is important for our data to have a Gaussian Distribution, which is not the case for most of the variables seen above.

These variables must be scaled while also dealing with the outliers present in the dataset


In [None]:
# Spearman rank correlation works for numerical and ordinal variables
# ('quality' is ordinal); Pearson is only meaningful for numerical values.
corrMatrix = df.corr(method='spearman')

# A clustermap clusters rows and columns hierarchically and orders them by
# similarity, which makes groups of correlated variables easier to spot.
g = sns.clustermap(
    corrMatrix,
    cmap="coolwarm",
    linewidths=1,
    annot=True,
    vmin=-1.0,
    vmax=1.0,
    cbar_kws={"ticks": [-1, -0.5, 0, 0.5, 1]},
)

g.fig.set_size_inches(15, 11)

Correlation maps are only meaningful for numerical values, so let's ignore the variable 'quality' and look for correlations between the independent variables.

For the dependent variable, we can see that the most important variables for the model are likely to be:
- alcohol (0.48)
- sulphates (0.38)
- volatile acidity (-0.38)

There is also a chance that the variables below will be important if we are more lenient with the choices:
- citric acid (0.21)
- total sulfur dioxide (0.2)
- chlorides (-0.19)
- density (-0.18)

No analysis on the independent variables was done. 




# 3.0 Reference Model

We will create a model without any preprocessing to have a reference of the precision


In [None]:
# Reference model: random forest on the raw, unprocessed features.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
pred = random_forest.predict(X_test)

# 2-fold cross-validation on the training split only.
cross_val = cross_val_score(random_forest, X_train, y_train, scoring="accuracy", cv=2)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 of the reference model's test predictions.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix of the reference model on the test split (helper from section 1.2).
confusion_matrix(random_forest,X_test,y_test)

# 4.0 Preprocessing the data for classification problem


We shall apply transforms to the variables to see how it improves the model. In this case, we shall try 3 things:

- Scale the outliers and run the model
- Remove the more extreme outliers and run the model
- Transform the data to a more Gaussian distribution and run the model
- Do all of the above 



## 4.1 Scaling the outliers 

A scaler was chosen to scale the distribution in such a way that it is robust to outliers

In [None]:
# Fit the scaler on the training split only and reuse it for the test split.
# NOTE(review): quantile_range=(25, 100) scales by (max - Q1), so the largest
# outlier drives the scale; the conventional robust choice is (25, 75) -- confirm intent.
robust_scaler = RobustScaler(quantile_range=(25,100))
X_train_scaled = robust_scaler.fit_transform(X_train)
X_test_scaled = robust_scaler.transform(X_test)

# Keep X_train's original index on the helper frame: the plots below pair it
# with y_train, and pandas/seaborn align Series by index, so a fresh 0..n-1
# index would match rows with the wrong quality labels.
X_train_scaled_aux = pd.DataFrame(X_train_scaled,columns=X_cols,index=X_train.index)

In [None]:
# Box plot of every scaled feature (first 11 columns) against the training labels.
for position, column in enumerate(X_train_scaled_aux.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.boxplot(x=y_train, y=X_train_scaled_aux[column])
plt.tight_layout()

# Observe the y axis values

In [None]:
# Distribution of every scaled training feature on a 4x3 grid.
for position, column in enumerate(X_train_scaled_aux.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.distplot(X_train_scaled_aux[column])
plt.tight_layout()

In [None]:
# Per-feature skewness of the training split before and after scaling.
skew_before = X_train.skew()
skew_after = X_train_scaled_aux.skew()
display(skew_before)   # before transformation
display(skew_after)    # after transformation

In [None]:
# Same forest as the reference model, trained on the robust-scaled features.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_scaled, y_train)
pred = random_forest.predict(X_test_scaled)

# 2-fold cross-validation on the scaled training split.
cross_val = cross_val_score(random_forest, X_train_scaled, y_train, scoring="accuracy", cv=2)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train_scaled, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test_scaled, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 for the model trained on scaled features.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the scaled test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_scaled,y_test)

No considerable improvement was shown

## 4.2 Removing  some outliers

In [None]:
# Quartiles are computed for every column, the IQR only for the features.
# 'quality' therefore gets NaN bounds below and can never flag a row.
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = df.drop(columns='quality').apply(stats.iqr)

# Only keep rows whose values all lie within 2.5*IQR of Q1 and Q3.
lower_bound = Q1 - 2.5*IQR
upper_bound = Q3 + 2.5*IQR
is_outlier_row = ((df < lower_bound) | (df > upper_bound)).any(axis=1)
df_cleaned = df[~is_outlier_row]

# Compare the number of rows left with the original size.
print(df_cleaned.shape, df.shape, sep='\n')

In [None]:
# Rebuild features/target from the outlier-free frame.
y_cleaned = df_cleaned['quality']
X_cleaned = df_cleaned.drop(['quality'],axis=1)
# Take the column names from the frame actually used below, not from the
# earlier unfiltered X.
X_cols = X_cleaned.columns


# NOTE: this overwrites the train/test variables of the previous sections.
X_train,X_test,y_train,y_test = train_test_split(X_cleaned,y_cleaned,train_size=0.80,random_state=0) 

In [None]:
# Box plot of every feature of the cleaned training split against its labels.
for position, column in enumerate(X_train.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.boxplot(x=y_train, y=X_train[column])
plt.tight_layout()

In [None]:
# Distribution of every feature of the cleaned training split.
for position, column in enumerate(X_train.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.distplot(X_train[column])
plt.tight_layout()

By removing only the more extreme outliers, the data distribution is a lot more gaussian for most of the variables and the skew has decreased


In [None]:
# Skewness of each feature in the cleaned training split.
display(X_train.skew())  # Considerably reduced skew compared with keeping all the outliers

In [None]:
# Same forest, trained on the split with the extreme outliers removed.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
pred = random_forest.predict(X_test)

# 2-fold cross-validation on the cleaned training split.
cross_val = cross_val_score(random_forest, X_train, y_train, scoring="accuracy", cv=2)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 after outlier removal.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the cleaned test split (helper from section 1.2).
confusion_matrix(random_forest,X_test,y_test)

Slight increase in test and training accuracy and improvement for the least present labels. Therefore we will keep this way of dealing with outliers.

There is no need to do robust scaling since the extreme outliers were removed. 

## 4.3 Fixing the skew of the data

In [None]:
# Yeo-Johnson power transform (standardised output), fitted on the training
# split only and then applied to the test split.
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_train_power = power.fit_transform(X_train)
X_test_power = power.transform(X_test)

# Helper frame for plotting/skew checks of the *training* data. It was
# previously built from X_test_power, so the "train" diagnostics below
# actually described the test split. The original index is kept so the frame
# stays aligned with y_train.
X_train_power_aux = pd.DataFrame(X_train_power,columns=X_cols,index=X_train.index)

In [None]:
# Distribution (10 bins) of every power-transformed feature on a 4x3 grid.
for position, column in enumerate(X_train_power_aux.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.distplot(X_train_power_aux[column], bins=10)
plt.tight_layout()

In [None]:
# Per-feature skewness before and after the power transform.
skew_before = X_train.skew()
skew_after = X_train_power_aux.skew()
display(skew_before)   # before transformation
display(skew_after)    # after transformation

With the exception of alcohol and citric acid, we can consider the data to be relatively gaussian with enough reduced skew to use in the model

In [None]:
# Same forest, trained on the power-transformed features.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_power, y_train)
pred = random_forest.predict(X_test_power)

# 2-fold cross-validation on the transformed training split.
cross_val = cross_val_score(random_forest, X_train_power, y_train, scoring="accuracy", cv=2)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train_power, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test_power, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 for the model trained on power-transformed features.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the power-transformed test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_power,y_test)

No considerable improvement by decreasing the skew

## 4.4 Fixing class imbalance

Since the model must be tested on real data values, SMOTE should only be applied to the training dataset.

The test dataset should maintain the proportions of the classes since it is more realistic

In [None]:
# Samples per class in the training split before oversampling.
y_train.value_counts()

In [None]:
# Unused alternative: explicit per-class sample targets that could be passed
# to SMOTE through `sampling_strategy`. Left disabled; the next cell does not
# pass a sampling strategy.
#sampling_strategy={3: 250,
#                   4: 300,
#                   8: 400,
#                   }

In [None]:
# Oversample the power-transformed training split only; the test split keeps
# its real class proportions. k_neighbors=2 is used because of the very small
# minority classes (see the discussion at the end of this section).
oversample = SMOTE(k_neighbors=2, random_state=0)
X_resampled, y_resampled = oversample.fit_resample(X_train_power, y_train)

In [None]:
# Class balance after SMOTE. Passed as `x=` explicitly because newer seaborn
# releases treat the first positional argument as `data`.
sns.countplot(x=y_resampled)

In [None]:
# Samples per class after oversampling.
pd.DataFrame(y_resampled).value_counts()

In [None]:
# Forest trained on the SMOTE-resampled data, evaluated on the untouched test split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_resampled, y_resampled)
pred = random_forest.predict(X_test_power)

# NOTE(review): the folds are drawn from already-oversampled data, so synthetic
# points and the real samples they were interpolated from can land in different
# folds; this CV score is likely optimistic compared with the test score.
cross_val = cross_val_score(random_forest, X_resampled, y_resampled, scoring="accuracy", cv=5)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_resampled, y_resampled)),
  ('Test accuracy score:\n', random_forest.score(X_test_power, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 for the model trained on oversampled data.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the (non-resampled) test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_power,y_test)

- Precision of class 8 has reduced considerably 
- Overall accuracy is lower
- Cross validation accuracy is higher due to the oversampling of the least represented categories 


Conclusion: SMOTE does not help this type of classification problem because there are too few data points in the least represented classes. SMOTE requires the 'k_neighbors' parameter to be lower than the number of samples in the least represented class. In this case, the lowest number is 3 (class 8), meaning k_neighbors can only be 2 or lower. If these points are very far apart from each other, the artificial data doesn't represent the class, which explains the reduced accuracy.

Cross-validation accuracy increased because the previously under-represented classes are now more present in the resampled data.

# 5.0 Processing the data for binary classification problem

In [None]:
# Split quality into two groups: (2, 6.5] -> 'Bad' and (6.5, 8] -> 'Good'
# (thresholds suggested by the main page of the problem).
# NOTE: this overwrites df['quality'] in place, so the cell must not be re-run
# without reloading df.
target_groups = ['Bad', 'Good']
bins = (2, 6.5, 8) # Suggested by the main page of the problem
quality_binned = pd.cut(df['quality'], bins=bins, labels=target_groups)
df['quality'] = quality_binned

In [None]:
# Replace the 'Bad'/'Good' labels with integer codes.
label_quality = LabelEncoder()
encoded_quality = label_quality.fit(df['quality']).transform(df['quality'])
df['quality'] = encoded_quality

In [None]:
# Samples per encoded quality class.
df['quality'].value_counts()

In [None]:
# Class balance of the binary target. Passed as `x=` explicitly because newer
# seaborn releases treat the first positional argument as `data`.
sns.countplot(x=df['quality'])

In [None]:
# Plot configurations (same values as the configuration cell in section 1.1).
for option, value in (
    ('font.size', 12),
    ('axes.titlesize', 20),
    ('figure.figsize', (24, 15)),
):
    plt.rcParams[option] = value

## 5.1 Reference Model


In [None]:
# Features/target for the binary problem, taken from the full frame again.
X = df.drop(columns=['quality'])
y = df['quality']
X_cols = X.columns

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=0
)

In [None]:
# Binary reference model: random forest on the unprocessed features.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
pred = random_forest.predict(X_test)

# 5-fold cross-validation on the training split.
cross_val = cross_val_score(random_forest, X_train, y_train, scoring="accuracy", cv=5)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 of the binary reference model.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix of the binary reference model on the test split (helper from section 1.2).
confusion_matrix(random_forest,X_test,y_test)

## 5.2 Removing outliers


In [None]:
# Outlier fence multiplier. The previous comment said 2*IQR while the code
# used 1.5; the value is now named once so comment and code cannot diverge.
iqr_multiplier = 1.5

Q1 = df.quantile(q=.25)
Q3 = df.quantile(q=.75)
# The IQR is computed for the features only. 'quality' is therefore missing
# from IQR, its bounds below become NaN, and comparisons against NaN are
# False, so the binary target can never cause a row to be dropped.
IQR = df.loc[ : , df.columns != 'quality'].apply(stats.iqr)

lower_bound = Q1 - iqr_multiplier*IQR
upper_bound = Q3 + iqr_multiplier*IQR

# Only keep rows whose values all lie within 1.5*IQR of Q1 and Q3.
df_cleaned = df[~((df < lower_bound) | (df > upper_bound)).any(axis=1)]

# Compare the number of rows left with the original size.
print(df_cleaned.shape)
print(df.shape)

In [None]:
# Features/target from the outlier-free frame of the binary problem.
X_cleaned = df_cleaned.drop(columns=['quality'])
y_cleaned = df_cleaned['quality']

# NOTE(review): this split uses random_state=24 while every other split in the
# notebook uses 0 -- confirm that is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned, y_cleaned, train_size=0.80, random_state=24
)

In [None]:
# Box plot of every cleaned training feature against the binary label.
for position, column in enumerate(X_train.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.boxplot(x=y_train, y=X_train[column])
plt.tight_layout()

In [None]:
# Distribution of every cleaned training feature.
for position, column in enumerate(X_train.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.distplot(X_train[column])
plt.tight_layout()

In [None]:
# Skewness of each feature in the cleaned training split (binary problem).
display(X_train.skew())  

In [None]:
# Binary forest trained on the outlier-free split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
pred = random_forest.predict(X_test)

# 5-fold cross-validation on the cleaned training split.
cross_val = cross_val_score(random_forest, X_train, y_train, scoring="accuracy", cv=5)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 after outlier removal (binary problem).
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the cleaned test split (helper from section 1.2).
confusion_matrix(random_forest,X_test,y_test)

High precision but pretty bad recall for the 1 (Good) label


## 5.3 Fixing the skew of the data 

In [None]:
# Yeo-Johnson power transform (standardised output), fitted on the training
# split only and then applied to the test split.
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_train_power = power.fit_transform(X_train)
X_test_power = power.transform(X_test)

# Helper frame for plotting/skew checks of the *training* data. It was
# previously built from X_test_power, so the "train" diagnostics below
# actually described the test split. The original index is kept so the frame
# stays aligned with y_train.
X_train_power_aux = pd.DataFrame(X_train_power,columns=X_cols,index=X_train.index)

In [None]:
# Distribution (10 bins) of every power-transformed feature on a 4x3 grid.
for position, column in enumerate(X_train_power_aux.columns[:11], start=1):
  plt.subplot(4, 3, position)
  sns.distplot(X_train_power_aux[column], bins=10)
plt.tight_layout()

In [None]:
# Per-feature skewness before and after the power transform.
skew_before = X_train.skew()
skew_after = X_train_power_aux.skew()
display(skew_before)   # before transformation
display(skew_after)    # after transformation

In [None]:
# Binary forest trained on the power-transformed features.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train_power, y_train)
pred = random_forest.predict(X_test_power)

# 5-fold cross-validation on the transformed training split.
cross_val = cross_val_score(random_forest, X_train_power, y_train, scoring="accuracy", cv=5)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_train_power, y_train)),
  ('Test accuracy score:\n', random_forest.score(X_test_power, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 for the binary model on power-transformed features.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the power-transformed test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_power,y_test)

Decreasing the skew had no impact on model precision

## 5.4 Fixing class imbalance

In [None]:
# Samples per class in the binary training split before oversampling.
y_train.value_counts()

In [None]:
# Target number of samples for class 1 after oversampling.
# NOTE(review): 800 is hard-coded; presumably chosen to roughly match the
# majority-class count shown in the previous cell -- confirm.
sampling_strategy = {1: 800}

In [None]:
# Oversample class 1 of the power-transformed training split up to the count
# given in `sampling_strategy`; the test split is left untouched.
oversample = SMOTE(
    sampling_strategy=sampling_strategy,
    k_neighbors=5,
    random_state=0,
)
X_resampled, y_resampled = oversample.fit_resample(X_train_power, y_train)

In [None]:
# Class balance after SMOTE. Passed as `x=` explicitly because newer seaborn
# releases treat the first positional argument as `data`.
sns.countplot(x=y_resampled)

In [None]:
# Samples per class after oversampling.
pd.DataFrame(y_resampled).value_counts()

In [None]:
# Binary forest trained on the SMOTE-resampled data, evaluated on the untouched test split.
random_forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_resampled, y_resampled)
pred = random_forest.predict(X_test_power)

# NOTE(review): the folds are drawn from already-oversampled data, so this CV
# score is likely optimistic compared with the test score.
cross_val = cross_val_score(random_forest, X_resampled, y_resampled, scoring="accuracy", cv=5)

for heading, metric in (
  ('Test MAE :\n', mean_absolute_error(y_test, pred)),
  ('Train accuracy score:\n', random_forest.score(X_resampled, y_resampled)),
  ('Test accuracy score:\n', random_forest.score(X_test_power, y_test)),
  ('Cross validation accuracy:\n', cross_val.mean()),
):
  print(heading, metric)

In [None]:
# Per-class precision/recall/F1 for the binary model trained on oversampled data.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix on the (non-resampled) test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_power,y_test)

SMOTE has been shown to be successful in increasing model performance on the recall metric.



## 5.5 Model optimization


In [None]:
# Show the hyperparameters of the current forest before tuning.
current_params = random_forest.get_params()
print('Parameters currently in use:\n')
pprint(current_params)

In [None]:
# Candidate hyperparameter values sampled by the randomised search below.
random_grid = dict(
    max_depth=[None],
    max_features=[3, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 5, 10],
    bootstrap=[False, True],
    n_estimators=[100, 500],
    criterion=["gini"],
)

pprint(random_grid)

In [None]:
%%time 
search_param_forest = RandomizedSearchCV(random_forest,random_grid,cv=5,verbose=3,n_jobs=-1,n_iter=5)
search_param_forest.fit(X_resampled,y_resampled)

In [None]:
# Mean cross-validated score of the best candidate found by the search.
search_param_forest.best_score_

In [None]:
# Hyperparameter combination of the best candidate.
search_param_forest.best_params_

In [None]:
# Estimator configured with the best hyperparameters found by the search.
search_param_forest.best_estimator_

In [None]:
# Forest with the tuned hyperparameters. Only the values that were part of
# the search (plus the seed) are spelled out; every other argument of the old
# call was a library default. `min_impurity_split` is gone on purpose: it was
# removed from scikit-learn in 1.0 and passing it raises a TypeError.
random_forest = RandomForestClassifier(bootstrap=False,
                       criterion='gini', max_depth=None, max_features=3,
                       min_samples_leaf=1, min_samples_split=5,
                       n_estimators=500, random_state=0)

random_forest.fit(X_resampled, y_resampled)
pred = random_forest.predict(X_test_power)

# NOTE(review): CV is run on already-oversampled data, so this score is likely
# optimistic compared with the test score.
cross_val = cross_val_score(random_forest,X_resampled, y_resampled,cv=5,scoring="accuracy")

print('Test MAE :\n', mean_absolute_error(y_test,pred))
print('Train accuracy score:\n',random_forest.score(X_resampled, y_resampled))
print('Test accuracy score:\n',random_forest.score(X_test_power,y_test))
print('Cross validation accuracy:\n',cross_val.mean())

In [None]:
# Per-class precision/recall/F1 of the tuned model on the test split.
print(classification_report(y_test,pred))

In [None]:
# Confusion matrix of the tuned model on the test split (helper from section 1.2).
confusion_matrix(random_forest,X_test_power,y_test)

# 6.0 Selecting most important variables for the model

From the correlation heatmap, we have the expectation of chosen variables below:

For the dependent variable, we can see that the most important variables for the model are likely to be:
- alcohol (0.48)
- sulphates (0.38)
- volatile acidity (-0.38)

There is also a chance that the variables below will be important if we are more lenient with the choices:
- citric acid (0.21)
- total sulfur dioxide (0.2)
- chlorides (-0.19)
- density (-0.18)


## 6.1 Model feature importance


In [None]:
# Raw importance value per feature, in the column order of the training data.
random_forest.feature_importances_

In [None]:
#The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance.

In [None]:
# Label the importances with the feature names and plot the five largest.
feat_importances = pd.Series(data=random_forest.feature_importances_, index=X.columns)
top_features = feat_importances.nlargest(5)
top_features.plot(kind='barh')
plt.show()

The expectation from the correlation map is confirmed by the model

# 7.0 Conclusion

- Scaling the variables using RobustScaler was ineffective
- Removing some of the outliers showed slight improvement in the model
- The data was not skewed enough after removing the more critical outliers to make yeo-johnson scaling show considerable model improvement.

Final procedure: remove most critical outliers -> apply yeo-johnson -> apply SMOTE


### Classification problem model improvement
- Train accuracy score: 1.0  -> 1.0 
- Test accuracy score: 0.7125 ->  0.682
- Cross validation accuracy: 0.652 -> 0.894

- The test score got worse while the cross-validation score got better due to SMOTE. However, because of the small number of members in the least represented classes, this method was ineffective, as discussed in section 4.4.

- The cross-validation score improved because of the artificial samples added to the least represented classes. The most likely reason the artificial data decreased test accuracy is that the original data points are not close to one another, so the artificial data points do not fall in a clustered area.

## Binary classification model improvement
- Train accuracy score: 1.0 -> 1.0
- Test accuracy score:  0.93125 -> 0.954
- Cross validation accuracy: 0.899 ->  0.9504
- SMOTE was effective in this case because the 'Good' class has more samples, and these samples are likely to lie closer to one another.



## Top 5 most important features:
- alcohol                 0.234330
- sulphates               0.164203
- volatile acidity        0.109635
- citric acid             0.088512
- density                 0.074837
