In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'darkgrid' theme for every figure drawn below.
sns.set_style('darkgrid')
import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation and convergence warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input
# directory, so the dataset location can be copied into the load step.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load Data

In [None]:
# Read the red-wine quality table from the Kaggle input directory.
WINE_CSV = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(WINE_CSV)

# Show the first rows as a quick check that the file parsed as expected.
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Per-column summary statistics (pandas' default `describe()` output).
df.describe()

# Exploratory Data Analysis

In [None]:
# Number of wines per raw quality score.
# The column is passed by keyword: seaborn >= 0.12 treats the first positional
# argument as `data`, so `sns.countplot(df['quality'])` no longer means `x=`.
sns.countplot(x='quality', data=df)

In [None]:
# Distribution of every column, one panel per column, two panels per row.
# The number of rows is derived from the column count instead of assuming a
# fixed 6x2 grid, so the cell keeps working if the frame gains a column.
n_grid_cols = 2
n_grid_rows = (len(df.columns) + n_grid_cols - 1) // n_grid_cols  # ceiling division

plt.figure(figsize=(15, 20))
for i, col in enumerate(df.columns, start=1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    # `distplot` is deprecated; a density-normalised histogram with a KDE
    # overlay is the replacement recommended by seaborn.
    sns.histplot(x=df[col], kde=True, stat='density', kde_kws=dict(cut=3))
    

In [None]:
# Mean of every column grouped by the raw quality score, drawn as a 6x2 grid
# of bar plots (one panel per column).
plt.figure(figsize=(15, 20))
for panel, col in enumerate(df.columns, start=1):
    plt.subplot(6, 2, panel)
    sns.barplot(x=df['quality'], y=df[col])

In [None]:
# Pairwise relationships between all columns, coloured by the raw quality score.
sns.pairplot(df, hue='quality')

# Dividing the Dataset
* Since the dataset is small, we group the wines into two classes: good and bad.
* All wines with a quality of 7 or above are good; the rest are bad.

In [None]:
# Binary target: 1 when the raw score is above 6 (i.e. 7 or more), else 0.
df['Quality'] = (df['quality'] > 6).astype('int64')
df.head()

In [None]:
# Class balance of the binary target.
# Passed by keyword: seaborn >= 0.12 treats the first positional argument as
# `data`, not `x`.
sns.countplot(x='Quality', data=df)

In [None]:
# Drop the raw score now that the binary target exists.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
df.drop(columns='quality', inplace=True, errors='ignore')

# Class counts of the binary target. This must be the LAST expression of the
# cell, otherwise the notebook computes it and then silently discards it.
df['Quality'].value_counts()

# Oversampling
The classes are still imbalanced, so we use scikit-learn's `resample` utility to oversample the minority class, i.e. to draw additional copies of existing rows with replacement.

In [None]:
from sklearn.utils import resample, shuffle

# Split the rows by class (0 = bad, 1 = good).
zero = df[df['Quality'] == 0]
ones = df[df['Quality'] == 1]

# Draw 'good' rows with replacement until there are as many as 'bad' rows.
# random_state pins the draw so that a fresh run reproduces the same frame.
upsampled = resample(ones, replace=True, n_samples=zero.shape[0], random_state=42)

# Recombine both classes and shuffle the row order (seeded for the same reason).
# NOTE: df_new contains repeated rows, so any random split of df_new can place
# copies of the same wine on both sides of the split.
df_new = pd.concat([zero, upsampled])
df_new = shuffle(df_new, random_state=42)

In [None]:
# Class balance after oversampling.
# Passed by keyword: seaborn >= 0.12 treats the first positional argument as
# `data`, not `x`.
sns.countplot(x='Quality', data=df_new)

In [None]:
# Pairwise relationships in the oversampled frame, coloured by the binary target.
sns.pairplot(df_new, hue='Quality')

# Train/Test Split and Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

# Split the ORIGINAL rows before any oversampling. Splitting the oversampled
# frame instead would put copies of the same wine in both train and test and
# inflate every test score. The split is stratified on the target and seeded
# so the class ratio is preserved and the result is reproducible.
train_rows, test_rows = train_test_split(
    df, test_size=0.2, stratify=df['Quality'], random_state=42
)

# Balance the training portion only: resample the 'good' class with
# replacement up to the size of the 'bad' class, then shuffle the rows.
train_bad = train_rows[train_rows['Quality'] == 0]
train_good = train_rows[train_rows['Quality'] == 1]
train_good_up = resample(
    train_good, replace=True, n_samples=len(train_bad), random_state=42
)
train_balanced = pd.concat([train_bad, train_good_up]).sample(frac=1, random_state=42)

X_train = train_balanced.drop('Quality', axis=1)
y_train = train_balanced['Quality']

# The test set keeps the real (imbalanced) class distribution and never
# contains a row that was also used for training.
X_test = test_rows.drop('Quality', axis=1)
y_test = test_rows['Quality']

# X / y keep their names because later cells read them (feature names,
# cross-validation, grid search). They hold the unscaled training rows only,
# so the held-out test rows are never used for model selection.
X = X_train.copy()
y = y_train.copy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training features
# only and then re-used, unchanged, on the test features.
ss = StandardScaler()

# Wrap the scaled arrays back into DataFrames so the column names survive.
X_train = pd.DataFrame(ss.fit_transform(X_train.values), columns=X.columns)
X_test = pd.DataFrame(ss.transform(X_test.values), columns=X.columns)

X_train.head()

In [None]:
# Preview the scaled test features.
X_test.head()

In [None]:
# Annotated correlation matrix of `df`, drawn on an explicit 12x12 inch axes.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)

# Training our Models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [None]:
# Candidate classifiers keyed by display name.
# Every estimator with a stochastic fit gets a fixed random_state so the
# comparison is reproducible and matches the seeded forest used for tuning.
models = {
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GaussianNB': GaussianNB(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'SVC': SVC(),
}

In [None]:
# Fit each candidate on the training split and record its accuracy on both
# the training and the test split (same order as `models`).
training_scores = []
testing_scores = []

for name, model in models.items():
    model.fit(X_train, y_train)

    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    training_scores.append(train_acc)
    testing_scores.append(test_acc)

    print(f"{name}\n")
    print(f"Training Score: {train_acc}" )
    print(f"Testing Score: {test_acc} \n")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Mean 5-fold cross-validation accuracy per candidate (same order as `models`).
cv_scores = []

for name, model in models.items():
    # X is unscaled, while the models are otherwise trained and compared on
    # standardised features. Putting the scaler in a pipeline standardises
    # inside every fold (fitted on that fold's training part only), which keeps
    # these scores comparable and avoids leaking fold statistics.
    # cross_val_score clones the pipeline, so the fitted `model` is untouched.
    scaled_model = make_pipeline(StandardScaler(), model)
    mean_cv = cross_val_score(scaled_model, X, y, cv=5).mean()

    cv_scores.append(mean_cv)
    print(f"{name}\n")
    print(f"CV Score: {mean_cv} \n" )

# Hyperparameter Tuning
RandomForestClassifier seems to be the best model for this data, so we tune its hyperparameters with a grid search.

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Baseline before tuning: a seeded random forest with default hyperparameters,
# fitted on the training split and evaluated on the test split.
rfc = RandomForestClassifier(random_state=42)
y_pred = rfc.fit(X_train, y_train).predict(X_test)

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the random forest: 3 * 4 * 3 * 3 = 108 candidates.
# NOTE(review): max_depth values of 50/100/150 are likely equivalent on a
# dataset this small — confirm and prune to shorten the search.
params = {
    'n_estimators': [10, 100, 200],
    'max_depth': [10, 50, 100, 150],
    'min_weight_fraction_leaf': [0, 0.1, 0.01],
    'ccp_alpha': [0, 0.01, 0.1],
}

# 5-fold CV over 108 candidates is 540 fits; n_jobs=-1 runs them on all
# available cores. The selected parameters are the same as in a serial run.
grid = GridSearchCV(rfc, param_grid=params, cv=5, verbose=1, n_jobs=-1)

In [None]:
# Tune on the scaled training split only. The final model is trained on
# X_train and scored on X_test, so the search must use the same scaled
# features and must not see the rows that are held out for testing.
grid.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score.
grid.best_params_

# Final Results

In [None]:
# Refit the best estimator found by the grid search on the training split and
# report its confusion matrix and per-class metrics on the test split.
best = grid.best_estimator_
y_pred = best.fit(X_train, y_train).predict(X_test)

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

# Upvote and Comment if you liked my notebook :)