In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

* **`Importing the WINE DATA and Eyeballing the Data`**

In [None]:
# Location of the red-wine quality CSV inside the Kaggle input mount.
WINE_CSV_PATH = "/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"

# Load the raw dataset into the frame used by the rest of the notebook.
df_wine = pd.read_csv(WINE_CSV_PATH)

* An interesting alternative to regression modelling is to set an arbitrary cutoff for the dependent variable (wine quality), e.g. classifying wines rated 7 or higher as 'good/1' and the remainder as 'not good/0'.
This allows you to practise hyperparameter tuning on, e.g., decision tree algorithms, looking at the ROC curve and the AUC value.
Without doing any kind of feature engineering or overfitting you should be able to get an AUC of .88 (without even using a random forest algorithm).

In [None]:
# Print each column's dtype and non-null count for the loaded frame.
df_wine.info()

* **The output shows that every column has an appropriate datatype and that there are no null values**

`Let's check the min, max and median of the data`

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df_wine.describe()

* Looking at the summary, the mean and median (50%) of `density` are almost identical, which suggests that it is roughly symmetrically distributed

* We are converting the following problem into the Classification Problem 
    
    `Wine Quality>=7 is Good `
    
    `Wine Quality<7 is not Good`

* Lets Create a feature 

    `0 for the Wine quality<7` 

    `1 for the Wine quality>=7`


* Check the Value counts for the Quality in the dataframe

In [None]:
# Number of wines at each raw quality score.
df_wine["quality"].value_counts()

* **We are converting the problem into a classification problem**

    `Creating the Feature Wine_Cat by distinguishing into the quality`

In [None]:
# Binary target: 1 when quality is 7 or higher ("good"), otherwise 0 ("not good").
# A single vectorised comparison replaces the row-by-row Python loop and the
# intermediate list it had to build.
df_wine["wine_cat"] = (df_wine["quality"] >= 7).astype("int64")

* **` Check the value_counts for 0 and 1`** 

In [None]:
# Class balance of the derived binary target.
df_wine["wine_cat"].value_counts()

#### Lets create a full model on the Data using Classification Algorithm and checking the Accuracy metrics

* **Assigning df_Predictors and df_target**   

In [None]:
# The target is the binary label; the predictors are every other column except
# the raw quality score that the label was derived from.
df_target = df_wine["wine_cat"]
df_Predictors = df_wine.drop(columns=["quality", "wine_cat"])

* **Creating the train-test Split on the data and check for the shape of the Train test split**

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    df_Predictors,
    df_target,
    test_size=0.2,
    random_state=10,
)

# Report the shape of each partition.
train_shapes = (Xtrain.shape, ytrain.shape)
test_shapes = (Xtest.shape, ytest.shape)
print("Shape of Xtrain:{} and Shape of ytrain:{} ".format(*train_shapes))
print("Shape of Xtest:{} and Shape of ytest:{} ".format(*test_shapes))

### Logistic Regression 

In [None]:
# Fit a statsmodels logistic regression of the binary target on the training predictors.
# NOTE(review): Xtrain is passed as-is, with no sm.add_constant() column, and
# statsmodels does not add an intercept on its own - confirm that a model
# without a constant term is intended.
logit_regression=sm.Logit(ytrain,Xtrain).fit()
# Display the coefficient table and fit statistics.
logit_regression.summary()

* **Features with p-values less than 0.05 are considered significant, hence the features below are significant**

       

**`volatile acidity`
`chlorides`
`total sulfur dioxide`
`density`
`sulphates`
`alcohol`** 

**We got Pseudo Rsquared as 0.2871**

**Which seems to be good Fit**

In [None]:
# Build a score card of performance metrics for the logistic regression at several probability cutoffs

In [None]:
# One row per evaluated probability cutoff; filled in by update_score_card().
score_card = pd.DataFrame(columns=['Probability Cutoff', 'AUC Score', 'Precision Score', 'Recall Score',
                                   'Accuracy Score', 'Kappa Score', 'f1-score'])


def update_score_card(model, cutoff):
    """Evaluate `model` on the test split at `cutoff` and add a row to `score_card`.

    Parameters
    ----------
    model : object
        Fitted model whose ``predict(Xtest)`` returns one predicted
        probability per test row (e.g. a statsmodels Logit result).
    cutoff : float
        Probabilities below this value are labelled 0, all others 1.

    Returns
    -------
    None
        The module-level ``score_card`` frame is rebound with the new row
        appended; ``Xtest`` and ``ytest`` are read from module scope.
    """
    global score_card

    # Score the model that was passed in rather than a hard-coded global, so
    # the function can be reused for any probability-producing model.
    y_pred_prob = np.asarray(model.predict(Xtest))

    # Threshold the probabilities into hard 0/1 labels.
    y_pred = np.where(y_pred_prob < cutoff, 0, 1)

    # NOTE: every metric, including 'AUC Score', is computed from the
    # thresholded labels (not the raw probabilities), so it varies with cutoff.
    row = pd.DataFrame([{'Probability Cutoff': cutoff,
                         'AUC Score': metrics.roc_auc_score(ytest, y_pred),
                         'Precision Score': metrics.precision_score(ytest, y_pred),
                         'Recall Score': metrics.recall_score(ytest, y_pred),
                         'Accuracy Score': metrics.accuracy_score(ytest, y_pred),
                         'Kappa Score': metrics.cohen_kappa_score(ytest, y_pred),
                         'f1-score': metrics.f1_score(ytest, y_pred)}],
                       columns=score_card.columns)

    # DataFrame.append was removed in pandas 2.0, so use pd.concat instead.
    # The empty initial frame is replaced outright rather than concatenated,
    # which keeps the metric columns numeric.
    if score_card.empty:
        score_card = row
    else:
        score_card = pd.concat([score_card, row], ignore_index=True)

In [None]:

# Score the fitted logistic regression at each probability cutoff, in ascending order.
for cutoff in (0.2, 0.4, 0.6, 0.8):
    update_score_card(logit_regression, cutoff)

In [None]:
# Show the metrics collected so far, one row per cutoff evaluated.
score_card

In [None]:
# Predicted probabilities from the fitted logistic regression for the held-out test predictors.
y_pred_prob=logit_regression.predict(Xtest)


* **As we know that the Logistic Regression gives us the Probability we need to convert the Probabilities to `0 and 1`**

In [None]:
# Label a wine 1 only when its predicted probability is strictly above 0.5.
ypred = [int(prob > 0.5) for prob in y_pred_prob]


#### The Following is the confusion Matrix for Logistic Regression

In [None]:
from sklearn import metrics

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the figure can be read on its own.
logit_ax = sns.heatmap(
    metrics.confusion_matrix(ytest, ypred),
    annot=True,
    annot_kws={"size": 25},
    fmt="d",
)
logit_ax.set(
    xlabel="Predicted wine_cat",
    ylabel="Actual wine_cat",
    title="Confusion matrix - Logistic Regression (cutoff 0.5)",
)
plt.show()

**The list comprehension above converts the predicted probabilities to 0 and 1**

### Naive Bayes Classification
* Lets Use the Naive Bayes Classification and  check the  Accuracy

* We use `GaussianNB` from `sklearn.naive_bayes`

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split.
Naive_bayes = GaussianNB()
Naive_bayes_model = Naive_bayes.fit(Xtrain, ytrain)

# fit() returns the estimator itself, so predicting through the fitted handle
# uses the same object as `Naive_bayes`.
NBpred = Naive_bayes_model.predict(Xtest)

* Getting the confusion Matrix for the Bayesian Classification with the Help of Heatmap

In [None]:
from sklearn import metrics

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the figure can be read on its own.
nb_ax = sns.heatmap(
    metrics.confusion_matrix(ytest, NBpred),
    annot=True,
    annot_kws={"size": 25},
    fmt="d",
)
nb_ax.set(
    xlabel="Predicted wine_cat",
    ylabel="Actual wine_cat",
    title="Confusion matrix - Gaussian Naive Bayes",
)
plt.show()

### Random Forest Classification

* We use a Random Forest classifier with the hyperparameter `n_estimators=1000`, i.e. we want 1000 trees 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:

# Fixed seed (same value as the train/test split) so the forest, and therefore
# every number reported below, is reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=10)
modelrf = rf.fit(Xtrain, ytrain)
rf_predict = rf.predict(Xtest)

# A bare expression in the middle of a cell is never displayed, so print the
# accuracy explicitly.
print("Random forest accuracy:", metrics.accuracy_score(ytest, rf_predict))

# confusion_matrix puts the true classes on the rows and the predicted classes
# on the columns; label the axes so the figure can be read on its own.
rf_ax = sns.heatmap(
    metrics.confusion_matrix(ytest, rf_predict),
    annot=True,
    annot_kws={"size": 25},
    linewidths=0.2,
    fmt='d',
    cmap="viridis",
)
rf_ax.set(
    xlabel="Predicted wine_cat",
    ylabel="Actual wine_cat",
    title="Confusion matrix - Random Forest (1000 trees)",
)
plt.show()

### K-Nearest Neighbors
*  We use `n_neighbors=5`, so each prediction is based on the 5 nearest neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbours classifier on the training split and predict the
# held-out rows. fit() returns the estimator, so construction and fitting chain.
# NOTE(review): the predictors are passed unscaled - confirm that is intended
# for a distance-based model.
KNN = KNeighborsClassifier(n_neighbors=5).fit(Xtrain, ytrain)
Knn_predict = KNN.predict(Xtest)


In [None]:
# Fraction of test rows the K-Nearest Neighbors model classified correctly.
metrics.accuracy_score(ytest,Knn_predict)

In [None]:
def accuracy(ytest, pred):
    """Return the accuracy of `pred` against `ytest` as a percentage (0-100)."""
    fraction_correct = metrics.accuracy_score(ytest, pred)
    return fraction_correct * 100

    
# Assemble the per-model accuracy report one line at a time, then print it in a
# single call; joining with "\n" reproduces the original line breaks exactly.
report_lines = [
    "The Accuracy for the Logistic regression is {},".format(accuracy(ytest, ypred)),
    "The Accuracy for the 1000 Random forest is {} and ".format(accuracy(ytest, rf_predict)),
    "The Accuracy for the Bayesian classifier is {}".format(accuracy(ytest, NBpred)),
    " The Accuracy for the K-Nearest Neighbors is {}".format(accuracy(ytest, Knn_predict)),
]
print("\n".join(report_lines))

* We have created the models above without dropping any features. Now that they are built, the next step is to build the model using K-fold cross-validation.