# Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import pylab
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import binarize
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

# Loading the data

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(csv_path)
data.head(n=10)

# Exploratory data analysis

In [None]:
# Count how many rows fall into each value of the response variable.
quality_counts = data['quality'].value_counts()
quality_counts

In [None]:
# Convert the response variable to binary: scores above 6 become '1',
# everything else '0'. The labels are stored as strings.
def _to_binary_label(score):
    return '1' if score > 6 else '0'


data['Quality'] = data['quality'].apply(_to_binary_label)

In [None]:
# The original score column is replaced by the binary 'Quality' column.
data.drop(columns=['quality'], inplace=True)

In [None]:
# Preview the first ten rows after the target conversion.
data.head(n=10)

In [None]:
# Number of missing values per column.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Summary statistics of the columns.
summary_stats = data.describe()
summary_stats

# Checking for outliers

In [None]:
# Box plot of one feature to look for outliers visually.
sulfur_values = data['total sulfur dioxide']
plt.boxplot(x=sulfur_values)
plt.show()

In [None]:
# Per-column quartiles and inter-quartile range.
# numeric_only=True restricts the computation to numeric columns: the
# string-valued 'Quality' column cannot have a quantile, and pandas >= 2.0
# raises a TypeError for it instead of silently skipping it.
Q1 = data.quantile(0.25, numeric_only=True)
Q3 = data.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
print(IQR)

In [None]:
# Remove the outliers: a row is discarded when any of its values lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
below_fence = data < (Q1 - 1.5 * IQR)
above_fence = data > (Q3 + 1.5 * IQR)
has_outlier = (below_fence | above_fence).any(axis=1)
data = data[~has_outlier]
# Renumber the surviving rows from zero, discarding the old index.
data.reset_index(drop=True, inplace=True)

In [None]:
# Class balance of the binary target after outlier removal.
class_counts = data['Quality'].value_counts()
class_counts

In [None]:
# Share of rows labelled '1'. Computed from the data rather than from
# hand-copied counts, so it stays correct if the filtering result changes.
(data['Quality'] == '1').mean()

In [None]:
# Build a position -> column-name mapping for the first eleven columns.
# zip stops at the shorter sequence, so the extra position is ignored.
columns = list(data.columns[:11])
index = list(range(12))
dictc = {position: name for position, name in zip(index, columns)}
print(dictc)

# Splitting the data and standardizing the data

In [None]:
# Separate the predictors from the binary target.
features = data.drop(['Quality'], axis=1)
y = data['Quality']

# Split BEFORE scaling: the scaler must learn its mean/variance from the
# training rows only, otherwise information about the test rows leaks into
# the model inputs and the test metrics become optimistic.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
scaler.fit(x_train_raw)

# Re-wrap the scaled arrays as DataFrames, keeping the original column names
# and row labels so they stay aligned with y / y_train / y_test.
x_train = pd.DataFrame(scaler.transform(x_train_raw),
                       index=x_train_raw.index, columns=features.columns)
x_test = pd.DataFrame(scaler.transform(x_test_raw),
                      index=x_test_raw.index, columns=features.columns)
# Full standardized predictor matrix (same scaler), used when a model is
# fitted on all rows.
x = pd.DataFrame(scaler.transform(features),
                 index=features.index, columns=features.columns)

# Flatten the training target to a 1-D array.
y_train = np.ravel(y_train)

# Running a Naive Logistic regression test

In [None]:
# Fit a statsmodels logistic regression on all rows (with an intercept term)
# and print its summary table.
design_matrix = sm.add_constant(x.astype(float))
response = y.astype(float)
logit1 = sm.Logit(response, design_matrix).fit()
print(logit1.summary())

# Predicting the test and train sets

In [None]:
# Train the scikit-learn logistic regression on the training split and keep
# its class predictions for both splits (predict: test, predictt: train).
logit = LogisticRegression(solver='lbfgs')
logit = logit.fit(x_train, y_train)
predictt = logit.predict(x_train)
predict = logit.predict(x_test)

In [None]:
# Keep only column 1 of the predicted probabilities for the test split
# (the second entry of logit.classes_, expected to be '1').
test_probabilities = logit.predict_proba(x_test)
prob_test = test_probabilities[:, 1]

In [None]:
# Keep only column 1 of the predicted probabilities for the training split
# (the second entry of logit.classes_, expected to be '1').
train_probabilities = logit.predict_proba(x_train)
prob_train = train_probabilities[:, 1]

In [None]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=predict)

In [None]:
# Confusion matrix of true vs. predicted classes on the test split.
confusion_matrix(y_true=y_test, y_pred=predict)

In [None]:
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, predict)
print(report)

# Dropping all the variables with no significance

In [None]:
# Remove 'fixed acidity', one of the variables dropped as not significant.
data.drop(columns=['fixed acidity'], inplace=True)

In [None]:
# Remove 'residual sugar', one of the variables dropped as not significant.
data.drop(columns=['residual sugar'], inplace=True)

In [None]:
# Remove 'density', one of the variables dropped as not significant.
data.drop(columns=['density'], inplace=True)

In [None]:
# Remove 'pH', one of the variables dropped as not significant.
data.drop(columns=['pH'], inplace=True)

In [None]:
# Remove 'chlorides', one of the variables dropped as not significant.
data.drop(columns=['chlorides'], inplace=True)

In [None]:
# Remove 'citric acid', one of the variables dropped as not significant.
data.drop(columns=['citric acid'], inplace=True)

In [None]:
# Remove 'free sulfur dioxide', one of the variables dropped as not significant.
data.drop(columns=['free sulfur dioxide'], inplace=True)

In [None]:
# Rebuild the target and the predictor matrix from the remaining columns.
y = data['Quality']
x = data.drop(columns=['Quality'])


# Running the model after removing the non significant variables

In [None]:
# Refit the statsmodels logistic regression (with an intercept term) on the
# current x / y and print its summary table.
design_matrix = sm.add_constant(x.astype(float))
response = y.astype(float)
logit1 = sm.Logit(response, design_matrix).fit()
print(logit1.summary())

# Predicting the train and test sets with the new model

In [None]:
# 70/30 train/test split with a fixed seed, then show the size of each part.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts
tuple(len(part) for part in (x_test, x_train, y_train, y_test))

In [None]:
# Train the scikit-learn logistic regression on the training split and keep
# its class predictions for both splits (predict: test, predictt: train).
logit = LogisticRegression(solver='lbfgs')
logit = logit.fit(x_train, y_train)
predictt = logit.predict(x_train)
predict = logit.predict(x_test)

In [None]:
# Keep only column 1 of the predicted probabilities for the test split
# (the second entry of logit.classes_, expected to be '1').
test_probabilities = logit.predict_proba(x_test)
prob_test = test_probabilities[:, 1]

In [None]:
# Keep only column 1 of the predicted probabilities for the training split
# (the second entry of logit.classes_, expected to be '1').
train_probabilities = logit.predict_proba(x_train)
prob_train = train_probabilities[:, 1]

In [None]:
# Fraction of test rows whose predicted class matches the true class.
test_accuracy = accuracy_score(y_test, predict)
print("accuracy:", test_accuracy)

In [None]:
# Confusion matrix of true vs. predicted classes on the test split.
test_confusion = confusion_matrix(y_test, predict)
print("confusion matrix \n", test_confusion)

In [None]:
# Per-class precision / recall / F1 on the test split.
report = classification_report(y_test, predict)
print(report)

In [None]:
# ROC statistics on the training split.
# The labels are stored as the strings '0'/'1'; convert them once to numbers.
y_train_numeric = pd.to_numeric(y_train)
# AUC must be computed from the predicted probabilities, not from the hard
# class predictions: hard labels collapse the curve to a single operating
# point, and string predictions are not valid scores for roc_auc_score.
roc_auc_train = roc_auc_score(y_train_numeric, prob_train)
fpr, tpr, threshold = roc_curve(y_train_numeric, prob_train)
roc_auc = auc(fpr, tpr)

In [None]:
# Draw the training ROC curve against the diagonal reference line.
curve_label = 'ROC curve(area= %0.2f)' % (roc_auc)
plt.figure()
plt.plot(fpr, tpr, color='blue', label=curve_label)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title("Receiver Operating Characteristic : Train Data")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:
# ROC statistics on the test split.
# The labels are stored as the strings '0'/'1'; convert them once to numbers.
y_test_numeric = pd.to_numeric(y_test)
# AUC must be computed from the predicted probabilities, not from the hard
# class predictions: hard labels collapse the curve to a single operating
# point, and string predictions are not valid scores for roc_auc_score.
roc_auc_test = roc_auc_score(y_test_numeric, prob_test)
fpr1, tpr1, threshold1 = roc_curve(y_test_numeric, prob_test)
roc_auc1 = auc(fpr1, tpr1)

In [None]:
# Draw the test ROC curve against the diagonal reference line.
curve_label = 'ROC curve(area= %0.2f)' % (roc_auc1)
plt.figure()
plt.plot(fpr1, tpr1, color='blue', label=curve_label)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.title("Receiver Operating Characteristic : Test data")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

# Trying out different cut-off values

In [None]:
# Classify test rows as 1 when their predicted probability exceeds 0.25.
# `threshold` is keyword-only in current scikit-learn; passing it
# positionally raises a TypeError there.
y_predict = binarize(prob_test.reshape(1, -1), threshold=0.25)[0]
y_predict = y_predict.astype(int)

In [None]:
# Confusion matrix for the current cut-off; the string labels are converted
# to numbers so they are comparable with the integer predictions.
y_test_numeric = pd.to_numeric(y_test)
confusion_matrix(y_test_numeric, y_predict)

In [None]:
# Per-class precision / recall / F1 for the current cut-off.
y_test_numeric = pd.to_numeric(y_test)
cutoff_report = classification_report(y_test_numeric, y_predict)
print(cutoff_report)

In [None]:
# Classify test rows as 1 when their predicted probability exceeds 0.50.
# `threshold` is keyword-only in current scikit-learn; passing it
# positionally raises a TypeError there.
y_predict = binarize(prob_test.reshape(1, -1), threshold=0.50)[0]
y_predict = y_predict.astype(int)

In [None]:
# Confusion matrix for the current cut-off; the string labels are converted
# to numbers so they are comparable with the integer predictions.
y_test_numeric = pd.to_numeric(y_test)
confusion_matrix(y_test_numeric, y_predict)

In [None]:
# Per-class precision / recall / F1 for the current cut-off.
y_test_numeric = pd.to_numeric(y_test)
cutoff_report = classification_report(y_test_numeric, y_predict)
print(cutoff_report)

In [None]:
# Classify test rows as 1 when their predicted probability exceeds 0.75.
# `threshold` is keyword-only in current scikit-learn; passing it
# positionally raises a TypeError there.
y_predict = binarize(prob_test.reshape(1, -1), threshold=0.75)[0]
y_predict = y_predict.astype(int)

In [None]:
# Confusion matrix for the current cut-off; the string labels are converted
# to numbers so they are comparable with the integer predictions.
y_test_numeric = pd.to_numeric(y_test)
confusion_matrix(y_test_numeric, y_predict)

In [None]:
# Per-class precision / recall / F1 for the current cut-off.
y_test_numeric = pd.to_numeric(y_test)
cutoff_report = classification_report(y_test_numeric, y_predict)
print(cutoff_report)

# Finding the best cutoff value

In [None]:
# Tabulate the test ROC points together with 'tf' = tpr - (1 - fpr), then
# show the row where 'tf' is closest to zero, i.e. where the true positive
# rate and 1 - false positive rate are nearest to each other.
i = np.arange(len(tpr1))
roc_columns = {
    'fpr': fpr1,
    'tpr': tpr1,
    '1-fpr': 1 - fpr1,
    'tf': tpr1 - (1 - fpr1),
    'thresholds': threshold1,
}
roc = pd.DataFrame(roc_columns, index=i)
nearest_to_zero = roc['tf'].abs().argsort()[:1]
roc.iloc[nearest_to_zero]

In [None]:
# Plot the true positive rate and 1 - false positive rate against the row
# position in the `roc` table; the curves cross at the balanced cut-off.
plt.figure()  # the call parentheses were missing, so no new figure was opened
plt.plot(roc['tpr'], color='red', label='True Positive Rate')
plt.plot(roc['1-fpr'], color='blue', label='1-False Positive Rate')
# Both series are drawn against the table index, so the x axis is the
# threshold position and the y axis holds both rates.
plt.xlabel('Threshold index')
plt.ylabel('Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='best')
plt.show()

In [None]:
# Classify test rows as 1 when their predicted probability exceeds 0.16.
# `threshold` is keyword-only in current scikit-learn; passing it
# positionally raises a TypeError there.
y_predict = binarize(prob_test.reshape(1, -1), threshold=0.16)[0]
y_predict = y_predict.astype(int)

In [None]:
# Accuracy for the current cut-off, comparing numeric labels on both sides.
y_test_numeric = pd.to_numeric(y_test)
y_predict_numeric = pd.to_numeric(y_predict)
cutoff_accuracy = accuracy_score(y_test_numeric, y_predict_numeric)
print("accuracy:", cutoff_accuracy)

In [None]:
# Confusion matrix for the current cut-off; the string labels are converted
# to numbers so they are comparable with the integer predictions.
y_test_numeric = pd.to_numeric(y_test)
cutoff_confusion = confusion_matrix(y_test_numeric, y_predict)
print("confusion matrix:\n", cutoff_confusion)

In [None]:
# Per-class precision / recall / F1 for the current cut-off.
y_test_numeric = pd.to_numeric(y_test)
cutoff_report = classification_report(y_test_numeric, y_predict)
print(cutoff_report)