**Import Necessary Libraries**

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import seaborn as sns

**Read & Load the file**

In [None]:
# Load the red-wine quality CSV and rename the one column that is later
# accessed as an attribute (`Data.residual_sugar`).
WINE_CSV = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
Data = pd.read_csv(WINE_CSV).rename(columns={'residual sugar': 'residual_sugar'})

View Data 

In [None]:
# Preview the first rows of the loaded frame.
Data.head()

Check attributes

In [None]:
# Column names, non-null counts and dtypes of the loaded frame.
Data.info()

In [None]:
# Report the frame's dimensions and its total number of cells.
shape_of_data = Data.shape
cell_count = Data.size
print("The Number Of Rows and Columns:", shape_of_data)
print("Data Size:", cell_count)

Check for Null values

In [None]:
# Number of missing values in each column.
Data.isna().sum()

Five-point summary

In [None]:
# Summary statistics, transposed so that each feature is one row.
Data.describe().transpose()

Let's see the range of quality

In [None]:
# Pass the column explicitly as `x`: newer seaborn releases no longer accept
# the plotted vector as the first positional argument (it is deprecated or
# interpreted as `data`), so the positional form does not reliably count
# the quality levels.
sns.countplot(x=Data['quality']);

In [None]:
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its maintained replacement.
sns.histplot(x=Data['quality'], kde=True, stat="density");

**Let's see the correlation between attributes**

In [None]:
def plot_corr(Data, size=15):
    """Draw the pairwise correlation matrix of ``Data`` as a colour grid.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame whose numeric columns are correlated with each other;
        non-numeric columns are ignored.
    size : int or float, default 15
        Width and height of the (square) figure, in inches.
    """
    # Restrict to numeric columns so the helper keeps working after a text
    # column such as `quality_change` has been added to the frame.
    corr = Data.select_dtypes(include="number").corr()
    labels = corr.columns
    positions = range(len(labels))

    fig, ax = plt.subplots(figsize=(size, size))
    grid = ax.matshow(corr)
    # Colour scale so the cell colours can be read as correlation values.
    fig.colorbar(grid, ax=ax)

    # Rotate the column labels; the long feature names overlap when horizontal.
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(labels)

In [None]:
# Correlation grid for the whole frame at the helper's default figure size.
plot_corr(Data, size=15)

BOX PLOTS to see the outliers

In [None]:
# One small box plot per feature to eyeball outliers. Looping over the column
# names replaces five copy-pasted figure/boxplot pairs, and each plot is drawn
# on its own explicitly created axes.
for feature in ['fixed acidity', 'pH', 'alcohol', 'sulphates', 'citric acid']:
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.boxplot(x=Data[feature], color='orange', ax=ax)

In [None]:
# Skewness of each numeric feature. `numeric_only=True` keeps the cell
# re-runnable after the text column `quality_change` has been added.
Data.skew(numeric_only=True)

> Adding a column to classify wine quality which can be later used for Logistic Regression

In [None]:
def func(row):
    """Label one wine from its quality score.

    Returns "Good" when ``row["quality"]`` is greater than 6.5 and
    "Bad" otherwise.
    """
    return "Good" if row["quality"] > 6.5 else "Bad"
# Label every wine as "Good" or "Bad" from its quality score.
Data["quality_change"]=Data.apply(func,axis=1)
# Share of wines per class. Count rows instead of summing the quality scores:
# summing weights each wine by its score, so the slices would not be class shares.
Data.groupby('quality_change')['quality'].count().plot.pie(autopct='%1.2f%%');

# Let's see the significance of alcohol in the quality of the wine

In [None]:
# pH against alcohol content, coloured by the Good/Bad label.
sns.scatterplot(data=Data, x='pH', y='alcohol', hue='quality_change');

In [None]:
import scipy.stats as stats

# Two-sample t-test on alcohol content: "Good" wines versus "Bad" wines.
# H0 is the no-difference hypothesis; it is rejected in favour of Ha when the
# p-value falls below the 0.05 significance level.
H0="Alcohol does NOT have any significant impact on the quality of wine, as the P_value is greater than 0.05 :"
Ha="Alcohol does have an impact on the quality of wine as the P_value is less than 0.05 :"
Good_quality_Wine_OH=np.array(Data[Data.quality_change =='Good'].alcohol)
Bad_quality_wine_OH=np.array(Data[Data.quality_change =='Bad'].alcohol)
t, p_value  = stats.ttest_ind(Good_quality_Wine_OH,Bad_quality_wine_OH,axis=0)
# Compare against 0.05 (the level the messages quote), not 0.5.
if p_value < 0.05:
    print(Ha,format(p_value))
else:
    print(H0,format(p_value))

In [None]:
# One-way ANOVA on residual sugar: "Good" wines versus "Bad" wines.
# Ho is the no-difference hypothesis; a p-value below 0.05 rejects it, which
# is evidence that residual sugar DOES differ between the two classes.
Ho="Residual Sugars do not have a significant role in the quality of wine"
Ha="Residual Sugars have a significant role in the quality of wine"

Good_Quality_Wine_Sugar=np.array(Data[Data.quality_change=='Good'].residual_sugar)
Bad_Quality_Wine_Sugar=np.array(Data[Data.quality_change=='Bad'].residual_sugar)

f_stat,p_value=stats.f_oneway(Good_Quality_Wine_Sugar,Bad_Quality_Wine_Sugar)
if p_value < 0.05:
    print(Ha,"since P_value is less than 0.05 with a value {}:".format(p_value))
else:
    print(Ho,"since p_value is greater than 0.05 with a value of:{}".format(p_value))

# Alcohol against quality score, coloured by residual sugar.
sns.scatterplot(x=Data['alcohol'],y=Data['quality'],hue=Data['residual_sugar']);

> **Let's do a linear regression to see how quality depends on the other variables. For this exercise, our variable of interest is quality, and the independent variables are all the remaining attributes.**

In [None]:
# Regression frame: the original columns without the derived Good/Bad label.
Data_For_Linear = Data.drop(columns=['quality_change'])
Data_For_Linear.head()

In [None]:
# Predictors are every column except the score; the score itself is the target.
X_Linear = Data_For_Linear.drop(columns=['quality'])
y_Linear = Data_For_Linear.loc[:, 'quality']

In [None]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
linear_splits = train_test_split(X_Linear, y_Linear, test_size=0.3, random_state=1)
X_Linear_train, X_Linear_test, y_Linear_train, y_Linear_test = linear_splits

In [None]:
from sklearn.linear_model import LinearRegression
# Unfitted least-squares linear regressor with scikit-learn's default settings.
qual_linear=LinearRegression()

In [None]:
# Fit the linear model on the training split only.
qual_linear.fit(X_Linear_train,y_Linear_train)

In [None]:
# Print each fitted coefficient next to the feature it belongs to.
for col_name, weight in zip(X_Linear_train.columns, qual_linear.coef_):
    print("The coefficient for {} is {}".format(col_name, weight))

In [None]:
# Fitted intercept term of the linear model.
qual_linear.intercept_

In [None]:
# Predicted quality scores for the held-out test rows.
Linear_Pred=qual_linear.predict(X_Linear_test)

In [None]:
# Side-by-side table of true and predicted quality; the first 25 rows are
# kept for plotting.
df = pd.DataFrame(
    {
        "Actual": y_Linear_test,
        "Predicted": Linear_Pred,
    }
)
Top_25 = df.iloc[:25]

In [None]:
# Grouped bars of actual vs. predicted quality for the first 25 test rows,
# with separately styled major and minor grid lines.
ax = Top_25.plot(kind='bar', figsize=(15, 10))
ax.grid(which='major', linestyle='-', linewidth='0.5', color='green')
ax.grid(which='minor', linestyle=':', linewidth='0.5', color='black')

**As we can see, our linear model's predictions are close to the actual values, but they are not very accurate.**

In [None]:
# score() on the training split (R^2 for scikit-learn regressors).
qual_linear.score(X_Linear_train,y_Linear_train)

In [None]:
# score() on the held-out split (R^2 for scikit-learn regressors).
qual_linear.score(X_Linear_test,y_Linear_test)

In [None]:
from sklearn import metrics

# Error metrics of the linear model on the held-out split; the MSE is computed
# once and reused for the root-mean-squared error.
mae = metrics.mean_absolute_error(y_Linear_test, Linear_Pred)
mse = metrics.mean_squared_error(y_Linear_test, Linear_Pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

**Now, let's perform logistic regression on the same dataset. This time the goal is to predict, from all the given parameters, whether a wine sample is Good or Bad. Our variable of interest is therefore "quality_change", which I added to the dataset based on the quality column of the original data. I am dropping the quality column here: the target was derived directly from it, so it is not needed as a predictor and keeping it would give the answer away.**

In [None]:
# Classification frame: the same data minus the numeric score that the
# Good/Bad label was built from. `drop` returns a new frame, so `Data`
# itself is left unchanged.
Data_for_Logistic = Data.drop(columns=['quality'])
Data_for_Logistic.head()

Splitting our data using train_test_split

In [None]:
# Predictors are every column except the label; the label is the target.
X = Data_for_Logistic.drop(columns=['quality_change'])
y = Data_for_Logistic.loc[:, 'quality_change']

In [None]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
logistic_splits = train_test_split(X, y, test_size=0.3, random_state=1)
X_Train, X_test, y_Train, y_test = logistic_splits

In [None]:
# Peek at the first held-out feature rows.
X_test.head()

In [None]:
# Logistic-regression classifier using the liblinear solver.
LOG_REG=LogisticRegression(solver="liblinear")
# Fit on the training split only; the fitted estimator is the cell's output.
LOG_REG.fit(X_Train,y_Train)


In [None]:
# Print each fitted coefficient next to the feature it belongs to
# (`coef_` holds a single row for this two-class problem).
for col_name, weight in zip(X_Train.columns, LOG_REG.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, weight))

In [None]:
# Fitted intercept term of the logistic model.
LOG_REG.intercept_

**Lets calculate the accuracy score for our model**

In [None]:
# Predicted Good/Bad labels for the held-out rows.
y_pred_log=LOG_REG.predict(X_test)

from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred). Accuracy happens to be symmetric,
# but this order matches the API and the confusion-matrix call in this notebook.
print(accuracy_score(y_test, y_pred_log)*100)

> **Though we have an accuracy score of 88.54%, in a classification problem it is better to also look at other metrics such as recall, precision, F1 score and ROC to ensure our model is good.**

Let's create the confusion matrix 

In [None]:
from sklearn import metrics

# Confusion matrix with the classes ordered "Good" first, then "Bad".
class_order = ["Good", "Bad"]
cm = metrics.confusion_matrix(y_test, y_pred_log, labels=class_order)
cm

# **From our confusion matrix what we infer :**
* True Positives: 15 (wines predicted to be good quality that really were good)
* True Negatives: 410 (wines predicted to be bad quality that really were bad)
* False Positives: 40 (wines predicted to be good quality that were really bad, i.e. the type 1 error)
* False Negatives: 15 (wines predicted to be bad quality that were really good, i.e. the type 2 error)


In [None]:
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error

# Per-class precision, recall and F1 for the logistic model on the held-out split.
report_text = classification_report(y_test, y_pred_log)
print(report_text)

Planning to further add the ROC curve, FPR, TPR, AUC, etc. Please share your thoughts and valuable comments.