In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file under the Kaggle input mount.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the red-wine quality CSV inside the Kaggle input mount.
WINE_CSV = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Load the data set and preview the first rows.
df = pd.read_csv(WINE_CSV)
df.head()

### Basic EDA

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Distribution of the target: number of wines per quality score.
# The column is passed by keyword because seaborn >= 0.12 interprets the first
# positional argument as `data`, not as the `x` variable.
sns.countplot(x='quality', data=df);

In [None]:
# List the column names.
df.columns

### Quality based on alcohol content

In [None]:
# Overlay the alcohol distribution of each quality score on a single facet,
# one colour per score.
g = sns.FacetGrid(df, hue="quality", height=5).map(sns.distplot, "alcohol")
plt.legend();

### Quality based on pH value

In [None]:
# Overlay the pH distribution of each quality score on a single facet,
# one colour per score.
g = sns.FacetGrid(df, hue="quality", height=5).map(sns.distplot, "pH")
plt.legend();

### ANOVA method

In [None]:
from statsmodels.formula.api import ols      # For calculation of Ordinary least squares for ANOVA
from statsmodels.stats.anova import _get_covariance,anova_lm # For n-way ANOVA
from statsmodels.stats.multicomp import pairwise_tukeyhsd # For performing the Tukey-HSD test
from statsmodels.stats.multicomp import MultiComparison # To compare the levels  independent variables with the 
import scipy.stats as stats 

In [None]:
# Columns to stack into long format (every measurement plus the quality score).
melt_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

# Reshape to long format: one row per (original row index, column name, value).
df_melt = pd.melt(df.reset_index(), id_vars=['index'], value_vars=melt_columns)

In [None]:
# Preview the long-format frame produced by pd.melt.
df_melt.head()

In [None]:
# Distinct entries of the melted 'variable' column (the original column names).
df_melt['variable'].unique()

In [None]:
# Rename the melted 'variable' column to 'treatments' for use in the ANOVA formula;
# 'index' and 'value' keep their names.
df_melt = df_melt.rename(columns={'variable': 'treatments'})

### Fit the OLS (ordinary least squares) model

In [None]:
# One-way ANOVA of the melted values across the 'treatments' factor.
# Type 1, 2 and 3 sums of squares yield the same result when the data is balanced.
anova_formula = 'value ~ C(treatments)'
model = ols(anova_formula, data=df_melt).fit()
anova_table = anova_lm(model, typ=2)
anova_table

The p-value (`PR(>F)`) is less than 0.05, so we reject the null hypothesis that all group means are equal: at least one group mean differs, and the result is statistically significant.

In [None]:
### lets check for quality and alcohol

In [None]:
# One-way ANOVA: does mean alcohol differ between quality scores?
formula = 'alcohol ~ C(quality)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model)
aov_table

In [None]:
# One-way ANOVA: does mean pH differ between quality scores?
formula = 'pH ~ C(quality)'
model = ols(formula=formula, data=df).fit()
aov_table = anova_lm(model)
aov_table

In [None]:
# Mean alcohol (green) and mean pH (red) per quality score, with 95% confidence intervals.
# `ci` is expressed as a percentage (0-100); the previous value 0.95 requested a
# 0.95% interval, which collapses the error bars to almost nothing.
sns.pointplot(x='quality', y='alcohol', data=df, ci=95, color='g');
sns.pointplot(x='quality', y='pH', data=df, ci=95, color='r');

In [None]:
# Tukey HSD: pairwise comparison of mean pH between quality groups at alpha = 0.05.
# This tests whether group means differ (association); it does not establish causation.
mc = MultiComparison(df['pH'], df['quality'])
mc_results = mc.tukeyhsd(alpha=0.05)
print(mc_results)

In [None]:
# Tukey HSD: pairwise comparison of mean alcohol between quality groups at alpha = 0.05.
# This tests whether group means differ (association); it does not establish causation.
mc = MultiComparison(df['alcohol'], df['quality'])
mc_results = mc.tukeyhsd(alpha=0.05)
print(mc_results)

*According to the Tukey HSD test, certain pairs of quality groups have unequal means for both alcohol and pH.
The same comparison can be repeated for each remaining column with respect to quality.*

## Linear Regression Assumptions

## Thumb rules to help interpret goodness of fit in Regression model
1. R-sq / Adj R-sq show goodness of fit. A higher value (range 0-1) is more favorable.
2. Prob (F-statistic): if it is less than alpha, reject the null hypothesis that all regression coefficients are zero.
3. Log-Likelihood: goodness of fit (the higher the better when comparing multiple models).
4. AIC (Akaike's Information Criterion), BIC (Bayesian Information Criterion): goodness of fit (the lower the better when comparing multiple models).

## Assumptions tested
1. Omnibus/Prob(Omnibus) – a test of the skewness and kurtosis of the residuals. Omnibus is preferably close to zero and Prob(Omnibus) preferably close to 1.
2. Skew – a measure of data symmetry. We want to see something close to zero, which is consistent with a normal residual distribution.
3. Kurtosis – a measure of "peakiness" and tail weight of the residual distribution. A normal distribution has a kurtosis of about 3 on the scale reported in the statsmodels summary; values well above that indicate heavier tails, i.e. more extreme residuals (outliers) than a normal distribution would produce.
4. Durbin-Watson – tests for autocorrelation of the residuals. We hope to have a value between 1.5 and 2.5.
5. Jarque-Bera (JB)/Prob(JB) – like the Omnibus test in that it tests both skew and kurtosis.
6. Condition Number – measures the sensitivity of a function's output to changes in its input. When we have multicollinearity, we can expect much larger fluctuations in response to small changes in the data; hence, we hope to see a relatively small number.

In [None]:
import statsmodels.api as sm

In [None]:
# Predictors: every column except the target.
X = df.drop('quality',axis=1)
# Target: the wine quality score.
y = df['quality']

In [None]:
# sm.OLS does not add an intercept on its own. Without one the regression is forced
# through the origin and statsmodels reports the *uncentered* R-squared, which is
# not comparable to the usual R-squared and greatly overstates the fit.
# NOTE: the R-squared figures quoted in the "Model Evaluation" text (0.987) came from
# the no-intercept fit; re-read them from model.summary() after this change.
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const).fit()

# In-sample fitted values, used by the diagnostics and error metrics below.
predictions = model.predict(X_const)

In [None]:
# Print the regression summary table for the fitted model.
print(model.summary())

In [None]:
from scipy.stats import pearsonr
from statsmodels.compat import lzip
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Pairwise Pearson correlation between all columns.
df.corr(method='pearson')

In [None]:
# Annotated heatmap of the Pearson correlation matrix on an explicit 15x6 axes.
fig, ax = plt.subplots(figsize=(15, 6))
sns.heatmap(df.corr(method='pearson'), annot=True, ax=ax);

### Variance Inflation Factor

In [None]:
# VIF_i = 1 / (1 - R_i^2), where R_i^2 comes from regressing predictor i on all
# the other predictors.
# variance_inflation_factor does not add an intercept to that auxiliary regression,
# so a constant column is added to the design matrix first; without it the reported
# VIFs are inflated. The constant itself is not listed in the result.
vif_design = sm.add_constant(X)
vif = pd.DataFrame()
vif["VIF Factor"] = [
    variance_inflation_factor(vif_design.values, vif_design.columns.get_loc(col))
    for col in X.columns
]
vif["features"] = X.columns

In [None]:
# Display the VIF table (one row per predictor).
vif
# NOTE(review): the original remark here was cut off mid-sentence ("All VIF values are
# different, if two or more than two values have same VIF or variability the ...");
# its intended conclusion is unknown - TODO confirm and complete.

In [None]:
# Residuals against fitted values, to eyeball heteroscedasticity.
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts the data
# vectors positionally.
sns.residplot(x=predictions, y=y - predictions)
plt.xlabel('Fitted values')
plt.ylabel('Residuals');

A roughly horizontal band of residuals with constant spread around zero indicates homoscedasticity.

### The Goldfeld-Quandt test checks for homoscedasticity

In [None]:
# Goldfeld-Quandt test applied to the residuals; the null hypothesis is homoscedasticity.
name = ['GQ', 'p-value']
test = sms.het_goldfeldquandt(y-predictions,X)
# Pair each label with the corresponding leading element of the test result.
lzip(name, test)
# Interpretation recorded by the author: the null hypothesis was not rejected,
# i.e. no evidence of heteroscedasticity. Re-check against the p-value shown above.

### Normal Distribution of the Error Term

In [None]:
from scipy.stats import shapiro

# Shapiro-Wilk test on the signed residuals.
# H0: the residuals are normally distributed. A p-value below 0.05 rejects H0
# (evidence of non-normality); a larger p-value means normality cannot be rejected.
# The residuals must not be passed through np.abs(): folding them onto the positive
# axis yields a half-normal-like sample and invalidates the test.
shapiro(y - predictions)

In [None]:
# Q-Q plot of the model residuals; fit=True lets qqplot fit the distribution's
# parameters to the data, and line='45' draws a 45-degree reference line.
res = model.resid
fig = sm.qqplot(res,fit=True,line='45')
plt.show()
# The red line is the reference line for a normal distribution.
# The blue dots are the residuals (error terms).

## Model Evaluation

#### R Square/Adjusted R Square

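* Read R-squared and Adj. R-squared from the `model.summary()` output above.
* An OLS fit without an intercept (`sm.OLS(y, X)` with no constant column) reports R-squared: 0.987 and Adj. R-squared: 0.987 for this data.
* That figure is the *uncentered* R-squared, so it does not mean that 98.7% of the variability in `quality` is explained; it is not comparable to the usual R-squared and greatly overstates the goodness of fit.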


### Mean Square Error(MSE)/Root Mean Square Error(RMSE)

In [None]:
from sklearn.metrics import mean_squared_error
import math

# Compute the in-sample MSE once and derive the RMSE from it.
mse = mean_squared_error(y, predictions)
print('MSE', mse)
print('RMSE', math.sqrt(mse))

### Mean Absolute Error (MAE)

In [None]:
from sklearn.metrics import mean_absolute_error

# In-sample mean absolute error of the fitted values.
mae = mean_absolute_error(y, predictions)
print('MAE', mae)