# The analysis contains three parts: 
# 1. data preparation
# 2. feature and model selection
# 3. model finishing

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input/'):
    for file_path in (os.path.join(dirname, entry) for entry in filenames):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the red-wine quality CSV into a DataFrame (path relative to the working directory).
df = pd.read_csv(
    '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the data set, this time through the absolute Kaggle input path.
df = pd.read_csv(
    '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

# Preview the first rows and report the shape of the frame.
first_rows = df.head()
print(first_rows)
print('Data frame shape', df.shape)

# Bar chart: number of samples per value of the 'quality' column.
quality_column = df['quality']
sns.countplot(x=quality_column)

----------------------------------------------------------------------------------------------------
# The duplicated rows are cleaned in this part.

In [None]:
# Size of the data set before any row is removed.
shape_before = df.shape
print('Data frame shape before duplicate cleaning', shape_before)

# Boolean mask marking duplicated rows; report whether any were found.
dups = df.duplicated()
found_duplicates = dups.any()
print('\nany duplicates:', found_duplicates)

# Remove the duplicated rows from df in place.
df.drop_duplicates(inplace=True)

shape_after = df.shape
print('Data frame shape after duplicate cleaning', shape_after)


----------------------------------------------------------------------------------------------------
# A normality test shows that the data are not normally distributed, so a transformation is needed.

In [None]:
from scipy.stats import normaltest

print(df.shape)
# Run scipy's normaltest on every column and report the verdict at the 5% level.
for column in df.columns.values:
    stat, p = normaltest(df[column])
    print(column)
    print('Statistics=%.3f, p=%.3f' % (stat, p))
    # H0 is "the sample is Gaussian"; it is kept only when p exceeds alpha.
    alpha = 0.05
    verdict = (
        'Sample looks Gaussian (fail to reject H0)'
        if p > alpha
        else 'Sample does not look Gaussian (reject H0)'
    )
    print(verdict)

In [None]:
# Number of distinct values per column.
# The Series is passed straight to print(): print() returns None, so wrapping
# it in a nested print() would show "None" after the label instead of the counts.
print('\nnumber of unique values in each column:', df.nunique(), sep='\n')

# Per column: distinct-value count and that count as a percentage of the row count.
n_rows = df.shape[0]
for ix in df.columns.values:
    num = len(np.unique(df[ix]))
    percentage = float(num) / n_rows * 100
    print('{}, {}, {}%'.format(ix, num, percentage))

----------------------------------------------------------------------------------------------------
# Graphic of the raw data distribution.

In [None]:
# Histogram of every column except the last one (the last column is split off
# as the output y further down in the notebook).
plt.figure(figsize=(10, 6))
dfg = df.values[:, :-1]
feature_names = df.columns[:-1]
for ix in range(dfg.shape[1]):
    # 3x4 layout: room for at most 12 panels.
    plt.subplot(3, 4, ix + 1)
    plt.hist(dfg[:, ix], bins=25)
    # Label the panel so each histogram can be matched to its column.
    plt.title(feature_names[ix])
    # plt.grid acts on the current axes only, so it is called once per panel.
    plt.grid(True)
plt.tight_layout()
plt.show()

----------------------------------------------------------------------------------------------------
# A test for normality is performed with statsmodels, and the results agree with the test performed with scipy.

In [None]:
import statsmodels.stats.diagnostic as ssd

#### NORMALITY TEST
# kstest_normal is applied to each column; element 1 of its result is printed as the p-value.
print(df.shape)
for column in df.columns.values:
    column_values = df[column].values
    p_val = ssd.kstest_normal(column_values)
    p_value = p_val[1]
    print('p-value for {} = {:.4f}'.format(column, p_value))

Preparation of the input and output.

In [None]:
# Convert the frame to a 2-D array, then split it:
# every column but the last becomes the input X, the last column the output y.
data = df.values
X, y = data[:, :-1], data[:, -1]

# Show the first two samples of each as a quick sanity check.
print('X', X[:2, :])
print('y', y[:2])

----------------------------------------------------------------------------------------------------
# Outliers are removed with the IsolationForest method.

In [None]:
from sklearn.ensemble import IsolationForest

# Report the data set size before outlier removal.
print('\ndataset before outlier cleaning:', X.shape, y.shape)

# Fit an isolation forest on the inputs; rows labelled -1 are treated as outliers.
# The forest is randomized, so random_state is fixed: otherwise a different set
# of rows is removed on every run and nothing downstream can be reproduced.
iforest = IsolationForest(random_state=200)
yhat = iforest.fit_predict(X)

# Keep only the rows that were not flagged as outliers.
# NOTE: X and y are overwritten, so re-running this cell filters the data again.
mask = yhat != -1
X, y = X[mask, :], y[mask]

# Report the data set size after outlier removal.
print('dataset after outlier cleaning:', X.shape, y.shape)

----------------------------------------------------------------------------------------------------
# Data normalization is performed.

In [None]:
from sklearn.preprocessing import PowerTransformer

# Drop the input column at index 7 before transforming.
# NOTE(review): the reason for excluding this column is not recorded; index 7 is
# presumably the 'density' column of the wine-quality CSV -- confirm.
# NOTE: X is overwritten, so re-running this cell removes one more column each time.
X = np.delete(X, 7, axis=1)

# Yeo-Johnson power transform of every remaining column; standardize=True also
# rescales the transformed output to zero mean and unit variance.
power = PowerTransformer(method='yeo-johnson', standardize=True)
X_trans = power.fit_transform(X)

# Histogram of each transformed column.
plt.figure(figsize=(10, 6))
for ix in range(X_trans.shape[1]):
    # 3x4 layout: room for at most 12 panels.
    plt.subplot(3, 4, ix + 1)
    plt.hist(X_trans[:, ix], bins=25)
    # plt.grid acts on the current axes only, so it is called once per panel.
    plt.grid(True)
plt.tight_layout()
plt.show()


----------------------------------------------------------------------------------------------------
# It looks better, but the normality test shows that the data are still not normally distributed after the transformation.
# This means the results of the analysis may not be valid.

In [None]:
#### NORMALITY TEST
# Same kstest_normal check as before, now on each column of the transformed matrix.
print(X_trans.shape)
n_columns = X_trans.shape[1]
print(n_columns)
for column_index in range(n_columns):
    column_values = X_trans[:, column_index]
    p_val = ssd.kstest_normal(column_values)
    print('p-value for {} = {:.4f}'.format(column_index, p_val[1]))

----------------------------------------------------------------------------------------------------
# Multicollinearity is removed from the data set.

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Repeatedly drop the column of X_trans with the largest variance inflation
# factor (VIF) until the largest VIF is no longer above 10.  The number of
# passes is capped at the number of columns present when the loop starts.
for attempt in range(X_trans.shape[1]):
    column_count = X_trans.shape[1]
    vif = [variance_inflation_factor(X_trans, col) for col in range(column_count)]
    largest_vif = max(vif)
    maxloc = vif.index(largest_vif)
    print('maxloc', maxloc)
    # Stop as soon as the worst column is acceptable.
    if not largest_vif > 10:
        break
    print('dropping at index:  ' + str(maxloc))
    X_trans = np.delete(X_trans, maxloc, axis=1)
print(X_trans.shape)

----------------------------------------------------------------------------------------------------
# Feature selection and model selection are performed in one step.
# f_classif or mutual_info_classif is used for feature selection with every algorithm.
# Model performance is measured with negative log loss and accuracy.

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

# Shared evaluation scheme: stratified 5-fold CV repeated 3 times, fixed seed.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=200)

results = []        # per-model accuracy scores (one array per model)
logloss_resu = []   # per-model neg_log_loss scores (one array per model)
names = []          # model labels, in the same order as the two lists above

# Algorithms
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    # probability=True so the SVM provides the class probabilities log-loss needs.
    ('SVM', SVC(gamma='auto', probability=True)),
]

# Search space: how many features to keep and which score function ranks them.
# k is bounded by the column count of X_trans, the matrix that is actually
# fitted; X_trans can have fewer columns than X after the collinearity step,
# and a k larger than the available features is not a valid setting.
grid = {
    'anova__k': list(range(1, X_trans.shape[1] + 1)),
    'anova__score_func': [f_classif, mutual_info_classif],
}

# Evaluate one feature-selection + model pipeline per algorithm.
for name, model in models:
    pipeline = Pipeline(steps=[('anova', SelectKBest()), ('model', model)])
    # random_state fixes which parameter combinations are sampled, so the
    # search is repeatable (same seed as the ensemble search uses).
    search = RandomizedSearchCV(pipeline, grid, scoring='neg_log_loss', n_jobs=-1, cv=cv, random_state=500)
    search.fit(X_trans, y)

    print('\nModel name:', name)

    # Re-score the best pipeline with the same CV scheme on both metrics.
    logloss_results = cross_val_score(search.best_estimator_, X_trans, y, cv=cv, scoring='neg_log_loss')
    logloss_resu.append(logloss_results)

    cv_results = cross_val_score(search.best_estimator_, X_trans, y, cv=cv, scoring='accuracy')
    results.append(cv_results)

    names.append(name)

    # Summarize the best configuration. best_score_ is the neg_log_loss score,
    # i.e. the log-loss with its sign flipped (closer to 0 is better).
    print('Best Mean Log-loss: %.3f' % search.best_score_)
    print('Best Config: %s' % search.best_params_)
    msg = "accuracy -> %s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# Compare Algorithms: side-by-side box plots of accuracy and neg_log_loss.
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(121)
ax.set_title('accuracy')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylim(0, 1)
ax = fig.add_subplot(122)
ax.set_title('neg_log_loss')
ax.boxplot(logloss_resu)
ax.set_xticklabels(names)
plt.tight_layout()
plt.show()

----------------------------------------------------------------------------------------------------
# Ensemble methods are used for the analysis in this section.

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from scipy.stats import randint

# ensembles
ensembles = [
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
]
results = []        # per-model accuracy scores (one array per model)
logloss_resu = []   # per-model neg_log_loss scores (one array per model)
names = []          # model labels, in the same order as the two lists above

# The evaluation scheme does not depend on the model, so it is built once.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=200)

# Search space: number of kept features and number of estimators.
# k is bounded by the column count of X_trans, the matrix that is actually
# fitted; X_trans can have fewer columns than X after the collinearity step,
# and a k larger than the available features is not a valid setting.
grid = {
    'anova__k': list(range(1, X_trans.shape[1] + 1)),
    'model__n_estimators': randint(10, 400),
}

for name, model in ensembles:
    fs = SelectKBest(score_func=f_classif)
    pipeline = Pipeline(steps=[('anova', fs), ('model', model)])
    # Seeded randomized search over the space defined above.
    search = RandomizedSearchCV(pipeline, grid, scoring='neg_log_loss', n_jobs=-1, cv=cv, random_state=500)
    search.fit(X_trans, y)
    print('\n', name)
    print('Best Parameters: ', search.best_params_)

    # Re-score the best pipeline with the same CV scheme on both metrics.
    cv_results = cross_val_score(search.best_estimator_, X_trans, y, cv=cv, scoring='accuracy')
    logloss_results = cross_val_score(search.best_estimator_, X_trans, y, cv=cv, scoring='neg_log_loss')
    results.append(cv_results)
    logloss_resu.append(logloss_results)
    names.append(name)
    msg = "accuracy -> %s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    msg = "logloss -> %s: %f (%f)" % (name, logloss_results.mean(), logloss_results.std())
    print(msg)

# Compare Algorithms: side-by-side box plots of accuracy and neg_log_loss.
fig = plt.figure()
fig.suptitle('Ensemble Algorithm Comparison')
ax = fig.add_subplot(121)
ax.set_title('accuracy')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylim(0, 1)
ax = fig.add_subplot(122)
ax.set_title('neg_log_loss')
ax.boxplot(logloss_resu)
ax.set_xticklabels(names)
plt.tight_layout()
plt.show()

----------------------------------------------------------------------------------------------------
# Model finishing: the performance of the models is checked.

In [None]:
from sklearn.metrics import confusion_matrix

# feature selection
def select_features(X, y, k=9):
    """Return X reduced to the k columns ranked best by f_classif against y.

    Parameters
    ----------
    X : array-like, 2-D
        Input matrix, one column per feature.
    y : array-like, 1-D
        Class label of each row of X.
    k : int, default 9
        Number of columns to keep.  It is capped at the number of columns
        in X, so a matrix with fewer than k columns is returned unchanged
        in width instead of being rejected by the selector.

    Returns
    -------
    The transformed matrix holding only the selected columns.
    """
    n_columns = np.shape(X)[1]
    fs = SelectKBest(score_func=f_classif, k=min(k, n_columns))
    return fs.fit_transform(X, y)

# Reduce X_trans to the selected features.
# NOTE(review): the selector is fitted on the whole data set before
# cross-validation, so the accuracy reported below may be optimistic.
# NOTE: X_trans is overwritten, so re-running this cell selects again.
X_trans = select_features(X_trans, y)


# define the evaluation method
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=200)
# NOTE(review): n_estimators=337 is presumably the value reported by the
# randomized search in the ensemble comparison -- confirm.
model = ExtraTreesClassifier(n_estimators=337)
# Fit on all rows so `model` is left trained after this cell.
model.fit(X_trans, y)
cv_results = cross_val_score(model, X_trans, y, cv=cv, scoring='accuracy')
# The label names the estimator that is scored here (extra trees, 'ET').
msg = "%s: %f (%f)" % ('ET', cv_results.mean(), cv_results.std())
print(msg)
