In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML
import base64

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from skmultilearn.model_selection import iterative_train_test_split
from skmultilearn.model_selection import IterativeStratification

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, SVR
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Read the labelled training set and the unlabelled test set from the
# competition input directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/cafecombyterwinequality/train.csv",
        "../input/cafecombyterwinequality/test.csv",
    )
)

In [None]:
# Per-column summary statistics of the training data, shown as the cell output.
df_train.describe()

In [None]:
# Preview the first rows of the training data (head() defaults to 5 rows).
df_train.head()

In [None]:
# Pairwise correlation between all columns of the training data, drawn as an
# annotated heatmap with a diverging palette.
correlation = df_train.corr()
column_names = correlation.columns

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    correlation,
    xticklabels=column_names,
    yticklabels=column_names,
    annot=True,
    cmap=sns.diverging_palette(220, 20, as_cmap=True),
    ax=ax,
)

In [None]:
# Check class balance: number of training rows per quality label
# (counted through the non-null "id" values in each group).
qualities_count = df_train.groupby("quality")["id"].count()
classes = qualities_count.index
qualities_count

In [None]:
# Feature columns fed to the model (everything except "id" and "quality").
cols = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol',
]

# Min-max scale every feature column: (x - min) / (max - min).
# NOTE(review): the min/max are taken over all of df_train, i.e. before the
# train / validation split done in the next cell.
df_train_normalized = df_train[cols].pipe(
    lambda features: (features - features.min()) / (features.max() - features.min())
)
df_train_normalized.head()

In [None]:
# Fixed seed so the hold-out split and the forest (and therefore the metrics
# below) are reproducible on Restart & Run All.
SEED = 42

# Hold out 10% of the labelled rows for validation.
x_train, x_test, y_train, y_test = train_test_split(
    df_train_normalized,
    df_train["quality"],
    test_size=0.1,
    random_state=SEED,
)

# Fit the forest exactly once; the fitted estimator is reused for the
# hold-out predictions here and for the submission later in the notebook.
model = RandomForestClassifier(random_state=SEED)
model.fit(x_train, y_train)

# Hold-out predictions and the per-class precision / recall / F1 report,
# transposed so that each class (and each average) is one row.
y_pred = model.predict(x_test)
pd.DataFrame(classification_report(y_test, y_pred, output_dict=True)).T

In [None]:
# Pin the label order explicitly (sorted union of true and predicted labels)
# so the tick labels drawn below are guaranteed to match the matrix layout.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

# Rows are true labels, columns are predicted labels.
data = confusion_matrix(y_test, y_pred, labels=labels)

fig, ax = plt.subplots()
heatmap = ax.pcolor(data, cmap='PuBu_r')
ax.invert_yaxis()  # put row 0 at the top, like a printed matrix

# Write the count in the centre of every cell.
for (row, col), count in np.ndenumerate(data):
    ax.text(col + 0.5, row + 0.5, '%d' % count, ha='center', va='center')

# Label the axes with the actual quality classes instead of cell indices.
tick_positions = np.arange(len(labels)) + 0.5
ax.set_xticks(tick_positions)
ax.set_xticklabels(labels)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels)
ax.set_xlabel("Predicted quality")
ax.set_ylabel("True quality")
ax.set_title("Confusion matrix (hold-out split)")

fig.colorbar(heatmap, ax=ax)
plt.show()

In [None]:
# The model was fitted on min-max scaled features, so the test features must
# go through the same transform before predicting. The scaling statistics come
# from the training data, never from the test set itself.
train_min = df_train[cols].min()
train_range = df_train[cols].max() - train_min
df_test_normalized = (df_test[cols] - train_min) / train_range

# Selecting by `cols` also guarantees the same feature order as in training.
predict_test = model.predict(df_test_normalized)

# Submission frame: one predicted quality per test id.
submission = pd.DataFrame({"id": df_test["id"], "quality": predict_test})
submission.head()

In [None]:
def create_download_link(df, title = "Download CSV file", filename = "data.csv"):
    """Build a clickable link that downloads ``df`` as a CSV file.

    The frame is serialised with ``to_csv(index=False)``, base64-encoded and
    embedded in a ``data:text/csv`` URI, so no file is written to disk.

    Args:
        df: Object exposing ``to_csv`` (a DataFrame in this notebook).
        title: Text shown for the link.
        filename: File name placed in the anchor's ``download`` attribute.

    Returns:
        An ``IPython.display.HTML`` object wrapping the ``<a>`` element.
        ``title`` and ``filename`` are inserted as-is, without HTML escaping.
    """
    encoded_csv = base64.b64encode(df.to_csv(index=False).encode()).decode()
    anchor_template = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    return HTML(anchor_template.format(payload=encoded_csv, title=title, filename=filename))
# Last expression of the cell, so the notebook renders the returned link;
# the default title and file name ("data.csv") are used.
create_download_link(submission)