In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the whole input directory tree and print the full path of every file found.
# NOTE(review): `dirname` and `filename` stay bound after the loop ends (to the last
# directory walked and the last file iterated, which are not guaranteed to belong
# together). Check whether later cells read them before renaming or restructuring this loop.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Importing Libraries

In [None]:
# importing libraries
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.utils import resample
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.ensemble import *
from sklearn.decomposition import PCA
from sklearn.metrics import *

# 2. Importing Dataset

In [None]:
# file path of quality wine dataset
# Search the input directory explicitly instead of reusing the `dirname` / `filename`
# variables left over from the listing loop in the first cell: those only exist if
# that cell already ran, and they can point at a directory/file pair that does not
# belong together.
csv_paths = [
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
]
if not csv_paths:
    raise FileNotFoundError("no CSV file found under /kaggle/input")

# Keep the last CSV in walk order, i.e. the file the listing loop ends on
# whenever that file is a CSV.
file_path = csv_paths[-1]

In [None]:
# load the dataset into a DataFrame (a comma is already read_csv's default separator)
df = pd.read_csv(file_path)

# 3. Exploring Data

In [None]:
# preview the first five rows
df.head(n=5)

In [None]:
# summary statistics of the dataframe columns
df.describe()

In [None]:
# number of missing values per column
df.isna().sum()

In [None]:
# engineered feature: sum of the fixed and volatile acidity columns
df['total acidity'] = df['fixed acidity'].add(df['volatile acidity'])

# pairwise correlation of all columns, drawn as a heatmap
matrix_corr = df.corr()
axis_labels = matrix_corr.columns

sns.heatmap(matrix_corr,
            cmap='YlGnBu',
            xticklabels=axis_labels,
            yticklabels=axis_labels)

In [None]:
# number of rows per quality score; the plot shows that the dataset is unbalanced
sns.countplot(data=df, x='quality')

In [None]:
# number of rows with quality score 5
max_samples = int((df['quality'] == 5).sum())

# majority dataset: the rows scored 5 or 6
df_maj = df[df['quality'].isin([5, 6])]

In [None]:
# Oversample each minority quality score, with replacement, to `max_samples` rows.
# One loop replaces four copy-pasted resample() calls; the parameters and the
# concatenation order (majority rows, then scores 3, 4, 7, 8) are unchanged.
#
# NOTE(review): this oversampling runs before the train/test split further down,
# so copies of the same minority row can end up in both the training and the test
# set and inflate the reported accuracy. Consider splitting first and resampling
# only the training part.
minority_scores = [3, 4, 7, 8]

oversampled_parts = []
for score in minority_scores:
    score_rows = df[df['quality'] == score]
    if score_rows.empty:
        # no rows with this score, so there is nothing to draw samples from
        continue
    oversampled_parts.append(
        resample(score_rows,
                 replace=True,
                 n_samples=max_samples,
                 random_state=0)
    )

# creating balanced dataset
df = pd.concat([df_maj] + oversampled_parts)

In [None]:
# creating PCA features
n_pca_components = 8
pca_columns = ['PCA' + str(i) for i in range(1, n_pca_components + 1)]

# Inputs are every column except the target. PCA columns left over from a previous
# run of this cell are dropped too, so re-running the cell does not feed old
# components back into the scaler and the PCA.
stale_pca_columns = [column for column in pca_columns if column in df.columns]
features = df.drop(columns=['quality'] + stale_pca_columns)
label = df['quality']

# normalize dataset features
# NOTE(review): the scaler and the PCA are fitted on all rows, before the
# train/test split further down, so test rows influence the transformation.
scaler_atr = StandardScaler()
atb = scaler_atr.fit_transform(features)

# Plain ndarray instead of np.matrix: np.matrix is deprecated and current
# scikit-learn input validation rejects it.
X = np.asarray(atb)

pca = PCA(n_components=n_pca_components)
pca.fit(X)

# share of variance explained by each component, rounded to 2 decimals
components = np.round(pca.explained_variance_ratio_, 2)

# project once and reuse the result for every component column
pca_scores = pca.transform(X)

# applying in dataset (array assignment is positional, one value per row of df)
for position, column in enumerate(pca_columns):
    df[column] = pca_scores[:, position]

In [None]:
# correlation of every column with `quality`, highest value first
matrix_corr = df.corr()['quality'].sort_values(ascending=False)
print(matrix_corr)

In [None]:
# columns used as model inputs (order kept exactly as originally listed)
feature_columns = [
    'PCA2', 'PCA3', 'alcohol', 'volatile acidity', 'sulphates',
    'citric acid', 'total sulfur dioxide', 'density', 'chlorides',
    'fixed acidity', 'PCA1', 'PCA4', 'PCA5', 'PCA6',
    'PCA7', 'PCA8', 'total acidity', 'pH',
]
features = df[feature_columns]

In [None]:
# splitting dataset into train (80%) and test (20%) with a fixed seed
split_parts = train_test_split(features, label, test_size=0.20, random_state=0)
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
# hyper-parameter candidates for the grid search: one dict of value lists,
# wrapped in a list
forest_search_space = {
    'n_estimators': [20, 40, 45, 50, 55, 60, 70, 100, 150, 200, 250, 300, 350, 400],
    'max_depth': [7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22, 25, 30, 35, 40],
    'criterion': ['gini', 'entropy'],
}
param_grid = [forest_search_space]

In [None]:
# creating Random Forest Classifier to train the model
# Seeded with the same value used for resampling and splitting, so repeated
# runs build the same forests and the search result is reproducible.
clf = RandomForestClassifier(random_state=0)

In [None]:
# Exhaustive search over every parameter combination in param_grid, scored by
# accuracy with 3-fold cross-validation. n_jobs=-1 runs the candidate fits in
# parallel on all available cores instead of one after another.
gs = GridSearchCV(clf,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=3,
                  n_jobs=-1)

In [None]:
# run the grid search: fit every candidate from param_grid on the training split
gs.fit(X=train_features, y=train_labels)

In [None]:
# parameter combination with the best cross-validated score
best_found = gs.best_params_
print(best_found)

In [None]:
# creating classifier with best params found
# Every key the search tuned is passed through, so this stays in sync with
# param_grid. A fixed seed is added unless the search supplied one, so the
# reported accuracy is reproducible between runs.
final_params = dict(gs.best_params_)
final_params.setdefault('random_state', 0)

clf = RandomForestClassifier(**final_params)

In [None]:
# training the classifier built from the best grid-search params
clf.fit(X=train_features, y=train_labels)

In [None]:
# predicted quality score for every row of the test split
predictions = clf.predict(X=test_features)

In [None]:
# evaluating the model: fraction of test rows whose predicted score matches the label
acc = sklearn.metrics.accuracy_score(y_true=test_labels, y_pred=predictions)
print('Accuracy: ', acc)