In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root_dir, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the wine-quality CSV into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/wine-quality/winequalityN.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# load dataset

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Print column names, non-null counts and dtypes to spot missing data and non-numeric columns.
df.info()

# Preprocessing the dataset

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Impute missing values: every column except 'type' gets its NaNs replaced
# by that column's mean ('type' is skipped and left untouched).
columns_to_fill = [name for name in df.columns if name != 'type']
for name in columns_to_fill:
    column = df[name]
    df[name] = column.fillna(column.mean())

In [None]:
# Re-count missing values per column to confirm the result of the fill step.
df.isna().sum()

# Exploratory analysis

In [None]:
# Box plots: one panel per column (excluding 'type') laid out on a 2x6 grid.
fig, ax = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
ax = ax.flatten()

boxplot_columns = [name for name in df.columns if name != 'type']
for index, name in enumerate(boxplot_columns):
    # Each column is drawn on the next free axis of the flattened grid.
    sns.boxplot(data=df, y=name, ax=ax[index])

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)
        

In [None]:
# Distribution plots: one histogram per column (excluding 'type') on a shared grid.
#
# sns.displot is a figure-level function: it cannot draw onto an existing axis
# and opens a new figure on every call, so the axes-level sns.histplot is used
# to fill the grid instead. The figure is created in this cell so that it does
# not depend on (or draw over) axes left behind by an earlier cell.
plot_columns = [name for name in df.columns if name != 'type']
n_cols = 6
# Ceiling division so every column gets a panel; 12 columns give the 2x6 layout.
n_rows = max(1, -(-len(plot_columns) // n_cols))

fig, ax = plt.subplots(ncols=n_cols, nrows=n_rows, figsize=(20, 5 * n_rows))
ax = ax.flatten()

for index, name in enumerate(plot_columns):
    sns.histplot(data=df, x=name, ax=ax[index])

# Hide any panels of the grid that were not needed.
for spare_axis in ax[len(plot_columns):]:
    spare_axis.set_visible(False)

plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Log transformation: store log(free sulfur dioxide) in a new '_log' column.
# NOTE(review): np.log yields -inf for zeros and NaN for negative values;
# confirm this column is strictly positive.
df = df.assign(**{'free sulfur dioxide_log': np.log(df['free sulfur dioxide'])})

In [None]:
# Distribution of the log-transformed free sulfur dioxide values.
sns.displot(data=df['free sulfur dioxide_log'])

In [None]:
# Store log(alcohol) in a new column, then plot the transformed distribution.
# NOTE(review): assumes alcohol is strictly positive (np.log gives -inf at 0) — confirm.
df = df.assign(alcohol_log=np.log(df['alcohol']))
sns.displot(data=df['alcohol_log'])

In [None]:
# Store log(total sulfur dioxide) in a new column, then plot the transformed distribution.
# NOTE(review): assumes the column is strictly positive (np.log gives -inf at 0) — confirm.
df['total sulfur dioxide_log'] = np.log(df['total sulfur dioxide'].to_numpy())
sns.displot(data=df['total sulfur dioxide_log'])

In [None]:
# Store log(volatile acidity) in a new column, then plot the transformed distribution.
# NOTE(review): assumes the column is strictly positive (np.log gives -inf at 0) — confirm.
df = df.assign(**{'volatile acidity_log': np.log(df['volatile acidity'])})
sns.displot(data=df['volatile acidity_log'])

In [None]:
# Store log(residual sugar) in a new column, then plot the transformed distribution.
# NOTE(review): assumes the column is strictly positive (np.log gives -inf at 0) — confirm.
df['residual sugar_log'] = np.log(df['residual sugar'].to_numpy())
sns.displot(data=df['residual sugar_log'])

In [None]:
# Preview the first five rows, now including the added '_log' columns.
df.head(n=5)

In [None]:
# Remove the raw columns that now have log-transformed counterparts,
# keeping only the '_log' versions of these features.
raw_columns_to_drop = [
    'residual sugar',
    'alcohol',
    'free sulfur dioxide',
    'volatile acidity',
    'total sulfur dioxide',
]
df = df.drop(columns=raw_columns_to_drop)
df.head()

In [None]:
# Number of rows per wine type.
# The variable is passed by keyword: newer seaborn releases treat the first
# positional argument as `data` and accept `x` only as a keyword, so a
# positional Series no longer produces the per-category count plot.
sns.countplot(x='type', data=df)

In [None]:
# Number of rows per quality score (shows how the target classes are distributed).
# The variable is passed by keyword: newer seaborn releases treat the first
# positional argument as `data` and accept `x` only as a keyword, so a
# positional Series no longer produces the per-category count plot.
sns.countplot(x='quality', data=df)

# correlation matrix

In [None]:
# Pairwise correlation of the numeric columns.
# Non-numeric columns (the frame still contains 'type') are excluded explicitly:
# pandas 2.0+ no longer drops them silently in DataFrame.corr and raises instead.
df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Non-numeric columns (the frame still contains 'type') are excluded explicitly:
# pandas 2.0+ no longer drops them silently in DataFrame.corr and raises instead.
plt.figure(figsize=(12, 10))
correlations = df.select_dtypes(include='number').corr()
sns.heatmap(correlations, annot=True)

# input split

In [None]:
# Target is the quality score; features are all remaining columns except 'type'.
target_column = 'quality'
y = df[target_column]
x = df.drop(columns=['type', target_column])


# Class imbalance

In [None]:
# Number of samples in each quality class (most frequent first).
y.value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority quality classes with synthetic samples (SMOTE).
# random_state is fixed so the synthetic rows, and every score computed from
# them, are reproducible across runs.
# NOTE(review): k_neighbors=4 was presumably chosen because the rarest class
# has very few rows — verify against the class counts.
# NOTE(review): resampling the full dataset before any train/test split lets
# synthetic rows derived from held-out rows reach the training data, which
# tends to inflate evaluation scores; consider resampling inside the training
# folds only (e.g. with imblearn.pipeline.Pipeline).
oversample = SMOTE(k_neighbors=4, random_state=34)
# Transform the dataset: x and y are replaced by their resampled versions.
x, y = oversample.fit_resample(x, y)

In [None]:
# Number of samples in each quality class after the resampling step above.
y.value_counts()

# model training

In [None]:
#classify function
from sklearn.model_selection import cross_val_score, train_test_split
def classify(model,x,y):
    """Fit ``model`` and print its hold-out accuracy and 5-fold CV score.

    The data is split 75/25 with a fixed seed, the model is fitted on the
    training part and its ``score`` on the test part is printed as a
    percentage. The mean of a 5-fold ``cross_val_score`` over the full
    ``x``/``y`` is then printed as a percentage as well.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``score``; it is fitted in place.
    x : feature matrix.
    y : target values.

    Returns
    -------
    None; the two scores are only printed.
    """
    # Fixed-seed 75/25 hold-out split.
    split = train_test_split(x, y, test_size=0.25, random_state=34)
    features_train, features_test, target_train, target_test = split

    model.fit(features_train, target_train)
    holdout_score = model.score(features_test, target_test)
    print('Accuracy ', holdout_score * 100)

    # Cross-validation on the complete data set.
    fold_scores = cross_val_score(model, x, y, cv=5)
    print('CV Score ', np.mean(fold_scores) * 100)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Logistic regression baseline.
# With the default 100 solver iterations on unscaled features the optimiser
# commonly stops before converging, and the resulting warning is hidden by the
# notebook-wide warnings filter. The features are therefore standardised and
# the iteration budget raised. Scaling lives inside the pipeline so it is
# re-fitted on the training portion of every split made by classify.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
classify(model,x,y)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with default hyper-parameters.
# random_state is fixed because the tree shuffles candidate features at each
# split; unseeded, the printed scores change from run to run.
model = DecisionTreeClassifier(random_state=34)
classify(model,x,y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters.
# random_state is fixed because bootstrap sampling and feature selection are
# random; unseeded, the printed scores change from run to run.
model = RandomForestClassifier(random_state=34)
classify(model,x,y)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomised trees with default hyper-parameters.
# random_state is fixed because split thresholds and candidate features are
# drawn at random; unseeded, the printed scores change from run to run.
model = ExtraTreesClassifier(random_state=34)
classify(model,x,y)

In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Gradient-boosted trees via XGBoost with default hyper-parameters.
# Recent XGBoost releases require class labels to be exactly 0..n_classes-1
# and raise on anything else. The quality labels are presumably not zero-based
# (verify with y.value_counts()), so they are re-encoded for this model only.
# The encoding is a one-to-one relabelling, so accuracy is unaffected, and it
# leaves labels that are already 0..n_classes-1 unchanged.
model = XGBClassifier()
y_encoded = LabelEncoder().fit_transform(y)
classify(model, x, y_encoded)

In [None]:
import lightgbm

# Gradient-boosted trees via LightGBM with default hyper-parameters,
# evaluated with the same classify helper as the other models.
model = lightgbm.LGBMClassifier()
classify(model=model, x=x, y=y)
