In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IMPORT MODULES:

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
# Silence every warning for the rest of the session. This also hides any
# deprecation or convergence warnings that later cells might raise.
warnings.filterwarnings('ignore')

# LOADING THE DATASET:

In [None]:
# Load the wine-quality CSV from the Kaggle input directory into a DataFrame.
DATA_PATH = '../input/wine-quality/winequalityN.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Summary statistics of the DataFrame, using describe()'s default settings.
df.describe()

In [None]:
# Column names, non-null counts and dtypes; a quick way to spot columns with
# missing entries.
df.info()

# PREPROCESSING THE DATA:

In [None]:
# Count the missing entries in each column.
df.isna().sum()

In [None]:
# Mean-impute every column except 'type', which has object dtype and therefore
# no meaningful mean.
numeric_columns = [name for name in df.columns if name != 'type']
for name in numeric_columns:
    column_mean = df[name].mean()
    df[name] = df[name].fillna(column_mean)

In [None]:
# Count the missing entries per column again, now that the mean-fill step has run.
df.isna().sum()

# EXPLORATORY DATA ANALYSIS:

In [None]:
# Box plots: one panel per column (excluding 'type') on a 2 x 6 grid.
fig, grid = plt.subplots(nrows=2, ncols=6, figsize=(20, 10))
panels = grid.flatten()
numeric_columns = [name for name in df.columns if name != 'type']
for position, name in enumerate(numeric_columns):
    sns.boxplot(data=df, y=name, ax=panels[position])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Distribution plots: one histogram with a KDE overlay per column (excluding
# 'type') on a 2 x 6 grid.
# sns.histplot (seaborn >= 0.11) replaces the deprecated sns.distplot;
# stat='density' keeps the y-axis on the density scale that distplot drew.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(20,10))
ax = ax.flatten()
numeric_columns = [col for col in df.columns if col != 'type']

for index, col in enumerate(numeric_columns):
    sns.histplot(df[col], kde=True, stat='density', ax=ax[index])
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

In [None]:
# Log transformation of 'free sulfur dioxide': log(1 + x), computed with
# np.log1p, which is the dedicated and numerically accurate form.
# NOTE: the column is overwritten in place, so running this cell a second
# time would apply the transformation twice.
df['free sulfur dioxide'] = np.log1p(df['free sulfur dioxide'])

In [None]:
# Distribution of 'free sulfur dioxide' as currently stored in df.
# sns.histplot (seaborn >= 0.11) replaces the deprecated sns.distplot;
# stat='density' keeps the density-scaled y-axis.
sns.histplot(df['free sulfur dioxide'], kde=True, stat='density')

In [None]:
# Number of rows per wine type. The column is passed by keyword because newer
# seaborn releases no longer accept the data vector as a positional `x`.
sns.countplot(x='type', data=df)

In [None]:
# Number of rows per quality score. The column is passed by keyword because
# newer seaborn releases no longer accept the data vector as a positional `x`.
sns.countplot(x='quality', data=df)

# CORRELATION MATRIX:

In [None]:
# Pairwise correlations between the numeric columns only. The text column
# 'type' is excluded explicitly: pandas >= 2.0 no longer drops non-numeric
# columns silently in DataFrame.corr() and raises instead.
corr = df.select_dtypes(include='number').corr()

In [None]:
# Annotated heatmap of the correlation matrix on a wide 20 x 10 inch canvas.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr, annot=True, ax=heatmap_ax)

# INPUT SPLIT:

In [None]:
# Feature matrix: every column except the wine 'type' label and the 'quality' target.
X = df.drop(['type', 'quality'], axis=1)

In [None]:
# Target vector: the 'quality' column, treated as the class label by the
# classifiers trained below.
y = df['quality']

# CLASS IMBALANCE:

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Number of samples per quality class before any resampling.
y.value_counts()

In [None]:
# SMOTE oversampler that synthesises minority-class samples from 4 nearest
# neighbours. random_state is fixed so that the resampled data, and every
# score computed from it, is reproducible across runs.
# NOTE(review): k_neighbors=4 is presumably chosen because the rarest class
# has very few samples; confirm against y.value_counts().
oversample = SMOTE(k_neighbors=4, random_state=42)

In [None]:
# Replace X and y with their SMOTE-resampled versions (the originals are
# rebound, so the un-resampled split is no longer available afterwards).
# NOTE(review): resampling happens before any train/test split, so the
# hold-out and cross-validation folds created later from X, y also contain
# synthetic samples; the reported scores may therefore be optimistic.
X, y = oversample.fit_resample(X, y)

In [None]:
# Number of samples per quality class after the SMOTE resampling step.
y.value_counts()

# MODEL TRAINING:

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split

In [None]:
def classify(model, X, y, test_size=0.25, random_state=42, cv=5):
    """Fit ``model`` and print its hold-out accuracy and cross-validation score.

    The data is split once into train and test parts, the model is fitted on
    the training part and its accuracy on the test part is printed. The model
    is then cross-validated on the full ``X``/``y`` and the mean fold score is
    printed. Both numbers are reported as percentages.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place on the training split.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels aligned with ``X``.
    test_size : float, default 0.25
        Fraction of the data held out for the accuracy estimate.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    If ``X``/``y`` were oversampled before being passed in, both scores are
    computed on data that includes synthetic samples and can be optimistic.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(x_train, y_train)
    print("Accuracy:", model.score(x_test, y_test) * 100)

    # Cross-validation runs on the full data set, independently of the
    # single hold-out split above.
    scores = cross_val_score(model, X, y, cv=cv)
    print("CV Score:", np.mean(scores) * 100)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Baseline linear classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled and all warnings are silenced
# earlier in the notebook, so a possible solver convergence warning would go
# unnoticed here; worth verifying.
model = LogisticRegression()

In [None]:
# Fit and evaluate the logistic-regression baseline; prints the hold-out
# accuracy and the mean cross-validation score.
classify(model, X, y)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree. random_state is fixed so the printed scores are
# reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
classify(model, X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest. random_state is fixed so the printed scores are reproducible
# from run to run.
model = RandomForestClassifier(random_state=42)
classify(model, X, y)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extremely randomised trees. random_state is fixed so the printed scores are
# reproducible from run to run.
model = ExtraTreesClassifier(random_state=42)
classify(model, X, y)

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Recent xgboost releases require class labels to be the consecutive integers
# 0..n_classes-1 and raise an error otherwise. The quality scores do not
# start at 0, so they are re-encoded first. Accuracy is unaffected by this
# relabelling because the mapping is one-to-one.
y_encoded = LabelEncoder().fit_transform(y)

model = xgb.XGBClassifier()
classify(model, X, y_encoded)

In [None]:
import lightgbm

# Gradient-boosted trees from LightGBM with the library's default
# hyper-parameters, evaluated with the same hold-out / cross-validation
# routine as the other models.
model = lightgbm.LGBMClassifier()
classify(model, X, y)