In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the wine-quality CSV from the Kaggle input directory into a DataFrame.
wine_csv_path = '/kaggle/input/wine-quality/winequalityN.csv'
wn_df = pd.read_csv(wine_csv_path)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Encode type variable

In [None]:
# Encode the wine type as an integer: "red" -> 0, every other value -> 1.
# Membership in {'red', 0} (rather than equality with "red") keeps the cell
# idempotent: running it again on the already-encoded column leaves 0 as 0
# instead of turning every row into 1.
wn_df['type'] = (~wn_df['type'].isin(['red', 0])).astype('int64')

### EDA

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of wn_df.
corr_matrix = wn_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, ax=ax)
plt.show()

In [None]:
def univariate(var, data=None):
    """Draw a vertical box plot of a single column and show it.

    Parameters
    ----------
    var : str
        Name of the column to plot.
    data : pandas.DataFrame, optional
        Frame that holds the column. When omitted, the notebook-level
        ``wn_df`` is used (looked up at call time), so existing
        ``univariate(col)`` calls keep working unchanged.
    """
    if data is None:
        data = wn_df
    ax = sns.boxplot(data=data, y=var)
    # A title lets each figure stand on its own when the notebook is skimmed.
    ax.set_title(f"Distribution of {var}")
    plt.show()

In [None]:
# Display the column labels of the loaded frame.
wn_df.columns

In [None]:
# One box plot per numeric feature; each feature is drawn exactly once.
for col in ['fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']:
    univariate(col)

#### There are some outliers in the numerical variables. Let's remove them.

In [None]:
numeric_cols = ['fixed acidity', 'volatile acidity', 'citric acid',
                'residual sugar', 'chlorides', 'free sulfur dioxide',
                'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']

# Compute every upper cut-off (99th percentile) once, on the frame as it is
# before any row is removed, so the result does not depend on column order.
upper_bounds = wn_df[numeric_cols].quantile(0.99)

# Keep a row only when each measured feature is below its cut-off. Missing
# values are let through explicitly: `NaN < x` is False, so a bare comparison
# would silently discard every row with a gap and leave nothing to impute.
within_bounds = wn_df[numeric_cols].lt(upper_bounds) | wn_df[numeric_cols].isna()

# .copy() makes the result an independent frame rather than a slice.
# NOTE: wn_df is overwritten, so re-running this cell trims the data again.
wn_df = wn_df[within_bounds.all(axis=1)].copy()

In [None]:
# Pie chart of the number of rows per wine type; the two slices are labelled
# 'red' and 'white' in group-key order.
type_counts = wn_df.groupby("type")['type'].count()
type_counts.plot.pie(labels=['red', 'white'], autopct="%.1f%%")

In [None]:
# Pie chart of the number of rows per quality score, with percentages drawn
# halfway between the centre and the edge.
quality_counts = wn_df.groupby("quality")['quality'].count()
quality_counts.plot.pie(pctdistance=0.5, autopct="%.1f%%")

### Bivariate analysis

In [None]:
# Summary statistics of the target column.
quality_stats = wn_df['quality'].describe()
quality_stats

### Numerical variables vs target variable

In [None]:
for col in ['fixed acidity', 'volatile acidity', 'citric acid',
            'residual sugar', 'chlorides', 'free sulfur dioxide',
            'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']:
    # Three side-by-side views of the same feature against the quality score:
    # bar plot, line plot and violin plot.
    fig, ax = plt.subplots(1, 3, figsize=(18, 5))
    sns.barplot(data=wn_df, x="quality", y=col, ax=ax[0])
    sns.lineplot(data=wn_df, x="quality", y=col, ax=ax[1])
    sns.violinplot(data=wn_df, x='quality', y=col, ax=ax[2])
    fig.suptitle(f"{col} vs quality")
    plt.show()
    # Release the figure once it has been rendered; eleven are created here.
    plt.close(fig)
    

##### Observation: From the above bivariate analysis, we can see that the following numerical variables have a fairly strong relationship with the target variable:
     - Volatile acidity
     - Citric acid
     - Chlorides
     - Density
     - Alcohol

In [None]:
# Work on an independent copy. Plain assignment only binds a second name to the
# same object, so any in-place change made through wn_df_v1 would also change
# wn_df.
wn_df_v1 = wn_df.copy()

In [None]:
# Print the per-column dtype and non-null count summary of the working frame.
wn_df_v1.info()

#### Fill missing values using mean method

In [None]:
# Columns whose missing values are replaced by the column mean.
impute_cols = ['chlorides', 'volatile acidity', 'fixed acidity', 'citric acid',
               'residual sugar', 'sulphates', 'pH']

# fillna() with a Series fills each column with the value stored under that
# column's label, so every listed column receives its own mean and all other
# columns are left untouched. The result is a new frame, which avoids assigning
# into an object that may be shared with (or be a slice of) another frame.
wn_df_v1 = wn_df_v1.fillna(wn_df_v1[impute_cols].mean())

#### Model building using XGBoost

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Select instead of pop(): pop() removes the column in place, so a second run
# of this cell raised KeyError and X was merely another name for the mutated
# frame. drop() returns a new frame and leaves wn_df_v1 intact.
X = wn_df_v1.drop(columns='quality')

# Encode the target as consecutive integer codes 0..n_classes-1, ordered by
# quality score. Recent XGBClassifier releases reject class labels that do not
# start at 0; quality_levels[code] maps a code back to the original score.
quality_codes, quality_levels = pd.factorize(wn_df_v1['quality'], sort=True)
Y = pd.Series(quality_codes, index=wn_df_v1.index, name='quality')

#### Handle imbalanced data to avoid bias toward the majority classes

In [None]:
from imblearn.over_sampling import SMOTE
# A fixed seed makes the synthetic samples, and every score computed from
# them, reproducible across runs.
sm = SMOTE(k_neighbors=4, random_state=42)

# NOTE(review): resampling is done before the train/test split, so synthetic
# points interpolated from rows that end up in the test set can land in the
# training set; the reported accuracies are likely optimistic. Consider
# splitting first and resampling only the training portion.
X, Y = sm.fit_resample(X, Y)

In [None]:
# Preview the first rows of the resampled feature frame instead of dumping it whole.
X.head()

#### Create train and test data

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# random_state seeds the booster so training is reproducible. The keyword must
# be spelled exactly; a misspelled name is not recognised as the seed.
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out rows with the fitted XGBoost model.
y_pred = model.predict(X_test)

In [None]:
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred) * 100.0))

### The accuracy with XGBoost is around 87%

### Model building with Random Forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest so the reported accuracy is reproducible from run to run.
rfc = RandomForestClassifier(random_state=42)

In [None]:
# Fit the random forest on the training split.
rfc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out rows with the fitted random forest.
y_pred_rc = rfc.predict(X_test)

In [None]:
# accuracy_score is documented as (y_true, y_pred); pass the arguments in that order.
print("Accuracy: %.2f%%" % (accuracy_score(y_test, y_pred_rc) * 100.0))

In [None]:
# Training-set predictions of the random forest. The estimator must be `rfc`,
# matching the variable name; `model` is the XGBoost classifier.
y_pred_rc_train = rfc.predict(X_train)

### The accuracy with Random Forest is 88%, which is better than the XGBoost model