In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

In [None]:
# Read the red-wine quality CSV and keep the raw frame untouched;
# all later cells work on the copy `df`.
csv_path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
dataset = pd.read_csv(csv_path)
df = dataset.copy()

In [None]:
# Preview the first rows of the working copy.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so that each column of the data becomes one row.
df.describe().transpose()

In [None]:
# Number of missing values per column (isna is the same check as isnull).
df.isna().sum()

In [None]:
# (rows, columns) of the working copy.
df.shape

In [None]:
# Number of distinct values in each column.
df.nunique()

## From the above, we can observe that
1. There are 12 columns and 1599 rows.
2. The data does not contain any null values.
3. There are 6 different wine quality scores.

In [None]:
# How many samples fall into each quality score.
quality_scores = df['quality']
quality_scores.value_counts()

In [None]:
# Class balance of the target: one bar per quality score.
plt.figure(figsize=(12, 5))
# Name the column by keyword: recent seaborn releases take the first positional
# argument as `data`, so a bare Series is no longer read as the `x` variable.
sns.countplot(x='quality', data=df)

### There are far fewer samples for quality 3, 4 and 8 than for quality 5 and 6.

In [None]:
# Mean alcohol content per quality score, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=df, x='quality', y='alcohol', palette='inferno', ax=ax)

In [None]:
# Mean of every column per quality score, one panel per column on a 3 x 4 grid.
fig, ax1 = plt.subplots(3, 4, figsize=(22, 16))
columns = list(df.columns)
# Pair each axes of the flattened grid with one column; zip stops at the
# shorter sequence, so the loop cannot index past either of them.
for ax, col in zip(ax1.flat, columns):
    # x/y must be keywords: positional x, y are rejected by seaborn >= 0.12.
    sns.barplot(x='quality', y=col, data=df, ax=ax)
plt.show()

## Here we see that density, pH and residual sugar are nearly the same across all wine qualities.

In [None]:
# Distribution of every column per quality score (box plots), one panel per
# column on a 3 x 4 grid; points beyond the whiskers show the outliers.
fig, ax1 = plt.subplots(3, 4, figsize=(22, 16))
columns = list(df.columns)
# Pair each axes of the flattened grid with one column; zip stops at the
# shorter sequence, so the loop cannot index past either of them.
for ax, col in zip(ax1.flat, columns):
    # x/y must be keywords: positional x, y are rejected by seaborn >= 0.12.
    sns.boxplot(x='quality', y=col, data=df, ax=ax)
plt.show()

## Many outliers are present in some features.

In [None]:
# Pairwise scatter plots of all columns. pairplot creates its own figure, so no
# plt.figure() call is made here (it would only leave an extra empty figure).
sns.pairplot(df)
plt.show()

In [None]:
# Histogram + KDE of every column, one panel per column on a 3 x 4 grid,
# used to spot skewed features.
fig, ax1 = plt.subplots(3, 4, figsize=(22, 16))
columns = list(df.columns)
for ax, col in zip(ax1.flat, columns):
    # histplot(kde=True, stat='density') is the replacement seaborn documents
    # for the deprecated distplot; density scaling keeps the y-axis comparable.
    sns.histplot(data=df, x=col, kde=True, stat='density', ax=ax)
plt.show()

## Residual Sugar, Chlorides, Free Sulphur Dioxide, Total Sulphur Dioxide, and Sulphates are highly right-skewed, so we need to transform them.

In [None]:
def log_transform(col):
    """Return the natural log of the first element of ``col``.

    ``col`` is the row Series handed over by ``DataFrame.apply(..., axis=1)``
    on a single-column frame, so its first element is that column's value.
    The element is taken by position with ``.iloc``: the row is indexed by
    column label, and looking up the integer key ``0`` on a label index relies
    on a positional fallback that pandas has deprecated.
    """
    return np.log(col.iloc[0])

# Log-transform the right-skewed features one after another.
# NOTE: this overwrites the columns of `df`, so running the cell twice
# applies the log twice.
skewed_columns = [
    'residual sugar',
    'chlorides',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
]
for name in skewed_columns:
    df[name] = df[[name]].apply(log_transform, axis=1)

In [None]:
# Correlation of every column with the target, strongest positive first.
quality_corr = df.corr()['quality']
quality_corr.sort_values(ascending=False)

### Alcohol, Sulphates and Volatile acidity have high correlations with the target variable.

In [None]:
# Annotated heatmap of the full correlation matrix on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 6))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, ax=ax)

In [None]:
# Column labels currently present in the working copy.
df.columns

In [None]:
# Remove the listed feature columns from the working copy. The result is
# reassigned instead of mutating with inplace=True, and `columns=` states the
# axis explicitly.
df = df.drop(columns=[
    'fixed acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH',
])

# Here we can observe that

1. Volatile acidity and alcohol have high correlations with the target variable.
2. Many of the independent features are in fact highly dependent on other independent features,
   e.g. pH and fixed acidity, or pH and citric acid.

In [None]:
# Alcohol vs. volatile acidity, marker style by quality score.
# x/y are passed by keyword: positional x, y are rejected by seaborn >= 0.12.
sns.scatterplot(x='alcohol', y='volatile acidity', style='quality', data=df)

## Since the classes overlap heavily, we use a decision tree and then also check the accuracy of a random forest.

In [None]:
# Target is the quality score; features are every other remaining column.
Y = df['quality']
X = df.drop(columns='quality')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; fixed seed for a repeatable split.
# Features and target are handed over as plain NumPy arrays.
X_train, X_test, Y_train, Y_test = train_test_split(
    X.to_numpy(),
    Y.to_numpy(),
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Decision tree classifier. The name `lr` is kept because the evaluation cells
# that follow read the fitted model through it.
lr = DecisionTreeClassifier(random_state=42)  # seeded so results are repeatable
# Fit on the training split only: fitting on the full X, Y would let the model
# see the test rows and inflate every score computed on X_test.
lr.fit(X_train, Y_train)
# Predicted quality labels for the held-out rows. The target was never
# log-transformed, so the labels are used as returned (no expm1).
pred = lr.predict(X_test)

In [None]:
# Share of held-out samples whose quality label is predicted exactly.
acc = accuracy_score(y_true=Y_test, y_pred=lr.predict(X_test))
acc

In [None]:
from sklearn.metrics import f1_score

In [None]:
# Support-weighted F1 over all quality classes on the held-out rows.
f1 = f1_score(y_true=Y_test, y_pred=lr.predict(X_test), average='weighted')
f1

In [None]:
# Random forest classifier. The name `lr` is kept (and rebound) because the
# evaluation cells that follow read the fitted model through it.
lr = RandomForestClassifier(random_state=42)  # seeded so results are repeatable
# Fit on the training split only: fitting on the full X, Y would let the model
# see the test rows and inflate every score computed on X_test.
lr.fit(X_train, Y_train)
# Predicted quality labels for the held-out rows. The target was never
# log-transformed, so the labels are used as returned (no expm1).
pred = lr.predict(X_test)

In [None]:
# Share of held-out samples whose quality label is predicted exactly.
acc = accuracy_score(y_true=Y_test, y_pred=lr.predict(X_test))
acc

In [None]:
# Support-weighted F1 on the held-out rows. 'weighted' is the averaging used
# for the decision tree's F1 earlier in this notebook; scoring this model with
# 'macro' instead would make the two F1 values non-comparable.
f1 = f1_score(Y_test, lr.predict(X_test), average='weighted')
f1

In [None]:
from sklearn.metrics import precision_recall_fscore_support

In [None]:
# Support-weighted precision, recall and F-score of the current model on the
# held-out rows (the support entry of the result is None when averaging).
test_predictions = lr.predict(X_test)
precision_recall_fscore_support(Y_test, test_predictions, average='weighted')