In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Prediction of Quality for Different Types of Wine Based on Different Feature Sets
Using Supervised Machine Learning Techniques**

**Decision Tree, KNN, Random Forest**

# Description:
1 - fixed acidity
2 - volatile acidity
3 - citric acid
4 - residual sugar
5 - chlorides
6 - free sulfur dioxide
7 - total sulfur dioxide
8 - density
9 - pH
10 - sulphates
11 - alcohol

Output variable:
12 - quality (score between 0 and 10)
    

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
df=pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df.head()

In [None]:
df.info()

# checking for datatype and other information related to the dataset

In [None]:
df.describe()

In [None]:
# Only a cell's last expression is displayed automatically, so capture the
# duplicate count before dropping and report both numbers explicitly.
n_duplicates_before = df.duplicated().sum()
# Reassign rather than mutate in place so the step reads as a plain transform.
df = df.drop_duplicates()
print(f'Duplicate rows before: {n_duplicates_before}, after: {df.duplicated().sum()}')

# checking for duplicated records 

In [None]:
df.isnull().sum()
# no null value is present

In [None]:
df.corr()

In [None]:
# Collect every ordered pair of distinct columns whose correlation
# coefficient is at least 0.5, as [row_column, other_column, coefficient].
corr_features = []

for row_name, row in df.corr().iterrows():
    for col_name, coef in zip(row.index, row.values):
        # Skip the diagonal (a column paired with itself).
        if row_name != col_name and coef >= 0.5:
            corr_features.append([row_name, col_name, coef])
corr_features
# The iterrows () is responsible for loop through each row of the DataFrame. 
# It returns an iterator that contains index and data of each row as a Series.

In [None]:
# Gather the column names from every correlated pair whose coefficient is
# at least 0.6.
feat = []
for row_name, col_name, coef in corr_features:
    if coef >= 0.6:
        feat.extend([row_name, col_name])

# Print once, after the loop: printing inside it re-emitted the growing list
# for every matching pair.
print(feat)
        
# this shows that correlation is less than .7 so we will not drop any column based on this

In [None]:
df.corr()['quality']

In [None]:
# df.drop(['residual sugar'],axis=1,inplace=True)

In [None]:
# Annotated heatmap of the pairwise correlation matrix.
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(df.corr(), annot=True, fmt='0.2f', cmap="rainbow", ax=ax)

In [None]:
fig = plt.figure(figsize = (10,20))

# value_counts() orders the classes by frequency, whereas unique() orders them
# by first appearance; taking the labels from the counts' own index keeps each
# label attached to the wedge it describes.
quality_counts = df.quality.value_counts()
plt.pie(quality_counts,
        autopct='%1.1f%%',
        labels=quality_counts.index);

# visualising the counts of our target feature

In [None]:
# Histogram of the quality scores, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.hist(df.quality, bins=6, histtype='bar')

ax.set_title('Distribution of the Quality')
ax.set_xlabel('Quality')
ax.set_ylabel('Count')
plt.show()


In [None]:
# #Composition of chloride and quality of the wine
fig = plt.figure(figsize = (10,6))
sns.barplot(x = 'quality', y = 'chlorides', data = df)

In [None]:
# One box plot per column except the last one (presumably the `quality`
# target — confirm column order) to inspect outliers.
for column in df.columns[:-1]:
    # Pass the values via the `x` keyword only: giving them positionally while
    # also passing `data=` is rejected by recent seaborn releases.
    sns.boxplot(x=df[column])
    print(column)
    plt.show()
    
# boxplot for outliers

In [None]:
def boxoutlier(var, factor=1.5):
    """Cap outliers in every column except the last using IQR fences.

    For each processed column, values above ``Q3 + factor * IQR`` are replaced
    by that upper fence and values below ``Q1 - factor * IQR`` by the lower
    fence, where Q1/Q3 are the 25th/75th percentiles and ``IQR = Q3 - Q1``.
    The last column is skipped and left unchanged.

    Parameters
    ----------
    var : pandas.DataFrame
        Frame whose columns are capped. It is modified in place.
    factor : float, default 1.5
        Multiple of the inter-quartile range used to place the fences.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after capping.
    """
    for x in var.columns[:-1]:
        q1 = var[x].quantile(0.25)
        q3 = var[x].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - (factor * iqr)
        upper = q3 + (factor * iqr)

        # clip() applies both fences in a single pass; to_numpy() keeps the
        # assignment positional so it does not depend on index alignment.
        var.loc[:, x] = var[x].clip(lower=lower, upper=upper).to_numpy()
    return var

df=boxoutlier(df)

In [None]:
# Re-draw the per-column box plots (all columns except the last) to check the
# effect of the outlier capping.
for column in df.columns[:-1]:
    # Pass the values via the `x` keyword only: giving them positionally while
    # also passing `data=` is rejected by recent seaborn releases.
    sns.boxplot(x=df[column])
    print(column)
    plt.show()

In [None]:
sns.distplot(df['alcohol'])

In [None]:
# sns.pairplot(data=df)

In [None]:
# Minimum of three features per quality score. Several columns must be selected
# with a list: tuple-style selection on a GroupBy raises in pandas 2.x.
df.groupby('quality')[['alcohol', 'total sulfur dioxide', 'chlorides']].min()

In [None]:
# Name the x/y columns by keyword and let seaborn look them up in `data`;
# recent seaborn releases no longer accept positional x/y vectors.
sns.barplot(x='quality', y='alcohol', data=df, palette='coolwarm')
plt.title("Barplot of Quality and Alcohol")
plt.show()

In [None]:
#Sulphates level goes higher with the quality of wine
sns.barplot(x = 'quality', y = 'sulphates', data = df)

In [None]:
sns.violinplot(x="quality", y='citric acid', data = df)

In [None]:
sns.boxplot(x = df['density'], data= df, palette='rainbow', orient='h', )
plt.show()

In [None]:
X=df.drop('quality',axis=1)

In [None]:
y=df['quality']


In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=4 )
display(X_train.head(),y_train.head(),'Testing Data',X_test.head(),y_test.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor

In [None]:
# Seed the forest so the reported scores are reproducible across runs; the
# train/test split is already seeded with the same value.
clf = RandomForestClassifier(n_estimators=500, random_state=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred)

In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
print(classification_report(y_test,pred))

## Here we can see that the model performs poorly for classes 3 and 4,
## so we will convert the target into binary values.

In [None]:
# Reload the raw CSV into a separate frame for the binary-classification experiment.
# NOTE(review): unlike `df`, this frame has not had duplicates dropped or
# outliers capped — confirm that training on the raw data is intended.
df1=pd.read_csv('../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df1.head()

In [None]:
df1.quality.unique()
sns.countplot(x="quality", data=df1)

In [None]:
# From the observation above, we can split the wines into good and bad quality.
# Quality scores of 5 or below are treated as bad quality (0), and
# quality scores of 6 or above are treated as good quality (1).

In [None]:
# Binarise the target: scores above 5 become 1, everything else becomes 0.
df1['quality'] = (df1['quality'] > 5).astype('int64')

In [None]:
fig = plt.figure(figsize = (10,6))

sns.barplot(x = 'quality', y = 'chlorides', data = df1)

In [None]:
df1.head()

In [None]:
# Reassign instead of dropping in place, and tolerate an already-missing
# column, so the cell can be re-run without raising KeyError.
df1 = df1.drop(columns=['residual sugar'], errors='ignore')

In [None]:
X=df1.drop('quality',axis=1)

In [None]:
y=df1['quality']

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=4 )
display(X_train.head(),y_train.head(),'Testing Data',X_test.head(),y_test.head())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seed the forest so the reported train/test accuracies are reproducible; the
# train/test split is already seeded with the same value.
clf = RandomForestClassifier(n_estimators=65, random_state=4)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)


In [None]:
pred_train=clf.predict(X_train)
pred_train

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train,pred_train)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,pred)

In [None]:
from sklearn.metrics import accuracy_score,classification_report
print(classification_report(y_test,pred))

In [None]:
# Accuracy on the test data is about 79% with RandomForestClassifier after binarising the target variable.
# Earlier, with many classes present, the model did not perform as well.