**This is my first Kaggle notebook. Please do comment if anything needs to be corrected. I am a beginner in Machine Learning.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme; color_codes=True remaps matplotlib's shorthand
# colour codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

# Reading the files

In [None]:
# Location of the semicolon-delimited red wine CSV inside the Kaggle input directory.
# NOTE(review): confirm the file name matches the attached dataset.
RED_WINE_CSV = '../input/wine-quality-red/winequal-red.csv'
red_wine = pd.read_csv(RED_WINE_CSV, sep=';')

# Checking the file information and missing values.

**WORKING WITH RED WINE CASE**

In [None]:
# Column names, dtypes and non-null counts.
red_wine.info()

In [None]:
# Number of missing values in each column.
red_wine.isna().sum()

In [None]:
# Distinct values present in the target column.
red_wine['quality'].unique()

In [None]:
# Preview the first rows of the frame.
red_wine.head()

# EDA

In [None]:
# Histogram of the quality scores, drawn on an explicit Axes so the figure can
# carry labels and a title and still make sense when read on its own.
fig, ax = plt.subplots()
ax.hist(x='quality', data=red_wine)
ax.set(xlabel='quality', ylabel='number of wines',
       title='Distribution of red wine quality scores');

***From the histogram we can observe that the dataset is imbalanced: most wines have a QUALITY SCORE of 5/6/7, and only a few have scores of 3/4/8. Hence, we need to preprocess the data carefully so that we don't end up with a bad model, and we need to choose appropriate evaluation metrics.***

In [None]:
# Share of rows (in percent) that fall into each quality score.
red_wine.groupby('quality')['quality'].count().div(len(red_wine)).mul(100)

In [None]:
# Bar plots of four features against quality on a 2x2 grid, one palette per panel.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(7, 7)
plt.subplots_adjust(left=0.06, bottom=0, right=2.2, top=0.8, wspace=None, hspace=0.35)
panels = [
    ('fixed acidity', 'colorblind'),
    ('volatile acidity', 'muted'),
    ('citric acid', 'dark'),
    ('residual sugar', 'deep'),
]
# axes.flat walks the grid row by row, matching the panel order above.
for axis, (column, palette_name) in zip(axes.flat, panels):
    sns.barplot(x='quality', y=column, data=red_wine, palette=palette_name, ax=axis)

In [None]:
# Bar plots of the next four features against quality on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(7, 7)
plt.subplots_adjust(left=0.06, bottom=0, right=2.2, top=0.8, wspace=None, hspace=0.35)
panels = [
    ('chlorides', 'colorblind'),
    ('free sulfur dioxide', 'muted'),
    ('total sulfur dioxide', 'dark'),
    ('density', 'deep'),
]
# axes.flat walks the grid row by row, matching the panel order above.
for axis, (column, palette_name) in zip(axes.flat, panels):
    sns.barplot(x='quality', y=column, data=red_wine, palette=palette_name, ax=axis)

In [None]:
# Bar plots of the remaining three features against quality on a 2x2 grid.
fig, axes = plt.subplots(nrows=2, ncols=2)
fig.set_size_inches(7, 7)
plt.subplots_adjust(left=0.06, bottom=0, right=2.2, top=0.8, wspace=None, hspace=0.35)
panels = [
    ('pH', 'colorblind'),
    ('sulphates', 'muted'),
    ('alcohol', 'dark'),
]
# Only three features are left, so zip() stops before the last grid cell.
for axis, (column, palette_name) in zip(axes.flat, panels):
    sns.barplot(x='quality', y=column, data=red_wine, palette=palette_name, ax=axis)
# Hide the unused fourth panel instead of showing an empty axes frame.
axes[1, 1].set_visible(False)

**Observations:**

1. Fixed Acidity, Residual Sugar, Density, pH and Alcohol remain almost the same across wines with different quality scores.

2. The wine with the Highest Quality score has:
       (i) the lowest value of volatile acidity/chlorides.
       (ii) highest value of citric acid, sulphates.
       In other words, the amount of volatile acidity/chlorides decreases
       as we go from wine with lowest to the highest quality and vice versa.
       Secondly, the amount of citric acid/sulphates increases as we move from  
       low quality to high quality wines. 

3. Total Sulfur dioxide is highest in wines with mid-range quality scores and is lower in wines of both very low and very high quality.


In [None]:
# Scatter plots of volatile acidity against citric acid and against sulphates.
fig, axes = plt.subplots(nrows=1, ncols=2)
fig.set_size_inches(5, 4)
plt.subplots_adjust(left=0.06, bottom=0, right=2.2, top=0.8, wspace=None, hspace=0.35)
sns.scatterplot(x='citric acid', y='volatile acidity', data=red_wine, ax=axes[0])
# No palette here: without a hue variable a palette has no effect on the plot.
sns.scatterplot(x='sulphates', y='volatile acidity', data=red_wine, ax=axes[1]);

# CHECKING CORRELATION MATRIX.

In [None]:
# Pairwise correlations between the feature columns only (target excluded),
# drawn as an annotated heatmap.
feature_corr = red_wine.drop(columns=['quality']).corr()
plt.figure(figsize=(12, 10))
sns.heatmap(feature_corr, annot=True);

# IMPORTING NECESSARY LIBRARIES

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score,recall_score,accuracy_score,multilabel_confusion_matrix,classification_report
from sklearn.preprocessing import MinMaxScaler,LabelEncoder

# FURTHER PRE-PROCESSING

In [None]:
# The dataset contains several distinct quality scores, and some of them occur
# only a handful of times. To reduce the number of categories, the scores are
# grouped into two classes according to the range they fall in.

def transform_quality(q):
  """Map a numeric quality score to a binary label.

  Args:
    q: Numeric wine quality score.

  Returns:
    'Bad' when the score is 6 or lower, otherwise 'Good'.
  """
  # No lower bound on the 'Bad' range: a score below 3 must not fall through
  # to 'Good'.
  return 'Bad' if q <= 6 else 'Good'

In [None]:
# Convert the numeric scores to 'Bad'/'Good' labels. The guard keeps this cell
# safe to re-run: once the column holds labels, applying transform_quality again
# would compare strings with numbers and raise a TypeError.
if pd.api.types.is_numeric_dtype(red_wine['quality']):
    red_wine['quality']=red_wine['quality'].apply(transform_quality)

# PREPARING TRAINING AND VALIDATION DATASET.

In [None]:
# Names of all predictor columns (everything except the target).
features = red_wine.columns.drop('quality').tolist()

In [None]:
# Feature matrix: every column except the target.
X = red_wine.drop(columns=['quality'])
# Target vector: the quality label column.
y = red_wine['quality']

In [None]:
# MinMaxScaler is used for the feature columns, LabelEncoder for the target labels.
scaler=MinMaxScaler()
le=LabelEncoder()

In [None]:
X=scaler.fit_transform(X)
y=le.fit_transform(y)

In [None]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.20,random_state=42)

# APPLYING MACHINE LEARNING ALGORITHMS.

# LOGISTIC REGRESSION.

In [None]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
LR_model=LogisticRegression()

In [None]:
# Fit the logistic regression model on the training split.
LR_model.fit(X_train,y_train)

In [None]:
# Validation accuracy of the logistic regression model.
LR_model.score(X_val, y_val)

In [None]:
# Training accuracy, for comparison with the validation accuracy.
LR_model.score(X_train, y_train)

In [None]:
# Per-class precision/recall/F1 on the validation split, the same split the
# other models in this notebook are reported on.
print(classification_report(y_val,LR_model.predict(X_val)))

# RANDOM FOREST CLASSIFIER.

In [None]:
# Random forest with 200 trees. The seed is fixed so that a fresh
# Restart & Run All reproduces the same scores.
RFC_model=RandomForestClassifier(n_estimators=200,random_state=42)

In [None]:
# Fit the random forest on the training split.
RFC_model.fit(X_train,y_train)

In [None]:
# Validation accuracy of the random forest.
RFC_model.score(X_val, y_val)

In [None]:
# Per-class precision/recall/F1 of the random forest on the validation split.
rfc_val_pred = RFC_model.predict(X_val)
print(classification_report(y_val, rfc_val_pred))

# KNeighborsClassifier.

In [None]:
# k-nearest-neighbours classifier that looks at the 3 closest training points.
KNC=KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the k-nearest-neighbours model on the training split.
KNC.fit(X_train,y_train)

In [None]:
# Validation accuracy of the k-nearest-neighbours model.
KNC.score(X_val, y_val)

In [None]:
# Per-class precision/recall/F1 of the k-nearest-neighbours model on the validation split.
knc_val_pred = KNC.predict(X_val)
print(classification_report(y_val, knc_val_pred))