## Wine Quality prediction
- Attribute Information:

- For more information, read Cortez et al., 2009 (the source of this dataset).
- Input variables (based on physicochemical tests):
   1. fixed acidity
   2.  volatile acidity
   3. citric acid
   4. residual sugar
   5. chlorides
   6. free sulfur dioxide
   7.  total sulfur dioxide
   8.  density
   9.  pH
 10.  sulphates
 11.  alcohol
  - Output variable (based on sensory data):
  12.  quality (score between 0 and 10)
  
  - All attributes are numerical. We will predict the quality of the wine:
  - quality > 6.5 -> good
  - quality < 6.5 -> bad

In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

## Load the library

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import seaborn as sns 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

## read the data

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(csv_path)

## Data analysis

In [None]:
# Print the (rows, columns) shape of the dataset, then display its first rows.
print(data.shape)
data.head()

In [None]:
# Display pandas' descriptive statistics for the dataset.
data.describe()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
data.info()

In [None]:
# Count the missing values in each column.
data.isnull().sum()

There are no null values in the dataset.

In [None]:
# Count how many rows carry each quality score.
data.quality.value_counts()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 15))
corr_matrix = data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='viridis')

In [None]:
# Histogram of the pH column.
data['pH'].plot.hist()

In [None]:
# Grid of pairwise plots across the columns of the dataset.
sns.pairplot(data=data)

## Check for outliers

In [None]:
# One box per column, drawn on a wide figure so the boxes stay readable.
figure_size = (15, 10)
plt.figure(figsize=figure_size)
sns.boxplot(data=data)

To see the outliers more clearly, we look at a single feature.

In [None]:
# Box plot of a single feature. The column is passed as `x=` explicitly:
# seaborn >= 0.12 interprets the first positional argument as `data`, so a
# positional Series is handled differently depending on the seaborn version.
sns.boxplot(x=data['total sulfur dioxide'])

In [None]:
# Histogram of the residual sugar column.
data['residual sugar'].plot.hist()

## To handle the outliers we use the standard-deviation method (keep values within mean ± 3 standard deviations)


### for the free sulfur dioxide 

In [None]:
# 3-sigma rule on 'free sulfur dioxide': keep rows strictly inside mean ± 3·std.
fsd = data['free sulfur dioxide']
fsd_mean, fsd_std = fsd.mean(), fsd.std()
lower = fsd_mean - 3*fsd_std
upper = fsd_mean + 3*fsd_std
print(lower, upper)
data1 = data[(fsd > lower) & (fsd < upper)]
# Number of rows dropped by the filter (shown as the cell output).
len(data) - len(data1)

## for the total sulfur dioxide

In [None]:
# 3-sigma rule on 'total sulfur dioxide', applied to the already-filtered data1.
tsd = data1['total sulfur dioxide']
tsd_mean, tsd_std = tsd.mean(), tsd.std()
lower = tsd_mean - 3*tsd_std
upper = tsd_mean + 3*tsd_std
print(lower, upper)
data2 = data1[(tsd > lower) & (tsd < upper)]
# Number of rows dropped by the filter (shown as the cell output).
len(data1) - len(data2)

## check for the residual sugar

In [None]:
# 3-sigma rule on 'residual sugar', applied to the already-filtered data2.
sugar = data2['residual sugar']
sugar_mean, sugar_std = sugar.mean(), sugar.std()
lower = sugar_mean - 3*sugar_std
upper = sugar_mean + 3*sugar_std
print(lower, upper)
data3 = data2[(sugar > lower) & (sugar < upper)]
# Number of rows dropped by the filter (shown as the cell output).
len(data2) - len(data3)

## check for the fixed acidity

In [None]:
# 3-sigma rule on 'fixed acidity', applied to the already-filtered data3.
acidity = data3['fixed acidity']
acidity_mean, acidity_std = acidity.mean(), acidity.std()
lower = acidity_mean - 3*acidity_std
upper = acidity_mean + 3*acidity_std
print(lower, upper)
data4 = data3[(acidity > lower) & (acidity < upper)]
# Number of rows dropped by the filter (shown as the cell output).
len(data3) - len(data4)

In [None]:
# Bar chart of how many rows have each quality score. The column is passed as
# `x=` explicitly: seaborn >= 0.12 interprets the first positional argument as
# `data`, so a positional Series no longer means "count the values of x".
sns.countplot(x=data4['quality'])

## Now convert quality into two classes
- quality > 6.5 -> good (1)
- quality < 6.5 -> bad (0)

In [None]:
# Binarise the target: scores above 6 become 1 (good), everything else 0 (bad).
# data4 was produced by boolean filtering of another DataFrame, so take an
# explicit copy before assigning a column; this avoids pandas' chained-assignment
# (SettingWithCopy) hazard. The comparison is vectorised instead of a Python loop.
data4 = data4.copy()
data4['quality'] = (data4['quality'] > 6).astype(int)

In [None]:
# Count the rows per value of the quality column.
data4['quality'].value_counts()

## Split the data into independent variables (features) and the dependent variable (target)

In [None]:
# Separate the target column from the feature columns.
target_column = 'quality'
y = data4[target_column]
X = data4.drop(columns=[target_column])

## split the data into training and test data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=1)
x_train, x_test, y_train, y_test = split_parts
# Shapes of the four pieces, in the same order (shown as the cell output).
tuple(part.shape for part in split_parts)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply that
# same mapping to the test split. Re-fitting the scaler on the test data would
# scale the two splits with different parameters and leak test-set statistics
# into preprocessing, so the test split uses transform(), not fit_transform().
scaler = MinMaxScaler()
x_train_scaler = scaler.fit_transform(x_train)
x_test_scaler = scaler.transform(x_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Logistic regression

In [None]:
# Logistic regression with C=0.01, trained and evaluated on the unscaled features.
model1 = LogisticRegression(C=0.01)
model1.fit(x_train, y_train)
pred1 = model1.predict(x_test)
train_score = model1.score(x_train, y_train)
test_score = model1.score(x_test, y_test)
print("trainin set score :{}".format(train_score))
print("testing set score :{}".format(test_score))


In [None]:
# Logistic regression with C=1, trained and evaluated on the min-max scaled features.
model1 = LogisticRegression(C=1)
model1.fit(x_train_scaler, y_train)
pred1 = model1.predict(x_test_scaler)
train_score = model1.score(x_train_scaler, y_train)
test_score = model1.score(x_test_scaler, y_test)
print("trainin set score :{}".format(train_score))
print("testing set score :{}".format(test_score))


In [None]:
# k-nearest neighbours with the library's default settings, on the unscaled features.
model2 = KNeighborsClassifier()
model2.fit(x_train, y_train)
pred2 = model2.predict(x_test)
print("trainin set score :{}".format(model2.score(x_train, y_train)))
print("testing set score :{}".format(model2.score(x_test, y_test)))
# This value is a confusion matrix, not an accuracy, so it is labelled as one.
print("confusion matrix :\n{}".format(confusion_matrix(y_test, pred2)))

In [None]:
# k-nearest neighbours with k=3, on the min-max scaled features.
model2 = KNeighborsClassifier(n_neighbors=3)
model2.fit(x_train_scaler, y_train)
pred2 = model2.predict(x_test_scaler)
print("trainin set score :{}".format(model2.score(x_train_scaler, y_train)))
print("testing set score :{}".format(model2.score(x_test_scaler, y_test)))
# This value is a confusion matrix, not an accuracy, so it is labelled as one.
print("confusion matrix :\n{}".format(confusion_matrix(y_test, pred2)))

In [None]:
# Decision tree limited to depth 3 (fixed seed), on the unscaled features.
model3 = DecisionTreeClassifier(random_state=0, max_depth=3)
model3.fit(x_train, y_train)
pred3 = model3.predict(x_test)
print("trainin set score :{}".format(model3.score(x_train, y_train)))
print("testing set score :{}".format(model3.score(x_test, y_test)))
# This value is a confusion matrix, not an accuracy, so it is labelled as one.
print("confusion matrix :\n{}".format(confusion_matrix(y_test, pred3)))

In [None]:
# Support vector classifier with the library's default settings, on the unscaled features.
model4 = SVC()
model4.fit(x_train, y_train)
pred4 = model4.predict(x_test)
print("trainin set score :{}".format(model4.score(x_train, y_train)))
print("testing set score :{}".format(model4.score(x_test, y_test)))
# This value is a confusion matrix, not an accuracy, so it is labelled as one.
print("confusion matrix :\n{}".format(confusion_matrix(y_test, pred4)))

In [None]:
# Random forest with trees limited to depth 7 (fixed seed), on the unscaled features.
model5 = RandomForestClassifier(random_state=1, max_depth=7)
model5.fit(x_train, y_train)
pred5 = model5.predict(x_test)
print("trainin set score :{}".format(model5.score(x_train, y_train)))
print("testing set score :{}".format(model5.score(x_test, y_test)))
# This value is a confusion matrix, not an accuracy, so it is labelled as one.
print("confusion matrix :\n{}".format(confusion_matrix(y_test, pred5)))


In [None]:
## In RandomForestClassifier, max_depth is used to control the underfitting/overfitting trade-off. On this data, accuracy increases
## as the depth increases, but beyond a certain depth the model starts to overfit.