In [None]:
import numpy as np
import pandas as pd

# Read the red-wine quality CSV into `df` and preview its first rows.
df = pd.read_csv(
    "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)
df.head()

In [None]:
# Show pandas' descriptive statistics for the columns of df.
df.describe()

In [None]:
# Print the frame summary (column names, dtypes, non-null counts).
df.info()

## Prediction without preprocessing

## train_test_split

In [None]:
# Separate the target column ("quality") from the predictor columns.
y = df["quality"]
X = df.drop(columns="quality")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

## Fitting the model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import ExtraTreesClassifier 
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Baseline: random forest with default hyperparameters, fitted on the
# untransformed training split.
rfc=RandomForestClassifier()
rfc.fit(X_train,y_train)

In [None]:
# Accuracy of the baseline forest on the held-out split.
rfc.score(X_test,y_test)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Recent XGBoost releases require class labels to be the consecutive integers
# 0..n_classes-1 and raise a ValueError otherwise. The raw `quality` scores are
# not guaranteed to be zero-based, so map them to consecutive codes first.
# The encoder is fitted on the full target so train and test share one mapping.
quality_encoder = LabelEncoder().fit(y)

xgb = XGBClassifier()
xgb.fit(X_train, quality_encoder.transform(y_train))

# Score against the encoded test labels. Predictions from `xgb.predict` are
# encoded too; use `quality_encoder.inverse_transform` to recover raw scores.
xgb.score(X_test, quality_encoder.transform(y_test))

## Got an accuracy of 72% without any preprocessing

## With preprocessing and exploratory data analysis

In [None]:
import seaborn as sns

# Pairwise scatter plots (with per-column distributions) for every column.
sns.pairplot(data=df)

In [None]:
import matplotlib.pyplot as plt

# Annotated correlation heatmap, drawn on a wide canvas so the cell values
# stay legible.
plt.figure(figsize=(20, 10))
sns.heatmap(data=df.corr(), annot=True)

##### No notable correlations were found between the features, so no features are removed

In [None]:
# Bar chart of how many wines fall into each quality score.
# The variable is passed by keyword because newer seaborn releases no longer
# accept the plotted variable as a positional argument.
sns.countplot(x="quality", data=df)

## The quality classes are imbalanced

In [None]:
# Per-column variance, inspected before any transformation is applied.
df.var()

## The variance of total sulfur dioxide is much larger than that of the other features, so we apply a log transformation to the sulfur dioxide columns


In [None]:
# Replace both sulfur dioxide columns with their natural logarithm, in place.
# NOTE: not idempotent - re-running this cell applies the log a second time.
for sulfur_col in ("total sulfur dioxide", "free sulfur dioxide"):
    df[sulfur_col] = np.log(df[sulfur_col])

In [None]:
# Re-inspect the per-column variance after the log transform.
df.var()

## variance corrected

## Checking for outliers

In [None]:
# One horizontal box plot per column, laid out on a 2 x 6 grid of axes.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(20, 10))
ax = ax.flatten()
# zip pairs every column with its own axes, so no manual index counter is
# needed. It stops at the shorter sequence, so columns beyond the 12 grid
# slots would be left unplotted rather than raising an IndexError.
for col, axis in zip(df.columns, ax):
    # `x=` must be a keyword: newer seaborn treats the first positional
    # argument as `data`, which would clash with `data=df`.
    sns.boxplot(x=col, data=df, ax=axis)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

## We can observe a lot of outliers here, and they have to be removed

In [None]:
# Draw a separate histogram for every column, printing the column name first
# so each figure can be identified in the output.
for column_name in df.columns:
    print(column_name)
    df[column_name].hist()
    plt.show()

## we can observe class imbalance in quality and skewness in some features

In [None]:
# Per-column skewness, used to decide which features to transform.
df.skew()

## Normalizing data

In [None]:
# Replace the two columns below with their natural logarithm, in place.
# NOTE: not idempotent - re-running this cell applies the log a second time.
for skewed_col in ("residual sugar", "chlorides"):
    df[skewed_col] = np.log(df[skewed_col])

In [None]:
# Re-inspect the per-column skewness after the log transform.
df.skew()

In [None]:
# Redraw the per-column box plots (2 x 6 grid) on the transformed data.
fig, ax = plt.subplots(ncols=6, nrows=2, figsize=(20, 10))
ax = ax.flatten()
# zip pairs every column with its own axes, so no manual index counter is
# needed. It stops at the shorter sequence, so columns beyond the 12 grid
# slots would be left unplotted rather than raising an IndexError.
for col, axis in zip(df.columns, ax):
    # `x=` must be a keyword: newer seaborn treats the first positional
    # argument as `data`, which would clash with `data=df`.
    sns.boxplot(x=col, data=df, ax=axis)
plt.tight_layout(pad=0.5, w_pad=0.7, h_pad=5.0)

## SMOTE method for class imbalance

In [None]:
# Rebuild the feature matrix and target from the transformed frame.
y1 = df["quality"]
X1 = df.drop(columns="quality")

In [None]:
from imblearn.over_sampling import SMOTE

# Hold out the test set BEFORE oversampling. Resampling the full data set
# first lets synthetic points interpolated from (or next to) test rows leak
# into training, which inflates the reported accuracy. The split is stratified
# so every quality level is represented in the training split SMOTE sees.
x_train, x_test, Y_train, Y_test = train_test_split(
    X1, y1, test_size=0.25, random_state=42, stratify=y1
)

# Oversample the training split only; the test split keeps the original class
# distribution, and X1 / y1 are left un-resampled. The seed makes the synthetic
# samples reproducible.
# Assumes every class keeps more than `k_neighbors` rows in the training split.
oversample = SMOTE(k_neighbors=4, random_state=42)
x_train, Y_train = oversample.fit_resample(x_train, Y_train)

In [None]:
# Random forest with default hyperparameters, fitted on x_train / Y_train.
rfc2=RandomForestClassifier()
rfc2.fit(x_train,Y_train)

In [None]:
# Accuracy of rfc2 on x_test / Y_test.
rfc2.score(x_test,Y_test)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble with default hyperparameters: fit on the training
# split, then report accuracy on the test split.
model1 = ExtraTreesClassifier()
model1.fit(x_train, Y_train)
model1.score(x_test, Y_test)

## Achieved an accuracy of 86 percent without removing outliers

## Standardizing and predicting

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training split only, then apply the
# same fitted scaler to both splits. x_train / x_test are overwritten.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)


In [None]:
# Despite the `rfc` prefix, this is an ExtraTreesClassifier; it is fitted and
# scored on the standardized splits.
rfc11=ExtraTreesClassifier()
rfc11.fit(x_train,Y_train)
rfc11.score(x_test,Y_test)

# Achieved 86 percent accuracy on the test set, hurray!

## Rechecking with cross-validation

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the full feature matrix once for cross-validation.
# `fit_transform` already returns the scaled data. Calling `transform` on that
# result again would re-apply the original means and standard deviations to
# values that are already standardized, so it must not be repeated.
sc = StandardScaler()
X1 = sc.fit_transform(X1)


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of rfc11 on X1 / y1; one score per fold.
scores = cross_val_score(estimator=rfc11, X=X1, y=y1, cv=5)

In [None]:
print(scores.mean())