In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px 

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)



In [None]:
# Load the dataset from the Kaggle input directory into a DataFrame.
DATA_PATH = '/kaggle/input/winequalityred/winequality-red.csv'
df = pd.read_csv(DATA_PATH)

In [None]:
# Getting to know the data before cleaning it: preview the first rows.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Percentage of missing values per column: the null count is divided by the
# number of rows (multiplying by it, as before, does not yield a percentage).
df.isnull().sum()/len(df)*100

In [None]:
# Pairwise correlation between the columns of df.
df.corr()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().T

In [None]:
# Drop duplicated rows, keeping the first occurrence of each.
# df is rebound to the returned frame instead of using inplace=True, which
# keeps the step explicit and chainable while producing the same rows.
df = df.drop_duplicates(keep="first")

In [None]:
# Grid of pairwise plots for the columns of df.
sns.pairplot(df)

In [None]:
# Histogram of the quality column, coloured by quality value.
px.histogram(df,x="quality",color ="quality")

In [None]:
# Histograms of the columns of df, drawn on a 30 x 30 figure.
df.hist(figsize=(30,30))
plt.show()

In [None]:
# Render the correlation matrix of df as an image (heatmap).
corr_matrix = df.corr()
fig = px.imshow(corr_matrix)
fig.show()

In [None]:
# Strong positive correlations with quality: alcohol, sulphates
# Strong negative correlation with quality: volatile acidity

In [None]:
# Scatter alcohol, sulphates and volatile acidity against quality.
# The three figures are built first and then shown in the same order.
fig1, fig2, fig3 = (
    px.scatter(df, x="quality", y=feature)
    for feature in ("alcohol", "sulphates", "volatile acidity")
)

for figure in (fig1, fig2, fig3):
    figure.show()

In [None]:
# Number of rows for each distinct value of the quality column.
df["quality"].value_counts()

In [None]:
def quality(a, threshold=6):
    """Map a numeric quality score to a binary label.

    Parameters
    ----------
    a : number
        The quality score to classify.
    threshold : number, default 6
        Highest score that still counts as "bad"; anything strictly above
        it is "good".

    Returns
    -------
    str or None
        "bad" when ``a <= threshold``, "good" when ``a > threshold``, and
        None when neither comparison holds (e.g. ``a`` is NaN).
    """
    if a <= threshold:
        return "bad"
    if a > threshold:
        return "good"
    # Unordered values such as NaN satisfy neither comparison; return None
    # explicitly rather than falling off the end of the function.
    return None
    
# Replace each value of the quality column with the label returned by quality().
# NOTE(review): the column is overwritten, so running this cell a second time
# would feed the labels back into quality() — confirm that is acceptable.
df["quality"]=df["quality"].apply(quality)

In [None]:
# Number of rows for each label now stored in the quality column.
df["quality"].value_counts()

In [None]:
from sklearn import preprocessing

# Encode the quality labels as integers, overwriting the column.
le = preprocessing.LabelEncoder()
df["quality"] = le.fit_transform(df["quality"])

# Target is the encoded quality column; features are every other column.
y = df["quality"]
x = df.drop("quality", axis=1)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing, with a fixed seed for reproducibility.
# stratify=y keeps the proportion of each quality class the same in the
# training and test partitions, so the evaluation is not skewed by an
# unlucky split of the classes.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=0, stratify=y
)

In [None]:
# Print the shape of each partition: train/test features, then train/test targets.
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same fitted
# scaler to both partitions.
# Note: X_train / X_test (capital X) are the scaled versions of
# x_train / x_test (lowercase x).
sc = StandardScaler()
sc.fit(x_train)
X_train = sc.transform(x_train)
X_test = sc.transform(x_test)

### Classification Algorithms

In [None]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# confusion_matrix and metrics are imported here and reused by the model
# cells that follow.
from sklearn.metrics import confusion_matrix
from sklearn import metrics

# Fit on the scaled training features; fit() returns the estimator itself.
lr = LogisticRegression(random_state=0).fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = lr.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)

# Confusion matrix on the first line(s), accuracy on the last.
print(cm, acc, sep="\n")

In [None]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Single-neighbour classifier using the Minkowski metric.
knn = KNeighborsClassifier(metric="minkowski", n_neighbors=1)
knn.fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
for result in (cm, acc):
    print(result)

In [None]:
# SVC
from sklearn.svm import SVC

# Support vector classifier with a polynomial kernel; fit() returns the estimator.
svc = SVC(kernel="poly").fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = svc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
print(cm, acc, sep="\n")

In [None]:
# GaussianNB
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = gnb.predict(X_test)
cm, acc = confusion_matrix(y_test, y_pred), metrics.accuracy_score(y_test, y_pred)
print(cm)
print(acc)

In [None]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. random_state is pinned (to the same seed used
# for the train/test split and LogisticRegression) so that a fresh
# "Restart & Run All" reproduces the same tree and the same scores.
dtc = DecisionTreeClassifier(criterion="entropy", random_state=0)
dtc.fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
print(cm)
print(acc)

In [None]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 entropy-based trees. random_state is pinned (to the same seed
# used for the train/test split and LogisticRegression) so that the forest
# and its scores are reproducible across runs.
rfc = RandomForestClassifier(n_estimators=10, criterion="entropy", random_state=0)
rfc.fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
print(cm)
print(acc)

In [None]:
# XGBoost
import xgboost

# XGBoost classifier with default settings; the value returned by fit() is
# kept under its own name, xgb_model.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(X_train, y_train)

# Evaluate on the scaled test features.
y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
acc = metrics.accuracy_score(y_test, y_pred)
print(cm, acc, sep="\n")