In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

In [2]:
#Reading csv
df = pd.read_csv('winequalityN.csv')
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [3]:
#Describe dataframe
df.describe

<bound method NDFrame.describe of        type  fixed acidity  volatile acidity  citric acid  residual sugar  \
0     white            7.0             0.270         0.36            20.7   
1     white            6.3             0.300         0.34             1.6   
2     white            8.1             0.280         0.40             6.9   
3     white            7.2             0.230         0.32             8.5   
4     white            7.2             0.230         0.32             8.5   
...     ...            ...               ...          ...             ...   
6492    red            6.2             0.600         0.08             2.0   
6493    red            5.9             0.550         0.10             2.2   
6494    red            6.3             0.510         0.13             2.3   
6495    red            5.9             0.645         0.12             2.0   
6496    red            6.0             0.310         0.47             3.6   

      chlorides  free sulfur dioxide  tot

In [4]:
#Check for null columns
df.isnull().any()

type                    False
fixed acidity            True
volatile acidity         True
citric acid              True
residual sugar           True
chlorides                True
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                       True
sulphates                True
alcohol                 False
quality                 False
dtype: bool

In [5]:
#Calculating mean
df.mean()

fixed acidity             7.216579
volatile acidity          0.339691
citric acid               0.318722
residual sugar            5.444326
chlorides                 0.056042
free sulfur dioxide      30.525319
total sulfur dioxide    115.744574
density                   0.994697
pH                        3.218395
sulphates                 0.531215
alcohol                  10.491801
quality                   5.818378
dtype: float64

In [6]:
#Filling nan values with mean
df = df.fillna(df.mean())

In [7]:
df.isnull().any()

type                    False
fixed acidity           False
volatile acidity        False
citric acid             False
residual sugar          False
chlorides               False
free sulfur dioxide     False
total sulfur dioxide    False
density                 False
pH                      False
sulphates               False
alcohol                 False
quality                 False
dtype: bool

In [8]:
#features and labels
y = df['quality']
x = df.drop(columns=['quality','type'])

In [9]:
x

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.450000,8.8
1,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.490000,9.5
2,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.440000,10.1
3,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9
4,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.400000,9.9
...,...,...,...,...,...,...,...,...,...,...,...
6492,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.580000,10.5
6493,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.531215,11.2
6494,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.750000,11.0
6495,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.710000,10.2


In [10]:
y

0       6
1       6
2       6
3       6
4       6
       ..
6492    5
6493    6
6494    6
6495    5
6496    6
Name: quality, Length: 6497, dtype: int64

In [11]:
#Test Train split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [12]:
#Regression model
reg = LinearRegression()
reg.fit(x_train,y_train)
y_pred_reg = reg.predict(x_test)

In [22]:
y_pred_reg

array([5.9825045 , 5.50065734, 5.70634711, ..., 5.43966593, 5.35667918,
       5.01844828])

In [13]:
#KNN model
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x_train,y_train)

KNeighborsClassifier(n_neighbors=2)

In [14]:
y_pred = knn.predict(x_test)
y_pred

array([6, 6, 6, ..., 5, 5, 5], dtype=int64)

In [15]:
#Random Forest
random = RandomForestClassifier(max_depth=3, random_state=0)
random.fit(x_train,y_train)

RandomForestClassifier(max_depth=3, random_state=0)

In [16]:
y1_pred = random.predict(x_test)
y1_pred

array([6, 5, 5, ..., 5, 5, 5], dtype=int64)

In [17]:
#SVM
svm = SVC()
svm.fit(x_train,y_train)

SVC()

In [18]:
y2_pred=svm.predict(x_test)
y2_pred

array([6, 6, 6, ..., 6, 6, 6], dtype=int64)

In [19]:
new_y_pred = np.reshape(y_pred,(1300,1))
new_y1_pred = np.reshape(y1_pred,(1300,1))
new_y2_pred = np.reshape(y2_pred,(1300,1))


In [20]:
new_df = pd.DataFrame(new_y_pred)
new_df

Unnamed: 0,0
0,6
1,6
2,6
3,5
4,5
...,...
1295,5
1296,5
1297,5
1298,5


In [21]:
new_df['RandomForest'] = new_y1_pred
new_df['SVM'] = new_y2_pred
new_df

Unnamed: 0,0,RandomForest,SVM
0,6,6,6
1,6,5,6
2,6,5,6
3,5,5,6
4,5,6,6
...,...,...,...
1295,5,5,6
1296,5,5,6
1297,5,5,6
1298,5,5,6
