In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Load the wine dataset from CSV and preview the first rows.
df = pd.read_csv(filepath_or_buffer="/content/wine.csv")
df.head(n=5)  # 5 is also head()'s default row count

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [None]:
# Data dictionary for the feature columns. It is a bare string expression, so the
# notebook echoes its repr as the cell output; it is not assigned to any name.
'''
Alcohol: alcohol content, units of ABV (alcohol by volume).
Malic acid: organic acids found in wine
Ash: ash after evaporation and incineration.
Alcalinity of ash: how much base in ash .
Magnesium: affects flavor
Total phenols: affect the taste, color, and mouthfeel (texture) of wine.
Flavoids: type of phenol.

Nonflavoid phenols:type of phenol.

Proanthocyanins: type of phenol.

Color intensity: color intensity

Hue: hue of wine

OD: protein content measurements.

Proline: amino acid
'''

'\nAlcohol: alcohol content, units of ABV (alcohol by volume).\nMalic acid: organic acids found in wine\nAsh: ash after evaporation and incineration.\nAlcalinity of ash: how much base in ash .\nMagnesium: affects flavor\nTotal phenols: affect the taste, color, and mouthfeel (texture) of wine.\nFlavoids: type of phenol.\n\nNonflavoid phenols:type of phenol.\n\nProanthocyanins: type of phenol.\n\nColor intensity: color intensity\n\nHue: hue of wine\n\nOD: protein content measurements.\n\nProline: amino acid\n'

In [None]:
# Distinct values present in each column
print(df.apply(pd.Series.unique))
# Number of distinct values in each column
print(df.nunique())

Wine                                                            [1, 2, 3]
Alcohol                 [14.23, 13.2, 13.16, 14.37, 13.24, 14.2, 14.39...
Malic.acid              [1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.1...
Ash                     [2.43, 2.14, 2.67, 2.5, 2.87, 2.45, 2.61, 2.17...
Acl                     [15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17....
Mg                      [127, 100, 101, 113, 118, 112, 96, 121, 97, 98...
Phenols                 [2.8, 2.65, 3.85, 3.27, 2.5, 2.6, 2.98, 2.95, ...
Flavanoids              [3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.5...
Nonflavanoid.phenols    [0.28, 0.26, 0.3, 0.24, 0.39, 0.34, 0.31, 0.29...
Proanth                 [2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.2...
Color.int               [5.64, 4.38, 5.68, 7.8, 4.32, 6.75, 5.25, 5.05...
Hue                     [1.04, 1.05, 1.03, 0.86, 1.02, 1.06, 1.08, 1.0...
OD                      [3.92, 3.4, 3.17, 3.45, 2.93, 2.85, 3.58, 3.55...
Proline                 [1065, 1050, 1

In [None]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Wine                  178 non-null    int64  
 1   Alcohol               178 non-null    float64
 2   Malic.acid            178 non-null    float64
 3   Ash                   178 non-null    float64
 4   Acl                   178 non-null    float64
 5   Mg                    178 non-null    int64  
 6   Phenols               178 non-null    float64
 7   Flavanoids            178 non-null    float64
 8   Nonflavanoid.phenols  178 non-null    float64
 9   Proanth               178 non-null    float64
 10  Color.int             178 non-null    float64
 11  Hue                   178 non-null    float64
 12  OD                    178 non-null    float64
 13  Proline               178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [None]:
df.isna().sum()  # number of missing values in each column

Wine                    0
Alcohol                 0
Malic.acid              0
Ash                     0
Acl                     0
Mg                      0
Phenols                 0
Flavanoids              0
Nonflavanoid.phenols    0
Proanth                 0
Color.int               0
Hue                     0
OD                      0
Proline                 0
dtype: int64

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the feature columns ('Alcohol' through 'Proline'); 'Wine' is left out.
# NOTE(review): the scaler is fitted on every row, before the train/test split that
# happens later in the notebook, so test rows contribute to the scaling statistics.
feature_frame = df.loc[:, 'Alcohol':'Proline']

scaler = StandardScaler()

# fit() returns the fitted scaler itself, so `scaled_data` is the scaler, not the data
scaled_data = scaler.fit(feature_frame)

# transform() hands back a NumPy array, so the column labels are lost at this point
temp_df = scaled_data.transform(feature_frame)

In [None]:
# Wrap the scaled NumPy array in a DataFrame again, restoring the feature labels
scaled_feature_names = [
    'Alcohol', 'Malic.acid', 'Ash', 'Acl', 'Mg', 'Phenols', 'Flavanoids',
    'Nonflavanoid.phenols', 'Proanth', 'Color.int', 'Hue', 'OD', 'Proline',
]
temp_df = pd.DataFrame(data=temp_df, columns=scaled_feature_names)
print(temp_df)

      Alcohol  Malic.acid       Ash  ...       Hue        OD   Proline
0    1.518613   -0.562250  0.232053  ...  0.362177  1.847920  1.013009
1    0.246290   -0.499413 -0.827996  ...  0.406051  1.113449  0.965242
2    0.196879    0.021231  1.109334  ...  0.318304  0.788587  1.395148
3    1.691550   -0.346811  0.487926  ... -0.427544  1.184071  2.334574
4    0.295700    0.227694  1.840403  ...  0.362177  0.449601 -0.037874
..        ...         ...       ...  ...       ...       ...       ...
173  0.876275    2.974543  0.305159  ... -1.392758 -1.231206 -0.021952
174  0.493343    1.412609  0.414820  ... -1.129518 -1.485445  0.009893
175  0.332758    1.744744 -0.389355  ... -1.612125 -1.485445  0.280575
176  0.209232    0.227694  0.012732  ... -1.568252 -1.400699  0.296498
177  1.395086    1.583165  1.365208  ... -1.524378 -1.428948 -0.595160

[178 rows x 13 columns]


In [None]:
# Attach the class label as the last column; the raw array is assigned positionally,
# so no index alignment takes place
temp_df['Wine'] = df['Wine'].to_numpy()

In [None]:
#remove outlier
def remove_outlier(my_column,df):
  '''
  Drop, in place, every row of `df` whose value in `my_column` is an outlier.

  A value counts as an outlier when it lies more than three standard
  deviations (pandas `Series.std`) away from the column mean. Both limits
  are computed once, before any row is removed.

  input: my_column - label of the column to inspect
         df        - pandas DataFrame to filter; it is modified in place
  return: None

  '''
  # Operate on the frame that was passed in, not on a notebook-level global.
  column_values = df[my_column]
  column_mean = column_values.mean()
  column_std = column_values.std()

  # outliers range = column mean + or - (3 * standard deviation of column)
  upper_limit_outlier = column_mean + (3 * column_std)
  lower_limit_outlier = column_mean - (3 * column_std)

  # Rows beyond either limit are removed together by index label.
  outlier_mask = (column_values > upper_limit_outlier) | (column_values < lower_limit_outlier)
  df.drop(df.index[outlier_mask], inplace = True)

# Filter outliers feature by feature; the last column ('Wine', the label) is skipped.
# Rows are dropped in place, so each later column is checked on the already-reduced frame.
for col in temp_df.columns[:-1]:
    remove_outlier(col, temp_df)

In [None]:
# Separate the class label from the feature matrix
target = temp_df['Wine']
df = temp_df.drop(columns='Wine')  # NOTE: rebinds `df`, replacing the raw frame loaded earlier

In [None]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the class
# proportions similar in both splits, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=10,
    stratify=target,
)

In [None]:
"""

Euclidean distance
Hamming distance
"""
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier: each prediction is a vote among the 5 closest training rows
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Accuracy of the k-NN model on the held-out test split
accuracy_score(y_test, knn.predict(x_test))

0.9705882352941176

# Naive Bayes 
Naive Bayes applies Bayes' theorem to predict the class of a sample, treating the features as independent of each other given the class.
1. Calculate the probability of each feature given class 1 <br>
   and the probability of each feature given class 2.
2. Calculate the prior probability (the initial guess) for both class 1 and class 2: <br>
   * P(class 1) = n of class 1 / (n of class 1 + n of class 2) <br>
   * P(class 2) = n of class 2 / (n of class 1 + n of class 2)
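3. To predict the class of a new sample: <br>
   * multiply the probabilities of all its features under class 1 together with the prior probability of class 1
   * multiply the probabilities of all its features under class 2 together with the prior probability of class 2
   * predict the class whose product is larger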

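**Note** (for count-based features such as words in text): if a word is present in class 1 but not in class 2, or vice versa, use a count of 1 instead of 0 (in practice, add 1 to every count, known as Laplace smoothing),
because if that word appears at test time its zero probability would make the whole product 0.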

4. Numerical stability:
  multiplying many small probabilities gives a number so close to 0 that it underflows <br>
  solution: take the log of each probability
  and add those logs instead of multiplying the probabilities



In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes: fit on the training split, then predict the held-out rows
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)

In [None]:
# Accuracy of the Gaussian Naive Bayes model on the held-out test split
accuracy_score(y_test, gnb.predict(x_test))

0.9705882352941176