In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Wine Classification

There are 3 types of wines in this dataset.

The scope of the project is to predict the type of wine from the given data.

The features present in the dataset are:
1. Alcohol
2. Malic acid
3. Ash
4. Alcalinity of ash
5. Magnesium
6. Total phenols
7. Flavanoids
8. Nonflavanoid phenols
9. Proanthocyanins
10. Color intensity
11. Hue
12. OD280/OD315 of diluted wines
13. Proline

In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [None]:
# Load the wine dataset from the Kaggle input directory into a DataFrame
wine_csv_path = '../input/wine-dataset/Wine.csv'
wine_data = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first rows to confirm the file loaded and to see the column layout
wine_data.head()

In total, there are 13 features and one target variable (`Types`).

## Exploratory Data Analysis

In [None]:
# (number of rows, number of columns) of the loaded table
wine_data.shape

In [None]:
# Bar chart of the number of samples in each wine class (target balance check)
sns.countplot(data=wine_data, x='Types');

In [None]:
# Exact number of samples per wine class
wine_data['Types'].value_counts()

In [None]:
# Pairwise scatter plots and per-feature distributions, coloured by wine class.
# `palette` only takes effect when a `hue` variable is supplied, so the target
# column is mapped to hue; this also keeps `Types` itself off the plot axes.
sns.pairplot(data=wine_data, hue='Types', palette='rainbow');

In [None]:
# Distribution plots of all features in the dataset
# Select the features by name (every column except the target) instead of by
# position, so the right columns are plotted wherever `Types` sits in the file.
wine_columns = list(wine_data.columns.drop('Types'))

for feature in wine_columns:
    wine_data[feature].hist()
    plt.title(feature)
    plt.xlabel(feature)
    plt.ylabel('Count')
    plt.show()


**From the plots above, we infer that most features are approximately normally (Gaussian) distributed,
although some of them are noticeably skewed.**

In [None]:
# Box plot of Alcohol per wine class, to spot outliers within each class
sns.boxplot(data=wine_data, x='Types', y='Alcohol');

In [None]:
# Box plot of Malic acid per wine class, to spot outliers within each class
sns.boxplot(data=wine_data, x='Types', y='Malic acid');

In [None]:
# Column labels of the dataset
wine_data.columns

In [None]:
# Univariate outlier check: box plot of Magnesium over the whole dataset
sns.boxplot(data=wine_data, y='Magnesium', color='green');

In [None]:
# Univariate outlier check: box plot of Alcalinity of ash over the whole dataset
sns.boxplot(data=wine_data, y='Alcalinity of ash', color='green');

In [None]:
# Univariate outlier check: box plot of Ash over the whole dataset
sns.boxplot(data=wine_data, y='Ash', color='violet');

From the plots above, we infer that both univariate and multivariate outliers are present in the data,
so the outliers should be treated before modelling.

In [None]:
# Pairwise Pearson correlation between all columns (features and target)
wine_data.corr(method='pearson')

In [None]:
# Show the correlation matrix as an annotated heatmap
correlations = wine_data.corr()
plt.figure(figsize=(10, 7))
sns.heatmap(correlations, annot=True, cmap='rocket', cbar=False);

*From the correlation matrix, we find that there is no strong linear relationship 
between the independent variables and the target variable.*

## Data Cleaning

In [None]:
# Data type of every column
wine_data.dtypes

All of the features are of integer or float type, so we do not need to worry about categorical data types.

In [None]:
# Count the missing values in each column
wine_data.isna().sum()

There are no missing values in the dataset

## Modelling

In [None]:
# Label vector: the target column; feature matrix: every other column
y = wine_data['Types']
X = wine_data.drop(columns='Types')


In [None]:
# Splitting the data into train and test dataset
np.random.seed(42)
from sklearn.model_selection import train_test_split

# Stratified 80/20 split. The seed is passed to the splitter directly so the
# split does not depend on the state of the global NumPy random generator.
X_train,X_test,y_train,y_test = train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42
)

print('After splitting the data ')
print('Shape of train set: ',X_train.shape)
print('Shape of test set: ',X_test.shape)

In [None]:
# Baseline: logistic regression fitted on the current training split
np.random.seed(42)

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained
base_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Mean accuracy on the held-out test split
base_model.score(X_test, y_test)

## Evaluation of the model

In [None]:
def evaluation_metrics(model, X_eval=None, y_eval=None, X_cv=None, y_cv=None, cv=5):
    """Print test accuracy, mean cross-validation score and confusion matrix.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``; it is also handed to ``cross_val_score``.
    X_eval, y_eval : optional
        Hold-out features / labels used for the accuracy score and the
        confusion matrix. Default to the notebook-level ``X_test`` / ``y_test``.
    X_cv, y_cv : optional
        Features / labels used for cross-validation. Default to the
        notebook-level ``X`` / ``y``. Pass these explicitly when the model was
        trained on transformed features, otherwise the cross-validation score
        is computed on different data than the model was fitted on.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    None
        The metrics are printed, not returned.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test
    if X_cv is None:
        X_cv = X
    if y_cv is None:
        y_cv = y

    y_preds = model.predict(X_eval)
    print('Model accuarcy score: ',accuracy_score(y_eval,y_preds))
    print('Model Cross validation score: ',np.mean(cross_val_score(model,X_cv,y_cv,cv=cv)))
    print('confusion Matrix \n',confusion_matrix(y_eval,y_preds))
    

In [None]:
# Print test accuracy, mean 5-fold cross-validation score and confusion matrix for the baseline model
evaluation_metrics(base_model)

Since the features are approximately Gaussian and are measured on different scales, we will apply
standardization to the data and then fit the ML algorithm again.

## Standardization process

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column; the scaling statistics are learned from
# all rows of X. `scaler` stays available as the fitted transformer.
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [None]:
# Splitting the data into train and test dataset
np.random.seed(42)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw features first, then learn the scaling statistics from the
# training rows only. Fitting the scaler on the full dataset before splitting
# lets information about the test rows leak into training.
X_train_raw,X_test_raw,y_train,y_test = train_test_split(
    X,y,test_size=0.2,stratify=y,random_state=42
)

train_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(
    train_scaler.transform(X_train_raw),columns=X.columns,index=X_train_raw.index
)
# The test rows are transformed with the statistics learned from the training rows.
X_test = pd.DataFrame(
    train_scaler.transform(X_test_raw),columns=X.columns,index=X_test_raw.index
)

print('After splitting the data ')
print('Shape of train set: ',X_train.shape)
print('Shape of test set: ',X_test.shape)

In [None]:
# Same logistic regression as before, now fitted on the current (standardized) training split
np.random.seed(42)

from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained
std_model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

# Mean accuracy on the held-out test split
std_model.score(X_test, y_test)

In [None]:
print('Evalaution Metric of the model after applying Standardization...')
evaluation_metrics(std_model)

# Called with only the model, evaluation_metrics cross-validates on the
# notebook-level `X`, which holds the unscaled features, so the cross-validation
# figure printed above does not reflect standardization. Report a score where
# the scaler is re-fitted inside every fold (cross_val_score clones the
# pipeline, so `std_model` itself is left untouched).
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_cv_scores = cross_val_score(make_pipeline(StandardScaler(), std_model), X, y, cv=5)
print('Cross validation score with standardization: ', np.mean(scaled_cv_scores))