# PCA - Wine Data - Logistic Regression 

### Importing Modules

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

### Loading Data

In [2]:
data = pd.read_csv('wine.csv')
#view data
data.head()

Unnamed: 0,Type,Alcohol,Malic,Ash,Alcalinity,Magnesium,Phenols,Flavanoids,Nonflavanoids,Proanthocyanins,Color,Hue,Dilution,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
#shape
data.shape

(178, 14)

In [4]:
#check is any null values
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Type             178 non-null    int64  
 1   Alcohol          178 non-null    float64
 2   Malic            178 non-null    float64
 3   Ash              178 non-null    float64
 4   Alcalinity       178 non-null    float64
 5   Magnesium        178 non-null    int64  
 6   Phenols          178 non-null    float64
 7   Flavanoids       178 non-null    float64
 8   Nonflavanoids    178 non-null    float64
 9   Proanthocyanins  178 non-null    float64
 10  Color            178 non-null    float64
 11  Hue              178 non-null    float64
 12  Dilution         178 non-null    float64
 13  Proline          178 non-null    int64  
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


As seen in the info there are no null or missing values

### Train Test Split

In [5]:
#split data
X = data.iloc[:,0:13].values
y = data.iloc[:,13].values

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### StandardScaler

In [7]:
#Calling Standard Scalar
sc = StandardScaler()

In [8]:
#fit and transform
X_train = sc.fit_transform(X_train)

In [9]:
#transform test
X_test = sc.transform(X_test)

### PCA

In [10]:
#Calling PCA with components 2
pca = PCA(n_components=2)

In [11]:
#Fit_transform
X_train = pca.fit_transform(X_train)

In [12]:
#Transform
X_test = pca.transform(X_test)

In [14]:
#Explained Varience
exp_var = pca.explained_variance_ratio_
exp_var

array([0.40840297, 0.16864436])

### Logistic Regression

In [15]:
#Call Logistic Regression with random state 0
lg =  LogisticRegression(random_state=0)

In [16]:
#Fit X_train and y_train
lg.fit(X_train,y_train)

LogisticRegression(random_state=0)

In [17]:
#Predict 
y_pred = lg.predict(X_test)

In [18]:
y_pred

array([1285,  660, 1035, 1285,  680, 1285, 1285,  515,  562,  520,  520,
        515, 1065, 1035,  660,  562, 1285, 1065,  520, 1150, 1035,  680,
       1285,  562,  520,  562,  680,  520,  625,  560, 1285, 1065,  520,
       1280, 1285, 1035], dtype=int64)

### Confusion Matrix

In [19]:
cm = confusion_matrix(y_test,y_pred)

In [20]:
cm

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)