In [1]:
# to figure out the most important features of a model prediction we use Principal Component Analysis

In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
digits = load_digits()

In [4]:
ip_df = pd.DataFrame(digits.data, columns=digits.feature_names)
op_df = pd.DataFrame(digits.target, columns=['target'])

In [5]:
# scale the values before using pca
from sklearn.preprocessing import StandardScaler
scaled_ip_df = StandardScaler().fit_transform(ip_df)

In [6]:
from sklearn.model_selection import train_test_split
ip_train, ip_test, op_train, op_test = train_test_split(scaled_ip_df, op_df, test_size=0.2, random_state=30)

In [7]:
lr_model = LogisticRegression()
lr_model.fit(ip_train, op_train)
lr_model.score(ip_test, op_test)

  y = column_or_1d(y, warn=True)


0.9722222222222222

In [8]:
pca = PCA(0.95) # retains 95% of useful features
# pca = PCA(n_components=50) # will give 50 combines features

ip_pca = pca.fit_transform(scaled_ip_df)

In [11]:
ip_pca[0]

array([-1.91421366, -0.95450157, -3.94603482, -2.02872332, -0.2671728 ,
        0.53032688, -1.41532079,  1.49606164,  0.12491434, -0.82224561,
       -0.49969268, -0.78946619,  0.22812496, -0.19514231,  0.83704037,
        0.10434134,  0.18532308, -0.09005675,  0.41275404,  0.43051695,
        0.45099368,  0.55870308,  0.50882594, -0.75889619, -0.46450005,
        0.732176  , -0.32731796, -0.15702206,  0.40491755, -0.32459432,
        0.53468255,  0.01797902, -0.04795038, -0.01912424, -0.11718993,
       -0.70904873, -0.08241012,  0.81405925,  0.0249306 , -0.32193146])

In [12]:
# now we have 40 columns from 64

In [13]:
ip_train, ip_test, op_train, op_test = train_test_split(ip_pca, op_df, test_size=0.2, random_state=30)

In [14]:
lr_model = LogisticRegression()
lr_model.fit(ip_train, op_train)
lr_model.score(ip_test, op_test)

  y = column_or_1d(y, warn=True)


0.9638888888888889

# Load Breast Cancer

In [15]:
from sklearn.datasets import load_breast_cancer

In [16]:
cancer = load_breast_cancer()
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [19]:
features = cancer.feature_names
for i in range(len(features)):
    features[i] = features[i].replace(' ', '_')
features

array(['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
       'mean_smoothness', 'mean_compactness', 'mean_concavity',
       'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
       'radius_error', 'texture_error', 'perimeter_error', 'area_error',
       'smoothness_error', 'compactness_error', 'concavity_error',
       'concave_points_error', 'symmetry_error',
       'fractal_dimension_error', 'worst_radius', 'worst_texture',
       'worst_perimeter', 'worst_area', 'worst_smoothness',
       'worst_compactness', 'worst_concavity', 'worst_concave_points',
       'worst_symmetry', 'worst_fractal_dimension'], dtype='<U23')

In [20]:
ip_df = pd.DataFrame(cancer.data, columns=features)
op_df = pd.DataFrame(cancer.target, columns=['target'])

In [26]:
ip_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean_radius              569 non-null    float64
 1   mean_texture             569 non-null    float64
 2   mean_perimeter           569 non-null    float64
 3   mean_area                569 non-null    float64
 4   mean_smoothness          569 non-null    float64
 5   mean_compactness         569 non-null    float64
 6   mean_concavity           569 non-null    float64
 7   mean_concave_points      569 non-null    float64
 8   mean_symmetry            569 non-null    float64
 9   mean_fractal_dimension   569 non-null    float64
 10  radius_error             569 non-null    float64
 11  texture_error            569 non-null    float64
 12  perimeter_error          569 non-null    float64
 13  area_error               569 non-null    float64
 14  smoothness_error         5

In [29]:
ip_train, ip_test, op_train, op_test = train_test_split(ip_df, op_df, test_size=0.2, random_state=30)

In [30]:
cancer_model = LogisticRegression(max_iter=10000)
cancer_model.fit(ip_train, op_train)
cancer_model.score(ip_test, op_test)

  y = column_or_1d(y, warn=True)


0.8947368421052632

In [31]:
# use PCA

pca = PCA(0.95)
ip_pca = pca.fit_transform(ip_df)

ip_train, ip_test, op_train, op_test = train_test_split(ip_pca, op_df, test_size=0.2, random_state=30)

In [32]:
cancer_model = LogisticRegression(max_iter=10000)
cancer_model.fit(ip_train, op_train)
cancer_model.score(ip_test, op_test)

  y = column_or_1d(y, warn=True)


0.9122807017543859

In [33]:
# somehow the score increased after using PCA, noice