# Feature extraction

Feature extraction starts from an initial set of measured data and builds derived values (features) that are intended to be informative and non-redundant. It is an important part of data preprocessing: the goal is to extract features from the data that capture as much of its variance as possible.

Feature extraction can help us in dimensionality reduction, denoising of the data, etc.

Feature extraction can be done with the help of Principal Component Analysis (PCA).

The key difference between feature selection and feature extraction is that feature selection keeps a subset of the original features, filtering out the redundant ones, whereas feature extraction creates a brand-new, reduced set of features.

In [None]:
#####Load python library
# Importing pandas to perform operations using DataFrames 
import pandas as pd  

# Importing numpy to perform Matrix operations 
import numpy as np

# Importing matplotlib to plot graphs
import matplotlib.pyplot as plt

In [None]:
# Importing the following libraries for preprocessing
from sklearn.preprocessing import StandardScaler

# Importing the library for PCA
from sklearn.decomposition import PCA as sklearnPCA

# This code will be used to extract features from 'electric_motor data'
1. This code incorporates PCA on 'electric_motor' 
2. Identify the PCs which capture the maximum variability (95% of the variance of the original data)

In [None]:
# Load the electric motor temperature data set from CSV into a DataFrame
csv_path = '../input/electric-motor-temperature/pmsm_temperature_data.csv'
electric_motor_data = pd.read_csv(csv_path)

In [None]:
# Information on the data.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
electric_motor_data.info()
print('\n')

In [None]:
# Flag every missing entry, then count the flagged entries per column
missing_mask = electric_motor_data.isnull()
missing_mask.sum()

In [None]:
# Summary statistics for the columns of the data
electric_motor_data.describe()

In [None]:
# Collect the column names of the original data into a plain list and show them
electric_motor_data_columns_list = [*electric_motor_data.columns]
print(electric_motor_data_columns_list)
print('\n')

In [None]:
# Drop the identifier column 'profile_id' and show the remaining data
electric_motor_data = electric_motor_data.drop(columns=['profile_id'])
print(electric_motor_data)

In [None]:
# Standardise every column: z = (x - mu) / sigma
scaler = StandardScaler()
Input_electric_motor_data_columns_list = list(electric_motor_data.columns)
# Fit the scaler on the selected columns, then write the scaled values back
# into the same columns of the DataFrame
scaled_values = scaler.fit_transform(
    electric_motor_data[Input_electric_motor_data_columns_list]
)
electric_motor_data[Input_electric_motor_data_columns_list] = scaled_values
print(electric_motor_data)

In [None]:
# Covariance matrix of the scaled columns (the scaled data is kept under the
# name 'input_data')
input_data = electric_motor_data.loc[:, Input_electric_motor_data_columns_list]
covariance_matrix = input_data.cov()
print(covariance_matrix)

In [None]:
# Computing eigenvalues and eigenvectors of the covariance matrix.
# A covariance matrix is symmetric, so np.linalg.eigh is used rather than
# np.linalg.eig: eigh is the routine for symmetric matrices and always returns
# real eigenvalues and eigenvectors, whereas eig may return complex values.
# NOTE: the sign of an eigenvector is arbitrary, so individual projected
# features may come out mirrored compared with an eig-based run.
eig_vals, eig_vecs = np.linalg.eigh(
    covariance_matrix[Input_electric_motor_data_columns_list].values
)
# Pair the absolute value of each eigenvalue with its eigenvector, which is
# the matching column of eig_vecs.
eig_pairs = [
    (np.abs(eig_val), eig_vecs[:, i]) for i, eig_val in enumerate(eig_vals)
]
# Sort the pairs in descending order of eigenvalue.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

In [None]:
print('Eigenvalues in descending order:')
# eig_pairs holds (eigenvalue, eigenvector) tuples; only the eigenvalue is shown
for eigenvalue, eigenvector in eig_pairs:
    print(eigenvalue)

In [None]:
# Fraction of the total variance that the retained principal components must
# explain together (0.95 = 95% variance)
threshold = 0.95

In [None]:
# Count how many principal components (PCs) are needed to capture the
# specified share of the variance
print('Explained variance in percentage:\n')
Total_variance = 0.0
count = 0
eigv_sum = np.sum(eig_vals)
for rank, (eigenvalue, eigenvector) in enumerate(eig_pairs, start=1):
    # share of the total variance carried by this component
    variance_explained = (eigenvalue / eigv_sum).real
    print('eigenvalue {}: {}'.format(rank, variance_explained * 100))
    Total_variance += variance_explained
    count = rank
    # leave the loop as soon as the accumulated variance meets the threshold
    if Total_variance >= threshold:
        break
print(Total_variance)

In [None]:
# Number of rows of the eigenvector matrix (the original dimensionality)
# NOTE(review): if this runs as a notebook cell, only the last expression
# (count) is displayed; this value is computed but not shown - confirm intent
len(eig_vecs)
# Number of principal components selected by the variance threshold loop
count

In [None]:
# Build the projection matrix W of shape (d, k): one column for each of the
# first `count` eigenvectors in eig_pairs
n_rows = len(eig_vecs)
reduced_dimension = np.zeros((n_rows, count))
for column in range(count):
    eigenvalue, eigenvector = eig_pairs[column]
    reduced_dimension[:, column] = eigenvector

In [None]:
# Projecting the scaled data onto the reduced space (using eigen vectors):
# (n_samples x d) . (d x k) -> (n_samples x k)
projected_data = electric_motor_data[Input_electric_motor_data_columns_list].values.dot(reduced_dimension)
# Name the columns from the actual number of selected components. A fixed list
# of six names makes the DataFrame constructor fail whenever the variance
# threshold selects a different number of components; with six components the
# generated names are identical to the previous hard-coded ones.
projected_dataframe = pd.DataFrame(
    projected_data,
    columns=['Feature_{}'.format(k + 1) for k in range(projected_data.shape[1])],
)

In [None]:
# Preview the first rows of the projected (reduced-dimension) data
projected_dataframe.head()