In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Why feature extraction?**

* Accuracy improvement
* overfitting risk reduction
* speed up training
* improved data visualization
* to increase explainability of model

# **Importing the required libraries**

In [None]:
import matplotlib.pyplot as plt
import time
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# **Importing the data**

In [None]:
# Read the mushroom CSV from the Kaggle input directory and preview its first rows.
data = pd.read_csv(
    filepath_or_buffer="../input/mushroom-classification/mushrooms.csv",
)
data.head()

# **Missing values**

In [None]:
# Heatmap of the null mask of `data`: any NaN entry would stand out as a
# differently coloured cell. The original author observed no null values.
# NOTE(review): this only detects real NaN values; placeholder strings standing
# in for missing data (if the dataset uses any) would not show up — TODO confirm.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data.isna(), yticklabels=False, ax=ax)

# **Separating features (x) and labels (y)**

In [None]:
# Target: the 'class' column. Features: every other column, one-hot encoded
# (dummy column names are "<column>_<value>").
y = data['class']
x = pd.get_dummies(data.drop(columns='class'), prefix_sep="_")
x.head()

# **Scaling and Encoding**

In [None]:
# Standardise every one-hot feature column (x becomes a NumPy array).
# NOTE(review): the scaler is fitted on all rows, i.e. before forest_test
# splits off its test set.
feature_scaler = StandardScaler()
x = feature_scaler.fit_transform(x)

# Turn the class labels into integer codes.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

**We will define a function that splits the data, trains the model, and calculates the score**

In [None]:
def forest_test(x, y, n_estimators=700, test_size=0.3, random_state=101):
    """Train a random forest on a hold-out split and print its evaluation.

    The data is split into train/test parts, a RandomForestClassifier is
    fitted on the training part, and three things are printed: the CPU time
    spent fitting (``time.process_time`` delta, in seconds), the confusion
    matrix, and the classification report on the test part.

    Parameters
    ----------
    x : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class labels, aligned with the rows of ``x``.
    n_estimators : int, default 700
        Number of trees in the forest.
    test_size : float, default 0.3
        Fraction of the samples held out for testing.
    random_state : int, default 101
        Seed used for both the split and the forest, so that repeated runs
        print the same scores.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )

    start = time.process_time()
    # Seeding the forest makes the comparison between feature-extraction
    # methods reproducible; an unseeded forest gives different scores per run.
    clf = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    ).fit(x_train, y_train)
    print(time.process_time() - start)

    pred = clf.predict(x_test)
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))

In [None]:
# Baseline: random forest on the full scaled one-hot feature matrix.
forest_test(x=x, y=y)

# **model gives 100% accuracy if we use all the features**

# **Feature extraction**

# Principal Component Analysis (PCA)

# It is one of the most widely used linear dimensionality reduction techniques. 
# In PCA we input the original features and try to find the combinations of features that best summarise them.


# **1. we will reduce the dataset into only two features**

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto two principal components.
pca = PCA(n_components=2)
x_pca_2 = pca.fit_transform(x)

# Put the two components next to the original class column, then replace that
# column with its integer encoding so the classes can be selected by code.
pca_df = pd.concat(
    [pd.DataFrame(x_pca_2, columns=["PC1", 'PC2']), data['class']],
    axis=1,
)
pca_df['class'] = LabelEncoder().fit_transform(pca_df['class'])
pca_df.head()

In [None]:
plt.figure(figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')

# One scatter series per encoded class. The series order (1, then 0) has to
# match the order of the legend entries given below.
for encoded_class, point_color in ((1, 'r'), (0, 'b')):
    members = pca_df[pca_df['class'] == encoded_class]
    plt.scatter(members['PC1'], members['PC2'], c=point_color)

plt.title('2D PCA', fontsize=15)
plt.xlabel('principal component 1', fontsize=12)
plt.ylabel('principal component 2', fontsize=12)
plt.legend(['Poisonous', 'Edible'])
plt.grid()

In [None]:
# Evaluate the random forest on the two principal components only.
forest_test(x=x_pca_2, y=y)

# **we got 95% score by using just 2 features**

# **3 Features PCA**

In [None]:
# Repeat the projection with three components, computed with the 'full' SVD solver.
pca = PCA(svd_solver='full', n_components=3)
x_pca = pca.fit_transform(x)

# Variance captured by each of the three retained components.
print(pca.explained_variance_)

# Evaluate the random forest on the three components.
forest_test(x=x_pca, y=y)

In [None]:
import plotly.express as px
# Tabulate the three principal components next to the original class column.
pca_df = pd.DataFrame(data=x_pca, columns=["PC1", 'PC2', 'PC3'])
df = pd.concat([pca_df, data['class']], axis=1)

# Use readable class names in the legend. Only a plotting copy is changed, so
# `df` keeps the raw class codes; codes other than 'p'/'e' are left untouched.
# NOTE(review): assumes 'p' means poisonous and 'e' edible — consistent with
# the legend order used for the 2D plot, but confirm against the dataset docs.
plot_df = df.replace({'class': {'p': 'Poisonous', 'e': 'Edible'}})

# `labels` must be a dict mapping column names to display names; the list that
# was passed before had no effect on the figure.
fig = px.scatter_3d(
    plot_df, x='PC1', y='PC2', z='PC3',
    color='class',
    labels={
        'PC1': 'Principal component 1',
        'PC2': 'Principal component 2',
        'PC3': 'Principal component 3',
        'class': 'Class',
    },
)
fig.show()

# **We got 98% by using 3 features**

In [None]:
from itertools import product

# Decision boundary of a random forest trained on the two principal components.
X_Reduced, X_Test_Reduced, Y_Reduced, Y_Test_Reduced = train_test_split(
    x_pca_2, y, test_size=0.30, random_state=101
)

# Seed the forest so that the plotted boundary is the same on every run.
trainedforest = RandomForestClassifier(
    n_estimators=700, random_state=101
).fit(X_Reduced, Y_Reduced)

# Build a grid covering the training points (with a margin of 1 on every side,
# step 0.1) and predict the class at each grid node.
x_min, x_max = X_Reduced[:, 0].min() - 1, X_Reduced[:, 0].max() + 1
y_min, y_max = X_Reduced[:, 1].min() - 1, X_Reduced[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = trainedforest.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Shade the predicted regions and overlay the training points coloured by label.
plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.4)
plt.scatter(X_Reduced[:, 0], X_Reduced[:, 1], c=Y_Reduced, s=20, edgecolor='k')
plt.xlabel('Principal Component 1', fontsize=12)
plt.ylabel('Principal Component 2', fontsize=12)
plt.title('Random Forest', fontsize=15)
plt.show()

# **Independent Component Analysis (ICA)**

**ICA is a linear dimensionality reduction method that takes a mixture of independent components as input and tries to correctly identify each of them.**

In [None]:
from sklearn.decomposition import FastICA

# Extract three independent components from the scaled features.
# FastICA starts from a random unmixing matrix, so seed it to make the
# components (and the score printed below) reproducible across runs.
ica = FastICA(n_components=3, random_state=101)
x_ica = ica.fit_transform(x)

# Evaluate the random forest on the three independent components.
forest_test(x_ica, y)

# **Linear Discriminant Analysis (LDA)**

**LDA is a supervised dimensionality reduction technique that can also be used as a machine learning classifier**
1. It maximizes the distance between the means of the classes
2. It minimizes the spread within each class

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA is supervised: fitting it on every row would let the labels of the rows
# that are later used for testing leak into the extracted feature. Fit it on
# the training portion only. The split parameters (test_size=0.3,
# random_state=101) are the same ones forest_test uses, so the held-out rows
# here are exactly the rows forest_test evaluates on.
x_lda_train, _, y_lda_train, _ = train_test_split(
    x, y, test_size=0.3, random_state=101
)
lda = LinearDiscriminantAnalysis(n_components=1).fit(x_lda_train, y_lda_train)

# Apply the fitted projection to all rows.
X_lda = lda.transform(x)
print('Original number of features:', x.shape[1])
print('Reduced number of features:', X_lda.shape[1])

In [None]:
# Evaluate the random forest on the single LDA component.
forest_test(x=X_lda, y=y)

In [None]:
# Use LDA itself as the classifier on the one-dimensional LDA feature.
X_Reduced, X_Test_Reduced, Y_Reduced, Y_Test_Reduced = train_test_split(
    X_lda, y, test_size=0.30, random_state=101
)

# Time (CPU seconds) the construction and fit of the classifier.
fit_started = time.process_time()
lda = LinearDiscriminantAnalysis()
lda.fit(X_Reduced, Y_Reduced)
print(time.process_time() - fit_started)

# Score the held-out part.
predictionlda = lda.predict(X_Test_Reduced)
for report in (confusion_matrix, classification_report):
    print(report(Y_Test_Reduced, predictionlda))

# **Locally Linear Embedding**

It is a dimensionality reduction method based on manifold learning, used when the features have a non-linear structure.



In [None]:
from sklearn.manifold import LocallyLinearEmbedding

# Embed the scaled features into three dimensions with locally linear
# embedding, using the dense eigen-solver.
embedding = LocallyLinearEmbedding(eigen_solver='dense', n_components=3)
x_lle = embedding.fit_transform(x)

# Evaluate the random forest on the three LLE components.
forest_test(x=x_lle, y=y)

# **t-Distributed Stochastic Neighbor Embedding (t-SNE)**

It is a non-linear dimensionality reduction technique which is typically used to visualize high-dimensional datasets.

In [None]:
from sklearn.manifold import TSNE
import inspect

# scikit-learn renamed TSNE's `n_iter` argument to `max_iter` (deprecated in
# 1.5, later removed). Pick whichever name the installed version accepts so
# the cell runs on both old and new releases.
tsne_iter_kwarg = (
    'max_iter'
    if 'max_iter' in inspect.signature(TSNE.__init__).parameters
    else 'n_iter'
)

start = time.process_time()
# Seeded so that the embedding (and the score computed from it) is reproducible.
tsne = TSNE(
    n_components=3,
    verbose=1,
    perplexity=40,
    random_state=101,
    **{tsne_iter_kwarg: 300},
)
x_tsne = tsne.fit_transform(x)

# CPU seconds spent on the embedding.
print(time.process_time() - start)

In [None]:
# Evaluate the random forest on the three t-SNE components.
forest_test(x=x_tsne, y=y)

# **Autoencoders**

# Autoencoders are a family of machine learning algorithms that can be used to reduce the dimensionality of a high-dimensional dataset

In [None]:
from keras.layers import Input, Dense
from keras.models import Model

# Single-hidden-layer autoencoder: input -> 3-unit bottleneck -> reconstruction.
input_layer = Input(shape=(x.shape[1],))
encoded = Dense(3, activation='relu')(input_layer)

# The inputs were standardised, so they are unbounded and can be negative.
# A softmax output (non-negative, summing to 1 across features) trained with
# binary cross-entropy cannot reconstruct such values; use a linear output
# layer with mean squared error instead.
decoded = Dense(x.shape[1], activation='linear')(encoded)
autoencoder = Model(input_layer, decoded)
autoencoder.compile(optimizer='adam', loss='mse')

# The network learns to reproduce its input, so the features serve as both
# input and target: y1 is the same data as x1, and y2 the same as x2.
x1, x2, y1, y2 = train_test_split(x, x, test_size=0.3, random_state=101)

autoencoder.fit(x1, y1,
                epochs=100,
                batch_size=300,
                shuffle=True,
                verbose=2,  # one log line per epoch (documented values: 0, 1, 2)
                validation_data=(x2, y2))

# The encoder half maps every sample to its 3-dimensional bottleneck code.
encoder = Model(input_layer, encoded)
x_ae = encoder.predict(x)

In [None]:
# Evaluate the random forest on the three autoencoder bottleneck features.
forest_test(x=x_ae, y=y)