# Continued EDA: Heatmaps

This notebook contains multiple heatmaps for exploring trends in the VMNIST examples.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

# Notebook-wide figure defaults: light-gray canvas and a 13x7 inch figure.
rcParams.update({
    'figure.facecolor': 'lightgray',
    'figure.figsize': (13, 7),
})

In [None]:
# Read the training CSV, then split it into identifier columns, the feature
# matrix (everything except 'label' and 'hash') and the target vector.
train = pd.read_csv('../input/virusmnist/train.csv')
id_columns = ['label', 'hash']
names = train[id_columns]
X = train.drop(columns = id_columns)
y = train['label']
print(X.shape)

# Hold out 10% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_tr, X_ts, y_tr, y_ts = train_test_split(
    X, y,
    test_size = .1,
    random_state = 42,
    stratify = y,
)
print(X_tr.shape, X_ts.shape)

In [None]:
# Remove the 'hash' identifier so only 'label' and the feature columns remain
# for the per-class aggregations below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once 'hash' is gone.
train = train.drop(columns = 'hash', errors = 'ignore')

In [None]:
# Per-class summary statistics: one row per label, one column per feature.
# The explicit `axis = 0` argument was dropped because the `axis` keyword of
# DataFrame.groupby is deprecated (FutureWarning since pandas 2.1); grouping
# rows is the default. The groupby object is built once and reused.
by_label = train.groupby('label')
pixel_means = by_label.mean()
pixel_stds = by_label.std()
pixel_medians = by_label.median()
pixel_medians

In [None]:
# Heatmap of the per-class means (rows = labels, columns = features).
ax = sns.heatmap(pixel_means)
ax.set_title('Pixel Means')
plt.show()

In [None]:
# Heatmap of the per-class standard deviations.
ax = sns.heatmap(pixel_stds)
ax.set_title('Pixel Standard Deviations')
plt.show()

In [None]:
# Heatmap of the per-class medians.
ax = sns.heatmap(pixel_medians)
ax.set_title('Pixel Medians')
plt.show()

In [None]:

# Running maximum of the per-class means, accumulated down the label rows.
# Reuses `pixel_means` instead of recomputing the groupby (which also passed
# the deprecated `axis` keyword), and ends with plt.show() like the other
# plotting cells so the cell does not echo the Text(...) title object.
sns.heatmap(pixel_means.cummax())
plt.title('Cumulative Max of Pixel Means')
plt.show()

In [None]:

# Running minimum of the per-class means, accumulated down the label rows.
# Reuses `pixel_means` rather than rebuilding the groupby with the deprecated
# `axis` keyword.
sns.heatmap(pixel_means.cummin())
plt.title('Cumulative Min of Pixel Means')
plt.show()

In [None]:

# Standard error of the mean of every feature column within each class.
# `axis = 0` was removed: the `axis` keyword of DataFrame.groupby is
# deprecated and row-wise grouping is the default.
pixel_sems = train.groupby('label').sem()
sns.heatmap(pixel_sems)
plt.title('Pixel SEM')
plt.show()

In [None]:

# Fractional change of the per-class SEM from one label row to the next
# (the first row has no predecessor, so it is NaN).
# `axis = 0` was removed: the `axis` keyword of DataFrame.groupby is
# deprecated and row-wise grouping is the default.
# NOTE(review): the colour scale is limited to [0, 1], so decreases and
# changes above 100% are clipped to the ends of the scale - confirm intended.
sem_pct_change = train.groupby('label').sem().pct_change()
sns.heatmap(sem_pct_change, vmin = 0.0, vmax = 1.0)
plt.title('Percent Change of SEM')
plt.show()

In [None]:

# Fractional change of the per-class mean from one label row to the next
# (the first row has no predecessor, so it is NaN).
# Reuses `pixel_means` rather than rebuilding the groupby with the deprecated
# `axis` keyword.
# NOTE(review): the colour scale is limited to [0, 1], so decreases and
# changes above 100% are clipped to the ends of the scale - confirm intended.
sns.heatmap(pixel_means.pct_change(), vmin = 0.0, vmax = 1.0)
plt.title('Percent Change of Mean')
plt.show()

In [None]:
# Pairwise Pearson correlation between all feature columns of X.
correlations = X.corr(method = 'pearson')
correlations

In [None]:

# Full feature-by-feature correlation matrix, colour scale fixed to [-1, 1].
ax = sns.heatmap(correlations, vmin = -1.0, vmax = 1.0)
ax.set_title('Pearson r Correlation')
plt.show()

In [None]:

# Zoom in on the bottom-right block of the correlation matrix: rows and
# columns from position 750 onward.
corner_start = 750
lower_right = correlations.iloc[corner_start:, corner_start:]
ax = sns.heatmap(lower_right, vmin = -1.0, vmax = 1.0)
ax.set_title('Lower Right of Correlations Heatmap')
plt.show()

In [None]:
# Order the examples by class so rows with the same label are contiguous.
# drop = True discards the old row index; a plain reset_index() would insert
# it as an extra 'index' column, which would then be treated as a feature by
# .corr() below and by any heatmap drawn from train_sorted.
train_sorted = train.sort_values(by = 'label').reset_index(drop = True)

# Feature-by-feature correlations computed on the sorted frame (label removed).
train_sorted_corr = train_sorted.drop('label', axis = 1).corr()
train_sorted_corr

In [None]:
# Display the label-sorted frame as the cell output for a visual check.
train_sorted

In [None]:
# Heatmap of the label-sorted rows (label column removed) on a large square
# canvas, with the colour scale pinned to [0, 255].
fig, ax = plt.subplots(figsize = (25, 25))
sorted_features = train_sorted.drop('label', axis = 1)
sns.heatmap(sorted_features, vmin = 0.0, vmax = 255.0, ax = ax)
ax.set_title('Grouped by Class')
plt.show()

In [None]:

# Correlations computed from the label-sorted frame; the colour map is centred
# on zero and limited to [-1, 1].
ax = sns.heatmap(train_sorted_corr, vmin = -1.0, vmax = 1.0, center = 0.0)
ax.set_title('Sorted Correlations')
plt.show()

---

# Conclusion

The heatmaps show characteristics that make each class visually identifiable. This will likely translate into the best model's ability to learn the individual characteristics that distinguish the classes. It may be possible to find the minimum number of required features by pruning to leave only the most relevant predictive features.

## *Next Steps* 
* Feature engineering and pruning
 * Find minimum required features and best model with pycaret
 * Test simple neural network architectures with minimum required features
 
---
---