---
---
# 1) Exploratory Data Analysis
This notebook examines the data in more detail. Distinguishing patterns between classes are sought through various groupings and aggregations. Multiple visualizations, including bar plots and heat maps, show pronounced differences between the classes.
* Accelerated runtime not required

---
# 2) Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rcParams
%matplotlib inline

# Notebook-wide figure defaults: light-gray background, 13x5 inch canvas
rcParams.update({
    'figure.facecolor': 'lightgray',
    'figure.figsize': (13, 5),
})

---
# 3) Load & View Data
> *The 'hash' column is not required*

In [None]:
# Read the train, test and label CSV files.
# The 'hash' column is dropped from the training frame right after loading.
train = pd.read_csv('../input/virusmnist/train.csv').drop(columns = 'hash')
test = pd.read_csv('../input/virusmnist/test.csv')
labels = pd.read_csv('../input/virusmnist/trainLabels.csv')

# Preview of the first three training rows (rendered as the cell output)
print('Training set:\n')
train.head(n = 3)

In [None]:
# Preview of the first three test rows (rendered as the cell output)
print('Test set:\n')
test.head(n = 3)

In [None]:
# Plain-text dump of the label lookup frame
labels_report = f'Labels:\n{labels}'
print(labels_report)

In [None]:
# Total number of NaN cells in each frame (per-column counts, then summed)
train_nan_total = train.isna().sum().sum()
test_nan_total = test.isna().sum().sum()

print('Train set missing:', train_nan_total)
print('Test set missing:', test_nan_total)

In [None]:
# Create groupings for visualization: per-class, per-column mean, standard
# deviation and median of the training frame.
# The frame is grouped once and the grouper is reused for all three
# aggregations. The `axis` keyword is omitted on purpose: row-wise grouping is
# the default, and passing `axis` to DataFrame.groupby is deprecated in recent
# pandas releases.
by_label = train.groupby('label')
pixel_means = by_label.mean()
pixel_stds = by_label.std()
pixel_medians = by_label.median()

print(f'Pixel means:\n{pixel_means}\n')
print(f'Pixel standard deviations:\n{pixel_stds}\n')
print(f'Pixel medians:\n{pixel_medians}')

---
# 4) Exploration

In [None]:
# Number of test rows relative to the number of training rows
test_train_ratio = len(test) / len(train)
print('The test/train ratio: ', test_train_ratio)

In [None]:
# Overlay the label distributions of the train and test sets on one axes.
# - density = True: the two sets differ in size, so raw counts are not
#   directly comparable; normalised bars are.
# - shared range: both histograms get identical bin edges even if one set
#   lacks the smallest or largest label.
# - alpha: keeps the first histogram visible underneath the second.
label_span = (min(train['label'].min(), test['label'].min()),
              max(train['label'].max(), test['label'].max()))
hist_kw = dict(bins = 10, range = label_span, density = True,
               ec = 'k', alpha = .6)

fig, ax = plt.subplots()
train['label'].plot.hist(ax = ax, **hist_kw)
test['label'].plot.hist(ax = ax, **hist_kw)
ax.legend(['train', 'test'])
ax.set_xlabel('label')
ax.set_ylabel('Density')
ax.set_title('Train & Test Distribution Comparison')
plt.show()

Train and test sets share similar distributions. 

## Correlation & Covariance

In [None]:
# Pearson r correlation matrix of every column pair; colour scale pinned to [-1, 1]
print('Correlation:')
corr_matrix = train.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, vmin = -1, vmax = 1, ax = ax)
ax.set_title('Pearson r Correlations')
plt.show()

# Covariance matrix of every column pair; colour scale left to seaborn
print('\nCovariance:')
cov_matrix = train.cov()
fig, ax = plt.subplots()
sns.heatmap(cov_matrix, ax = ax)
ax.set_title('Covariances')
plt.show()

## Sums

In [None]:
# Group once, then derive both tables from the same grouper
grouped_by_label = train.groupby('label')

# Per class: mean / min / max / std of the per-column sums
sums = grouped_by_label.sum().T.describe().T[['mean', 'min', 'max', 'std']]

# Per class: non-null row count, taken from the first remaining column
lengths = grouped_by_label.count().iloc[:, 0]

print(f'Lengths:\n{lengths}\n\nSums:\n\n{sums}')

In [None]:
# Two stacked bar charts sharing the x axis:
#   top    - total of all column sums per class
#   bottom - the same total divided by `lengths` (per-class row counts from
#            the previous cell), so class size does not dominate
# The grouped total is computed once and reused for both panels.
class_totals = train.groupby('label').sum().sum(axis = 1)

fig, (ax1, ax2) = plt.subplots(2, 1, tight_layout = True,
                               sharex = True)
class_totals.plot.bar(ec = 'k', ax = ax1)
class_totals.div(lengths).plot.bar(ec = 'k', ax = ax2)

# Label each panel explicitly instead of relying on the "current axes"
ax1.set_ylabel('Original')
ax2.set_ylabel('Corrected')
fig.suptitle('Original & Size Corrected Pixel Sums')
plt.show()

In [None]:
print('Pixel sums by class:')

# mean / min / max of the pixel sums, classes ordered by descending mean
sums_ranked = sums.drop(columns = 'std').sort_values(by = 'mean', ascending = False)
ax = sums_ranked.T.plot.bar(ec = 'k')
ax.set_title('Trends in Aggregate Statistics')
plt.show()

# std of the pixel sums, classes ordered by descending std
sums_std_ranked = sums[['std']].sort_values(by = 'std', ascending = False)
ax = sums_std_ranked.T.plot.bar(ec = 'k')
ax.set_title('Standard Deviation of Pixel Sums')
plt.show()

In [None]:
# Heat map of the per-class column sums (rows: class, columns: train columns)
class_pixel_sums = train.groupby('label').sum()
fig, ax = plt.subplots(figsize = (17, 9))
sns.heatmap(class_pixel_sums, ax = ax)
ax.set_title('Pixel Sums by Class')
plt.show()

## Descriptive Statistics Comparison

> ### Means

In [None]:
# Per class: mean / min / max / std taken across the per-pixel means
means = pixel_means.T.describe().T[['mean', 'min', 'max', 'std']]

print('Pixel means by class:\n')

# mean / min / max, classes ordered by descending mean
means_ranked = means.drop(columns = 'std').sort_values(by = 'mean', ascending = False)
ax = means_ranked.T.plot.bar(ec = 'k')
ax.set_title('Average, Min, and Max of Pixel Means')
plt.show()

# std only, classes ordered by descending std
means_std_ranked = means[['std']].sort_values(by = 'std', ascending = False)
ax = means_std_ranked.T.plot.bar(ec = 'k')
ax.set_title('Standard Deviation of Pixel Means')
plt.show()

In [None]:
# Heat map of the per-class pixel means on a 17x9 inch canvas
fig, ax = plt.subplots(figsize = (17, 9))
sns.heatmap(pixel_means, ax = ax)
ax.set_title('Mean Pixel Value by Class')
plt.show()

> ### Medians

In [None]:
# Per class: mean / max / min / std taken across the per-pixel medians
median_summary = pixel_medians.T.describe().T
medians = median_summary[['mean', 'max', 'min', 'std']]
medians

In [None]:
print('Pixel medians by class:')

# mean / max / min of the pixel medians, classes ordered by descending mean
medians_ranked = medians.drop(columns = 'std').sort_values(by = 'mean', ascending = False)
ax = medians_ranked.T.plot.bar(ec = 'k')
ax.set_title('Trends & Groups in Pixel Medians')
plt.show()

# std of the pixel medians, classes ordered by descending std
medians_std_ranked = medians[['std']].sort_values(by = 'std', ascending = False)
ax = medians_std_ranked.T.plot.bar(ec = 'k')
ax.set_title('Pixel Median Standard Deviation')
plt.show()

In [None]:
# Heat map of the per-class pixel medians on a 25x13 inch canvas
fig, ax = plt.subplots(figsize = (25, 13))
sns.heatmap(pixel_medians, ax = ax)
ax.set_title('Median Pixel Value by Class')
plt.show()

> ### Standard Deviation

In [None]:
# Heat map of the per-class pixel standard deviations on a 25x13 inch canvas
fig, ax = plt.subplots(figsize = (25, 13))
sns.heatmap(pixel_stds, ax = ax)
ax.set_title('Standard Deviations')
plt.show()

* More heatmaps in dedicated notebook

## Distributions

In [None]:
# Helper function
def class_hist(df):
    """Plot the value histogram of one sample row per class.

    For every distinct value in ``df['label']`` (ascending order) the first
    row carrying that label is selected, with the ``label`` column removed,
    and drawn as a 255-bin histogram. The histograms are stacked vertically
    in a single figure sized 13x13 inches, which is then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``label`` column; the remaining columns of the selected
        row supply the histogram values.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    class_ids = np.sort(df['label'].unique())
    n_panels = len(class_ids)
    plt.figure(tight_layout = True)
    # One subplot per class, numbered from 1 as plt.subplot expects
    for position, class_id in enumerate(class_ids, start = 1):
        plt.subplot(n_panels, 1, position)
        class_rows = df.loc[df['label'] == class_id]
        sample_row = class_rows.drop(columns = 'label').iloc[0]
        sample_row.plot.hist(bins = 255, figsize = (13, 13), edgecolor = 'k')
        plt.ylabel(f'Class {class_id}')
    plt.suptitle('Sample Class Histograms')
    plt.show()

In [None]:
# One sample histogram per class, drawn from the training frame
class_hist(df = train)

In [None]:
# Transposed class means: one column per class label, one row per pixel column.
# `tmp` is reused by the per-class plots in the following cells.
tmp = pixel_means.T

# Overlaid histograms of the per-class pixel means.
# sns.displot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it - that would only emit an extra blank figure.
sns.displot(tmp, bins = 255, height = 5, aspect = 2.8, alpha = .5)
plt.title('Pixel Mean Distributions')
plt.show()

In [None]:
print('Distribution of pixel mean values for each class:\n')
# `tmp` has one column per class label. The column label itself goes into the
# title (rather than a running counter), so the caption stays correct even if
# the labels are not exactly 0..n-1.
for class_label in tmp.columns:
    sns.displot(tmp[class_label], bins = 255, kde = True,
                height = 5, aspect = 2)
    plt.xlabel('Pixel Means')
    plt.title('Class ' + str(class_label))
    # Emit each figure as it is built, matching the ECDF cell below
    plt.show()

### Cumulative Distribution Functions

In [None]:
# Empirical CDF of the pixel means, one figure per class.
# `tmp` has one column per class label. The column label itself goes into the
# title (rather than a running counter), so the caption stays correct even if
# the labels are not exactly 0..n-1.
for class_label in tmp.columns:
    sns.displot(tmp[class_label], height = 5, aspect = 2.4, kind = 'ecdf')
    plt.xlabel('Pixel Means')
    plt.title('Class ' + str(class_label))
    plt.show()

## Ranges

In [None]:
# Per class and per column: max minus min, laid out with classes as columns
grouped_by_class = train.groupby('label')
pix_range = grouped_by_class.max().T - grouped_by_class.min().T

# Per class: mean / max / min / std of those ranges
ranges = pix_range.describe().loc[['mean', 'max', 'min', 'std']].T


print('Pixel Range Statistics:\n')
ranges

In [None]:

# mean / max / min of the pixel ranges, classes ordered by descending mean
ranges_ranked = ranges.drop(columns = 'std').sort_values(by = 'mean', ascending = False)
ax = ranges_ranked.T.plot.bar(figsize = (14, 4), ec = 'k')
ax.set_title('Aggregated Statistics')
plt.show()


# std of the pixel ranges, classes ordered by descending std
ranges_std_ranked = ranges[['std']].sort_values(by = 'std', ascending = False)
ax = ranges_std_ranked.T.plot.bar(figsize = (14, 4), ec = 'k')
ax.set_title('Range Standard Deviations')
plt.show()

Class 4 shows a clear difference from the others when ranges are compared. Its pixels do not span the full range of values, and the range varies much more from pixel to pixel than in the other classes. This is reflected in the larger standard deviation seen above.

---
# 5) Conclusion

Distinguishing characteristics are identified for certain comparisons, such as the range of pixel values in class four. CDFs also show differences in how each virus matrix is populated. Some show linear growth while others embody the logistic (sigmoid) function. Another separating factor is the distribution of means for each target. They show a separation in peak mean values which could be exploited for identification. Train and test set proportions are appropriate and should not cause problems in final model evaluation.

The large variation in class size could cause some training confusion. Suggestions to mitigate:
* Stratify on train_test_split
* Bootstrapping to upsample
* Random samples to downsample
* fix_imbalance = True for pycaret
___
___