# Monarch Reflectance Data Cleaning 

#### Purpose: prepare monarch reflectance data for analysis

In [1]:
#import tools for data cleaning
import pandas as pd
import numpy as np

In [2]:
#load the per-wavelength reflectance measurements from the raw csv
raw_csv_path = '../../data/input/monarch_reflectance.csv'
reflectance = pd.read_csv(raw_csv_path)

#eyeball the first 20 records to get a feel for the columns
print(reflectance.iloc[:20])

   ORIGINAL DATA SCAN # RECORDED SCAN # Monarch                   Group Sex  \
0                   NaN             NaN     290    SUMMER BREEDING MALE   M   
1                   NaN             NaN     294  SUMMER BREEDING FEMALE   F   
2                   NaN             NaN     285    SUMMER BREEDING MALE   M   
3                   NaN             NaN     276    SUMMER BREEDING MALE   M   
4                   NaN             NaN     291  SUMMER BREEDING FEMALE   F   
5                   NaN             NaN     282  SUMMER BREEDING FEMALE   F   
6                   NaN             NaN     274  SUMMER BREEDING FEMALE   F   
7                   NaN             NaN     128  SUMMER BREEDING FEMALE   F   
8                   NaN             NaN     164  SUMMER BREEDING FEMALE   F   
9                   NaN             NaN     179  SUMMER BREEDING FEMALE   F   
10                  NaN             NaN    OH65            MIGRANT MALE   M   
11                  NaN             NaN   OH116     

#### The dataset contains several features that we won't use in this analysis:
- ORIGINAL DATA SCAN #
- RECORDED SCAN #
- Monarch
- Group
- INFO
- INFECTED
- NOTES

#### those features are then removed using df.drop()

In [3]:
#narrow the dataset to the categorical field for sex and the reflectance data.
#rows belonging to the 'ORIENTATION TEST' group are excluded before the columns are dropped
not_orientation_test = reflectance['Group'] != 'ORIENTATION TEST'
reflectances = reflectance.loc[not_orientation_test].drop(columns=[
    'ORIGINAL DATA SCAN #', 'RECORDED SCAN #', 'Monarch', 'Group', 'INFO',
    'INFECTED', 'NOTES'])

#preview 10 random rows; the seed is fixed so the displayed sample is identical on every run
reflectances.sample(10, random_state=42)

Unnamed: 0,Sex,338.9,340.4,341.9,343.5,345,346.5,348,349.5,351,...,2498.2,2500.3,2502.4,2504.6,2506.7,2508.9,2511,2513.1,2515.3,2517.4
38,M,8.7,8.333333,7.753333,7.35,6.91,6.513333,6.093333,5.776667,5.426667,...,19.67,19.633333,19.626667,19.606667,19.573333,19.563333,19.563333,19.536667,19.53,19.563333
31,,4.085,4.3625,4.14,3.9125,4.1675,4.26,4.295,4.305,4.235,...,2.2375,2.2425,2.22,2.235,2.22,2.2425,2.24,2.28,2.2725,2.29
3,M,10.0,10.113333,9.636667,8.826667,8.216667,7.236667,6.726667,6.39,6.123333,...,18.023333,18.006667,17.93,17.88,17.853333,17.84,17.803333,17.783333,17.79,17.826667
22,M,9.583333,8.883333,8.46,8.456667,7.983333,7.456667,6.926667,6.48,5.613333,...,22.62,22.603333,22.6,22.616667,22.613333,22.603333,22.596667,22.6,22.583333,22.583333
72,F,7.63,7.443333,7.31,7.126667,6.666667,6.3,6.236667,5.923333,5.686667,...,14.476667,14.466667,14.493333,14.476667,14.443333,14.44,14.43,14.423333,14.383333,14.4
42,M,11.396667,10.99,10.646667,9.98,9.35,8.333333,7.796667,7.18,6.88,...,21.126667,21.093333,21.116667,21.11,21.09,21.086667,21.103333,21.093333,21.036667,21.076667
70,F,7.703333,7.66,7.413333,6.993333,6.15,5.876667,5.956667,5.583333,5.543333,...,14.926667,14.93,14.896667,14.88,14.866667,14.866667,14.853333,14.84,14.846667,14.873333
7,F,6.55,6.36,6.1825,5.9075,5.5975,5.8,5.4025,5.0525,4.8075,...,12.785,12.7525,12.7275,12.7125,12.7025,12.6925,12.6925,12.695,12.6925,12.7125
41,M,9.556667,8.816667,8.22,8.223333,7.62,7.566667,7.26,6.52,5.96,...,16.44,16.396667,16.39,16.383333,16.3,16.256667,16.25,16.233333,16.213333,16.233333
66,F,8.05,7.363333,6.986667,6.796667,6.763333,6.963333,6.516667,5.98,5.613333,...,23.366667,23.36,23.336667,23.336667,23.323333,23.32,23.306667,23.29,23.286667,23.313333


In [4]:
#### with the dataset now containing the reflectances at each wavelength, we need to check for any missing data

#fraction of null values per column (null count / total row count), showing the 10 highest.
#the mean of the boolean isnull() frame is exactly that fraction
print(reflectances.isnull().mean().sort_values(ascending=False).head(10))

Sex      0.030303
790.5    0.000000
807.1    0.000000
805.8    0.000000
804.5    0.000000
803.3    0.000000
802      0.000000
800.7    0.000000
799.5    0.000000
dtype: float64


#### It looks like our label 'Sex' has some missing data (3% missing). The Sex field contains the label we ultimately seek to classify, so records missing a label value will be removed before the analysis.


In [5]:
#discard every record that has at least one missing value
reflectances = reflectances.dropna(axis='index', how='any')

#confirm that no column still holds nulls
remaining_nulls = reflectances.isna().sum()
print(remaining_nulls)

#report (rows, columns) after the drop
print(reflectances.shape)

Sex       0
338.9     0
340.4     0
341.9     0
343.5     0
         ..
2508.9    0
2511      0
2513.1    0
2515.3    0
2517.4    0
Length: 995, dtype: int64
(96, 995)


#### Now, the dataframe is complete and has no null values. The next step is to check for outliers in the reflectance fields

In [6]:
#mean-based outlier detection
#this assumes a normal distribution within each feature, so you could scale the data or pursue another method
from scipy import stats

#set an outlier threshold of 3 standard deviations from the mean
#(defined before first use so every check below shares the same cut-off)
threshold = 3

#every column after the first ('Sex') is a reflectance measurement
features = reflectances.iloc[:, 1:]

#absolute z-scores per column; coerced to a plain ndarray because, depending on the
#SciPy version, zscore may hand back a DataFrame when given one, which would break
#the boolean indexing below
z = np.abs(np.asarray(stats.zscore(features)))

#flat array of the z-scores that exceed the threshold (for inspection)
z_o = z[z > threshold]

#filter out any rows with a z score in a column above threshold
#NOTE: this uses the pandas sample standard deviation (ddof=1), whereas stats.zscore above
#uses the population standard deviation (ddof=0), so the two sets of scores differ slightly
within_threshold = (features - features.mean()).abs() / features.std() < threshold
reflectances_sub_3z = reflectances[within_threshold.all(axis=1)]


In [7]:
#IQR-based outlier detection
thresh_25 = features.quantile(0.25)
thresh_75 = features.quantile(0.75)

#store IQR (the 75th percentile is never below the 25th, so no abs() is needed)
iqr = thresh_75 - thresh_25

#determine outlier thresholds by extending 1.5 * iqr beyond the quartiles:
#above the 75th percentile and below the 25th percentile
outlier_thresh_upper = thresh_75 + (1.5 * iqr)
outlier_thresh_lower = thresh_25 - (1.5 * iqr)

# create mask to denote if each field value is an outlier or not
# strict comparisons: a value sitting exactly on a fence is not an outlier, which also keeps
# a column with iqr == 0 from having every value flagged
outlier_mask = (features > outlier_thresh_upper) | (features < outlier_thresh_lower)

#blank out the outliers (they become NaN) so they can be imputed
features_no_outliers = features.where(~outlier_mask)

#option 1: replace outliers with the column mean (mean computed over all rows)
features_mean_filled = features_no_outliers.fillna(features.mean())

#option 2: replace outliers with the column median (median computed over all rows)
features_median_filled = features_no_outliers.fillna(features.median())

#display the median-filled variant
features_median_filled


Unnamed: 0,338.9,340.4,341.9,343.5,345,346.5,348,349.5,351,352.5,...,2498.2,2500.3,2502.4,2504.6,2506.7,2508.9,2511,2513.1,2515.3,2517.4
0,7.622500,7.537500,7.065000,6.560000,6.262500,5.820000,5.540000,5.150000,4.807500,4.772500,...,9.620000,9.612500,9.607500,9.577500,9.580000,9.562500,9.552500,9.562500,9.570000,9.622500
1,7.160000,6.977500,6.850000,6.595000,6.420000,6.050000,5.675000,5.125000,4.845000,4.642500,...,18.572500,18.545000,18.492500,18.447500,18.457500,18.457500,18.437500,18.410000,18.400000,18.430000
2,9.723333,9.703333,9.376667,8.823333,8.453333,8.223333,7.330000,7.223333,6.716667,6.746667,...,14.566667,14.550000,14.560000,14.540000,14.526667,14.513333,14.486667,14.500000,14.520000,14.543333
3,10.000000,10.113333,9.636667,8.826667,8.216667,7.236667,6.726667,6.390000,6.123333,6.270000,...,18.023333,18.006667,17.930000,17.880000,17.853333,17.840000,17.803333,17.783333,17.790000,17.826667
4,6.903333,6.786667,6.630000,6.163333,5.890000,5.780000,5.473333,5.220000,5.050000,4.810000,...,15.706667,15.683333,15.653333,15.623333,15.616667,15.590000,15.600000,15.580000,15.586667,15.636667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,8.590000,8.306667,8.130000,7.726667,7.936667,7.340000,7.136667,6.576667,6.136667,5.906667,...,25.526667,25.530000,25.483333,25.500000,25.523333,25.486667,25.470000,25.493333,25.543333,25.520000
95,10.693333,9.946667,9.376667,9.190000,9.523333,8.446667,8.333333,7.850000,7.620000,7.333333,...,17.723333,17.740000,17.726667,17.700000,17.700000,17.693333,17.686667,17.680000,17.690000,17.730000
96,9.723333,9.276667,8.986667,8.453333,8.333333,7.890000,7.400000,6.786667,6.360000,6.250000,...,14.400000,14.400000,14.370000,14.363333,14.390000,14.380000,14.360000,14.380000,14.406667,14.430000
97,10.556667,10.213333,9.900000,9.436667,9.046667,8.443333,8.230000,7.946667,7.440000,7.166667,...,16.356667,16.350000,16.320000,16.350000,16.383333,16.393333,16.390000,16.366667,16.356667,16.413333


-----

##### While I could export the data with outliers transformed, the random forest approach is not as sensitive to outliers so I chose to leave them for the modeling stage

-----

In [8]:
#the outlier-filtered variants are not exported: the planned analysis is a random forest,
#which is robust to outliers, so the null-free frame is written out as-is (row index omitted)
cleaned_csv_path = '../../data/input/reflectances_cleaned.csv'
reflectances.to_csv(cleaned_csv_path, index=False)