# Monarch Reflectance Data Cleaning 

#### Purpose: prepare monarch reflectance data for analysis

In [29]:
#import tools for data cleaning
import pandas as pd
import numpy as np

In [30]:
# Load the raw table of reflectance measurements across wavelengths.
reflectance = pd.read_csv(
    filepath_or_buffer='../../data/input/monarch_reflectance.csv')

# Visual inspection: print the first 20 records to check how the file parsed.
print(reflectance.iloc[:20])

   ORIGINAL DATA SCAN # RECORDED SCAN # Monarch                   Group Sex  \
0                   NaN             NaN     290    SUMMER BREEDING MALE   M   
1                   NaN             NaN     294  SUMMER BREEDING FEMALE   F   
2                   NaN             NaN     285    SUMMER BREEDING MALE   M   
3                   NaN             NaN     276    SUMMER BREEDING MALE   M   
4                   NaN             NaN     291  SUMMER BREEDING FEMALE   F   
5                   NaN             NaN     282  SUMMER BREEDING FEMALE   F   
6                   NaN             NaN     274  SUMMER BREEDING FEMALE   F   
7                   NaN             NaN     128  SUMMER BREEDING FEMALE   F   
8                   NaN             NaN     164  SUMMER BREEDING FEMALE   F   
9                   NaN             NaN     179  SUMMER BREEDING FEMALE   F   
10                  NaN             NaN    OH65            MIGRANT MALE   M   
11                  NaN             NaN   OH116     

#### The dataset contains several features that we won't use in this analysis:
- ORIGINAL DATA SCAN #
- RECORDED SCAN #
- Monarch
- Group
- INFO
- INFECTED
- NOTES

#### Those features are then removed using df.drop(). Records belonging to the 'ORIENTATION TEST' group are excluded as well.

In [31]:
# Narrow the dataset to the categorical 'Sex' field plus the reflectance data:
# exclude the 'ORIENTATION TEST' records, then remove the columns that are
# not used in this analysis.
unused_columns = [
    'ORIGINAL DATA SCAN #', 'RECORDED SCAN #', 'Monarch', 'Group', 'INFO',
    'INFECTED', 'NOTES',
]
not_orientation_test = reflectance['Group'].ne('ORIENTATION TEST')
reflectances = (
    reflectance
    .loc[not_orientation_test]
    .drop(columns=unused_columns)
)

# Spot check: display ten randomly chosen rows.
reflectances.sample(n=10)

Unnamed: 0,Sex,338.9,340.4,341.9,343.5,345,346.5,348,349.5,351,...,2498.2,2500.3,2502.4,2504.6,2506.7,2508.9,2511,2513.1,2515.3,2517.4
16,M,10.696667,10.523333,9.636667,8.946667,8.913333,8.66,7.83,7.126667,6.463333,...,24.836667,24.796667,24.813333,24.826667,24.793333,24.773333,24.81,24.786667,24.72,24.833333
86,F,6.78,6.806667,6.586667,5.766667,5.673333,5.706667,5.236667,5.193333,4.44,...,25.333333,25.343333,25.37,25.353333,25.313333,25.29,25.31,25.356667,25.3,25.333333
59,F,5.503333,5.333333,5.246667,5.333333,5.39,4.92,4.596667,4.59,4.456667,...,21.053333,21.04,21.06,21.023333,21.006667,21.02,21.033333,21.043333,21.033333,21.076667
36,M,8.826667,8.453333,8.1,7.893333,7.62,7.086667,6.72,5.696667,5.503333,...,13.52,13.503333,13.476667,13.453333,13.44,13.44,13.436667,13.43,13.43,13.43
75,F,7.766667,7.776667,7.793333,7.436667,7.37,6.506667,5.983333,5.783333,5.616667,...,22.706667,22.663333,22.66,22.64,22.623333,22.626667,22.61,22.586667,22.553333,22.613333
88,F,8.986667,9.09,8.96,8.546667,7.47,7.193333,6.863333,6.48,6.366667,...,21.833333,21.823333,21.826667,21.843333,21.83,21.81,21.82,21.793333,21.736667,21.726667
89,F,8.726667,8.246667,7.82,7.846667,7.613333,7.193333,6.993333,6.3,5.873333,...,22.91,22.916667,22.933333,22.92,22.893333,22.903333,22.933333,22.94,22.893333,22.91
51,M,8.703333,8.453333,7.986667,7.676667,6.91,6.416667,6.45,5.94,5.653333,...,15.286667,15.263333,15.256667,15.273333,15.24,15.266667,15.306667,15.28,15.23,15.31
50,M,9.803333,9.783333,9.03,8.33,7.723333,7.566667,6.9,6.273333,6.19,...,13.486667,13.47,13.483333,13.47,13.413333,13.42,13.443333,13.44,13.42,13.466667
44,M,11.396667,11.23,10.88,10.31,9.656667,9.003333,8.513333,8.17,7.646667,...,16.073333,16.016667,16.03,16.046667,16.013333,15.986667,16.013333,15.993333,15.96,15.976667


In [32]:
#### with the dataset now containing the reflectances at each wavelength, we need to check for any missing data

# Fraction of missing values per column: the number of nulls divided by the
# total number of rows (the mean of the boolean null mask is exactly that).
null_fraction = reflectances.isnull().mean()

# Show the 10 columns with the highest missing fraction.
# (The previous slice [0:9] returned only 9 rows.)
print(null_fraction.sort_values(ascending=False).head(10))

Sex      0.030303
790.5    0.000000
807.1    0.000000
805.8    0.000000
804.5    0.000000
803.3    0.000000
802      0.000000
800.7    0.000000
799.5    0.000000
dtype: float64


#### It looks like our label 'Sex' has some missing data (about 3% of records). The 'Sex' field contains the label we ultimately seek to classify, so records missing a label value will be removed for the analysis.


In [33]:
# Discard every record (row) that has a missing value in at least one field.
reflectances = reflectances.dropna(axis='index', how='any')

# Verify the result: per-column count of remaining missing values.
remaining_nulls = reflectances.isna().sum()
print(remaining_nulls)

# Report the (rows, columns) dimensions of the cleaned frame.
print(reflectances.shape)

Sex       0
338.9     0
340.4     0
341.9     0
343.5     0
         ..
2508.9    0
2511      0
2513.1    0
2515.3    0
2517.4    0
Length: 995, dtype: int64
(96, 995)


#### Now, the dataframe is complete and has no null values. The next step is to check for outliers in the reflectance fields

In [34]:
#mean-based outlier detection
#this assumes a normal distribution within each feature, so you could scale the data or pursue another method
from scipy import stats

#set an outlier threshold of 3 standard deviations from the mean
#(defined before first use so every check below shares the same cut-off)
threshold = 3

#every column except the first one ('Sex') holds reflectance measurements
features = reflectances.iloc[:, 1:]

#absolute z-score of each value within its column, and the flat collection of
#scores that exceed the threshold (for inspection)
#note: scipy's zscore uses the population standard deviation (ddof=0)
z = np.abs(stats.zscore(features))
z_o = z[np.where(z > threshold)]

#filter out any rows with a z score in a column at or above the threshold
#note: pandas' std() uses the sample standard deviation (ddof=1), so these
#scores are marginally smaller than the scipy ones stored in `z`
within_threshold = (features - features.mean()).abs().div(features.std()) < threshold
reflectances_sub_3z = reflectances[within_threshold.all(axis=1)]


In [35]:
#IQR-based outlier detection
#first and third quartile of every reflectance column
thresh_25 = features.quantile(0.25)
thresh_75 = features.quantile(0.75)

#store IQR (the third quartile is never below the first, so no abs() is needed)
iqr = thresh_75 - thresh_25

#determine outlier thresholds (fences): 1.5 * IQR above the third quartile
#and 1.5 * IQR below the first quartile
outlier_thresh_upper = thresh_75 + (1.5 * iqr)
outlier_thresh_lower = thresh_25 - (1.5 * iqr)

#create mask to denote if each field value is an outlier or not
#strict comparisons: a value sitting exactly on a fence is not an outlier, which
#also keeps a column with IQR == 0 from having every value flagged
outlier_mask = (features > outlier_thresh_upper) | (features < outlier_thresh_lower)

#blank out the outliers (they become NaN) so they can be imputed
non_outliers = features.where(~outlier_mask)

#option 1: fill with the column mean (computed on the full column, outliers included)
features_mean_filled = non_outliers.fillna(features.mean())

#option 2: fill with the column median (computed on the full column, outliers included)
features_median_filled = non_outliers.fillna(features.median())

#display the median-filled version
features_median_filled


Unnamed: 0,338.9,340.4,341.9,343.5,345,346.5,348,349.5,351,352.5,...,2498.2,2500.3,2502.4,2504.6,2506.7,2508.9,2511,2513.1,2515.3,2517.4
0,7.622500,7.537500,7.065000,6.560000,6.262500,5.820000,5.540000,5.150000,4.807500,4.772500,...,9.620000,9.612500,9.607500,9.577500,9.580000,9.562500,9.552500,9.562500,9.570000,9.622500
1,7.160000,6.977500,6.850000,6.595000,6.420000,6.050000,5.675000,5.125000,4.845000,4.642500,...,18.572500,18.545000,18.492500,18.447500,18.457500,18.457500,18.437500,18.410000,18.400000,18.430000
2,9.723333,9.703333,9.376667,8.823333,8.453333,8.223333,7.330000,7.223333,6.716667,6.746667,...,14.566667,14.550000,14.560000,14.540000,14.526667,14.513333,14.486667,14.500000,14.520000,14.543333
3,10.000000,10.113333,9.636667,8.826667,8.216667,7.236667,6.726667,6.390000,6.123333,6.270000,...,18.023333,18.006667,17.930000,17.880000,17.853333,17.840000,17.803333,17.783333,17.790000,17.826667
4,6.903333,6.786667,6.630000,6.163333,5.890000,5.780000,5.473333,5.220000,5.050000,4.810000,...,15.706667,15.683333,15.653333,15.623333,15.616667,15.590000,15.600000,15.580000,15.586667,15.636667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,8.590000,8.306667,8.130000,7.726667,7.936667,7.340000,7.136667,6.576667,6.136667,5.906667,...,25.526667,25.530000,25.483333,25.500000,25.523333,25.486667,25.470000,25.493333,25.543333,25.520000
95,10.693333,9.946667,9.376667,9.190000,9.523333,8.446667,8.333333,7.850000,7.620000,7.333333,...,17.723333,17.740000,17.726667,17.700000,17.700000,17.693333,17.686667,17.680000,17.690000,17.730000
96,9.723333,9.276667,8.986667,8.453333,8.333333,7.890000,7.400000,6.786667,6.360000,6.250000,...,14.400000,14.400000,14.370000,14.363333,14.390000,14.380000,14.360000,14.380000,14.406667,14.430000
97,10.556667,10.213333,9.900000,9.436667,9.046667,8.443333,8.230000,7.946667,7.440000,7.166667,...,16.356667,16.350000,16.320000,16.350000,16.383333,16.393333,16.390000,16.366667,16.356667,16.413333


-----

##### While I could export the data with outliers transformed, the random forest approach is not as sensitive to outliers, so I chose to leave them in for the modeling stage.

-----

In [36]:
# Although the outlier-filtered variants could be used, the planned analysis
# (random forest) is robust to outliers; therefore the cleaned frame is
# exported with the outliers left in, and without the row index.
reflectances.to_csv(
    path_or_buf='../../data/input/reflectances_cleaned.csv',
    index=False,
)