In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
import glob
import re
import toolz.curried as tl

In [6]:
DATA_FILES = glob.glob(os.path.join(os.getcwd(), "data", "original_data", "*.xls*"))

In [24]:
print(*map(os.path.basename, DATA_FILES))

Madison_Rainfed_2.xls Limestone_Complete_Sorted.xlsm Houston_Complete_Sorted.xlsm Madison_Irrigated_2.xls


Quickly looking through the data in excel we can see that madison is separated into two files for rainfed and irrigated where as houston and limestone have a column for a flag to tell if it is rainfed or irrigated. First step is to get all the files to have similar formats so I will aggregate the two madison files into one.

In [51]:
madison_rf_df = pd.read_excel(DATA_FILES[0])
limestone_df = pd.read_excel(DATA_FILES[1])
houston_df = pd.read_excel(DATA_FILES[2])
madison_irr_df = pd.read_excel(DATA_FILES[3])

In [41]:
rf_cols = sorted(madison_rf_df.columns)
irr_cols = sorted(madison_irr_df.columns)
for i in range(len(rf_cols)):
    if rf_cols[i] != irr_cols[i]:
        print(f"index {i}\n{rf_cols[i]} != {irr_cols[i]}")

In [102]:
madison_rf_df["rf_irr"] = 0
madison_irr_df["rf_irr"] = 1

In [103]:
madison_rf_df.head()

Unnamed: 0,Longitude,Latitude,MOD13Q1_006__250m_16_days_EVI_doy2011081_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012081_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011097_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011113_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011129_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011145_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011161_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011177_aid0001_tif,...,MOD13Q1_006__250m_16_days_EVI_doy2012113_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012129_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012145_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012161_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012177_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012193_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012209_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012225_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012241_aid0001_tif,rf_irr
0,-86.779999,34.990601,3413,3747,4421,3276,2311,2988,2734,4211,...,3416,2782,4794,5903,4964,5358.0,5287.0,3766,3680.0,0
1,-86.777496,34.990601,3281,3747,4421,3276,2445,3072,3030,4211,...,3400,3202,5065,4745,4172,4764.0,4815.0,3797,3916.0,0
2,-86.764801,34.990601,4045,4696,4981,4962,3469,4083,3838,5155,...,4230,3126,4240,5217,3612,5986.0,5905.0,4957,3974.0,0
3,-86.757202,34.990601,4167,5114,5000,3959,4367,3958,3031,3781,...,4808,4320,5187,4801,4535,5090.0,4958.0,4200,3011.0,0
4,-86.7546,34.990601,4256,5114,5000,3822,3801,2854,2302,3863,...,3525,3904,5480,5388,4324,4456.0,4866.0,3261,3443.0,0


In [104]:
madison_df = pd.concat([madison_rf_df, madison_irr_df])
madison_df["county"] = "Madison"

In [105]:
madison_rf_df.info()
madison_irr_df.info()
madison_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5053 entries, 0 to 5052
Data columns (total 25 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Longitude                                             5053 non-null   float64
 1   Latitude                                              5053 non-null   float64
 2   MOD13Q1_006__250m_16_days_EVI_doy2011081_aid0001_tif  5053 non-null   int64  
 3   MOD13Q1_006__250m_16_days_EVI_doy2012081_aid0001_tif  5053 non-null   int64  
 4   MOD13Q1_006__250m_16_days_EVI_doy2011097_aid0001_tif  5053 non-null   int64  
 5   MOD13Q1_006__250m_16_days_EVI_doy2011113_aid0001_tif  5053 non-null   int64  
 6   MOD13Q1_006__250m_16_days_EVI_doy2011129_aid0001_tif  5053 non-null   int64  
 7   MOD13Q1_006__250m_16_days_EVI_doy2011145_aid0001_tif  5053 non-null   int64  
 8   MOD13Q1_006__250m_16_days_EVI_doy2011161_aid0001_tif  5053

In [106]:
madison_df.head()

Unnamed: 0,Longitude,Latitude,MOD13Q1_006__250m_16_days_EVI_doy2011081_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012081_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011097_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011113_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011129_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011145_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011161_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2011177_aid0001_tif,...,MOD13Q1_006__250m_16_days_EVI_doy2012129_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012145_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012161_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012177_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012193_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012209_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012225_aid0001_tif,MOD13Q1_006__250m_16_days_EVI_doy2012241_aid0001_tif,rf_irr,county
0,-86.779999,34.990601,3413,3747,4421,3276,2311,2988,2734,4211,...,2782,4794,5903,4964,5358.0,5287.0,3766,3680.0,0,Madison
1,-86.777496,34.990601,3281,3747,4421,3276,2445,3072,3030,4211,...,3202,5065,4745,4172,4764.0,4815.0,3797,3916.0,0,Madison
2,-86.764801,34.990601,4045,4696,4981,4962,3469,4083,3838,5155,...,3126,4240,5217,3612,5986.0,5905.0,4957,3974.0,0,Madison
3,-86.757202,34.990601,4167,5114,5000,3959,4367,3958,3031,3781,...,4320,5187,4801,4535,5090.0,4958.0,4200,3011.0,0,Madison
4,-86.7546,34.990601,4256,5114,5000,3822,3801,2854,2302,3863,...,3904,5480,5388,4324,4456.0,4866.0,3261,3443.0,0,Madison


In [64]:
houston_df.head()

Unnamed: 0,X,Y,Latitude,Longitude,rf_irr,EVI_2011_Apr_7_Houston_tif_rf_1,EVI_2011_Apr_23_Houston_tif_rf,EVI_2011_May_9_Houston_tif_rf,EVI_2011_May_25_Houston_tif_rf,EVI_2011_Jun_10_Houston_tif_rf,...,EVI_2012_Apr_22_Houston_tif_rf,EVI_2012_May_8_Houston_tif_rf,EVI_2012_May_24_Houston_tif_rf,EVI_2012_Jun_9_Houston_tif_rf,EVI_2012_Jun_25_Houston_tif_rf,EVI_2012_Jul_11_Houston_tif_rf,EVI_2012_Jul_27_Houston_tif_rf,EVI_2012_Aug_12_Houston_tif_rf,EVI_2012_Aug_28_Houston_tif_rf,county
0,651082,3465560.0,31.314623,-85.412285,0.0,2800,3114,1873,2989,3299,...,3417,2958,2656,4659,3069,5409,5876,5800,5738,Houston
1,653630,3465560.0,31.314289,-85.385513,0.0,1530,1453,1416,1364,1703,...,1418,1523,1683,2726,2976,4179,4016,5509,4558,Houston
2,653862,3465560.0,31.314259,-85.383079,0.0,2076,1500,1602,2063,1703,...,1754,1960,1926,2726,2976,4364,4848,4623,5547,Houston
3,651082,3465330.0,31.312532,-85.412315,0.0,2029,1400,1877,1343,1788,...,1678,2599,2452,2149,2710,3564,5173,6491,6901,Houston
4,652009,3465330.0,31.312412,-85.40258,0.0,1854,2119,1611,1549,2393,...,1861,1993,2278,4305,4462,4609,5122,5654,5367,Houston


In [65]:
houston_df = houston_df.rename(columns = {"RF_Irr":"rf_irr", "Unnamed: 2":"Latitude", "Unnamed: 3": "Longitude"})
houston_df["county"] = "Houston"

In [67]:
houston_df.head()

Unnamed: 0,X,Y,Latitude,Longitude,rf_irr,EVI_2011_Apr_7_Houston_tif_rf_1,EVI_2011_Apr_23_Houston_tif_rf,EVI_2011_May_9_Houston_tif_rf,EVI_2011_May_25_Houston_tif_rf,EVI_2011_Jun_10_Houston_tif_rf,...,EVI_2012_Apr_22_Houston_tif_rf,EVI_2012_May_8_Houston_tif_rf,EVI_2012_May_24_Houston_tif_rf,EVI_2012_Jun_9_Houston_tif_rf,EVI_2012_Jun_25_Houston_tif_rf,EVI_2012_Jul_11_Houston_tif_rf,EVI_2012_Jul_27_Houston_tif_rf,EVI_2012_Aug_12_Houston_tif_rf,EVI_2012_Aug_28_Houston_tif_rf,county
0,651082,3465560.0,31.314623,-85.412285,0.0,2800,3114,1873,2989,3299,...,3417,2958,2656,4659,3069,5409,5876,5800,5738,Houston
1,653630,3465560.0,31.314289,-85.385513,0.0,1530,1453,1416,1364,1703,...,1418,1523,1683,2726,2976,4179,4016,5509,4558,Houston
2,653862,3465560.0,31.314259,-85.383079,0.0,2076,1500,1602,2063,1703,...,1754,1960,1926,2726,2976,4364,4848,4623,5547,Houston
3,651082,3465330.0,31.312532,-85.412315,0.0,2029,1400,1877,1343,1788,...,1678,2599,2452,2149,2710,3564,5173,6491,6901,Houston
4,652009,3465330.0,31.312412,-85.40258,0.0,1854,2119,1611,1549,2393,...,1861,1993,2278,4305,4462,4609,5122,5654,5367,Houston


In [69]:
limestone_df.head()

Unnamed: 0,X,Y,Latitude,Longitude,Irr_Rf,EVI_2011_APR_7_1,EVI_2011_APR_23_,EVI_2011_MAY_9_L,EVI_2011_MAY_25_,EVI_2011_JUN_10_,...,EVI_2012_APR_6_L,EVI_2012_APR_22_,EVI_2012_MAY_8_L,EVI_2012_MAY_24_,EVI_2012_JUN_9_L,EVI_2012_JUN_25_,EVI_2012_JUL_11_,EVI_2012_JUL_27_,EVI_2012_AUG_12_,EVI_2012_AUG_28_
0,481294,3869120.0,34.964485,-87.204903,0.0,3703,4247,4079,4472,4666,...,5079,4006,2625,3389,3895,3660,6504,7270,6466,6445
1,497047,3869120.0,34.964657,-87.032349,0.0,3256,2695,2804,5184,3607,...,2830,3187,3997,4780,4882,3973,4344,4928,3651,6758
2,497510,3869120.0,34.964657,-87.027275,0.0,2983,3032,5763,2620,4831,...,2643,3491,3357,4701,4137,3710,4007,4268,4408,2721
3,480831,3868890.0,34.962391,-87.209976,0.0,3253,4592,4797,5969,3677,...,3875,4541,3599,4573,4612,4019,5942,5880,6815,5641
4,497278,3868890.0,34.962566,-87.029808,0.0,3483,3010,4030,2521,2799,...,3308,3166,3811,5268,4506,3898,6482,4144,3887,2598


In [72]:
limestone_df = limestone_df.rename(columns={"Irr_Rf":"rf_irr"})
limestone_df.head()

Unnamed: 0,X,Y,Latitude,Longitude,rf_irr,EVI_2011_APR_7_1,EVI_2011_APR_23_,EVI_2011_MAY_9_L,EVI_2011_MAY_25_,EVI_2011_JUN_10_,...,EVI_2012_APR_6_L,EVI_2012_APR_22_,EVI_2012_MAY_8_L,EVI_2012_MAY_24_,EVI_2012_JUN_9_L,EVI_2012_JUN_25_,EVI_2012_JUL_11_,EVI_2012_JUL_27_,EVI_2012_AUG_12_,EVI_2012_AUG_28_
0,481294,3869120.0,34.964485,-87.204903,0.0,3703,4247,4079,4472,4666,...,5079,4006,2625,3389,3895,3660,6504,7270,6466,6445
1,497047,3869120.0,34.964657,-87.032349,0.0,3256,2695,2804,5184,3607,...,2830,3187,3997,4780,4882,3973,4344,4928,3651,6758
2,497510,3869120.0,34.964657,-87.027275,0.0,2983,3032,5763,2620,4831,...,2643,3491,3357,4701,4137,3710,4007,4268,4408,2721
3,480831,3868890.0,34.962391,-87.209976,0.0,3253,4592,4797,5969,3677,...,3875,4541,3599,4573,4612,4019,5942,5880,6815,5641
4,497278,3868890.0,34.962566,-87.029808,0.0,3483,3010,4030,2521,2799,...,3308,3166,3811,5268,4506,3898,6482,4144,3887,2598


In [110]:
@tl.curry
def tokenize(delimiter, string):
    return string.split(delimiter)
tokenizer = tokenize("_")
tokenized = list(tl.map(tokenizer, limestone_df.columns))
for tk in tokenized:
    print(tk)

['X']
['Y']
['Latitude']
['Longitude']
['rf', 'irr']
['EVI', '2011', 'APR', '7', '1']
['EVI', '2011', 'APR', '23', '']
['EVI', '2011', 'MAY', '9', 'L']
['EVI', '2011', 'MAY', '25', '']
['EVI', '2011', 'JUN', '10', '']
['EVI', '2011', 'JUN', '26', '']
['EVI', '2011', 'JUL', '12', '']
['EVI', '2011', 'JUL', '28', '']
['EVI', '2011', 'AUG', '13', '']
['EVI', '2011', 'AUG', '29', '']
['EVI', '2012', 'APR', '6', 'L']
['EVI', '2012', 'APR', '22', '']
['EVI', '2012', 'MAY', '8', 'L']
['EVI', '2012', 'MAY', '24', '']
['EVI', '2012', 'JUN', '9', 'L']
['EVI', '2012', 'JUN', '25', '']
['EVI', '2012', 'JUL', '11', '']
['EVI', '2012', 'JUL', '27', '']
['EVI', '2012', 'AUG', '12', '']
['EVI', '2012', 'AUG', '28', '']
