# **This file performs the following operations**

***s1: EVI and LSWI Modeling - timeframe (2000-2022)***
```
1. Load multiple excel files in the study-1
'EVI and LSWI.xlsx' - 8-day observations - need to interpolate to daily
'weather_variables.xlsx' - daily observations
2. Perform the required date formatting on each dataset
3. Merge weather and satellite data
4. Interpolate the 8-day satellite data to daily observations
5. Save corresponding combined dataframes to pickle files
```






In [None]:
# Mount Google Drive at /content/drive; the project paths defined below all
# live under that mount point. `google.colab` is only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
from os import walk
from time import sleep
from tqdm import tqdm
import pickle
import string
import numpy as np
import pandas as pd
from scipy import stats
from scipy import interpolate
import plotly.express as px
from plotly.offline import iplot
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Project root as seen from each collaborator's Google Drive.
gopi_base_path = '/content/drive/MyDrive/Colab Notebooks/DISC OU/DS for Ag - Alfalfa/Fall 2023'
Adam_base_path = '/content/drive/MyDrive/DSA Project'
raj_path = '/content/drive/MyDrive/DS for Ag - Alfalfa/Fall 2023'

# Select the active root here; every data path below is derived from it.
base_path = raj_path # just change this line

# Study folders under <base_path>/Data: s1 = 'EVI and LSWI', s2 = 'ER'.
s1_data_path, s2_data_path = (
    os.path.join(base_path, 'Data', study_dir)
    for study_dir in ('EVI and LSWI', 'ER')
)

In [None]:


## Workbook names for the two studies - s1: EVI and LSWI; s2: ER
# The names are listed explicitly; to discover them from disk instead:
# filenames = next(walk(s1_data_path), (None, None, []))[2]

# Order matters: index 0 is the weather workbook, then plots P16, P14, P13.
s1_filenames = [
    'weather_data.xlsx',
    'P16_EVI_LSWI.xlsx',
    'P14_EVI_LSWI.xlsx',
    'P13_EVI_LSWI.xlsx',
]
s2_filenames = ['P20_ER.xlsx']

print(f's1_filenames: {s1_filenames} \ns2_filenames: {s2_filenames}')

s1_filenames: ['weather_data.xlsx', 'P16_EVI_LSWI.xlsx', 'P14_EVI_LSWI.xlsx', 'P13_EVI_LSWI.xlsx'] 
s2_filenames: ['P20_ER.xlsx']


In [None]:
def print_head(df, length):
  """Print the frame's ``name`` attribute followed by its first rows.

  Args:
    df: object exposing a ``name`` attribute and a ``head(n)`` method.
    length: number of leading rows to show.
  """
  header = f'{df.name}: '
  print(header)
  preview = df.head(length)
  print(preview)

def read_format_date(base_path, filename, length, print_flg):
  """Read one Excel workbook and return it indexed by its date column.

  The layout is chosen from a substring of ``filename``:

  * ``'EVI_LSWI'``     - keeps ``EVI``/``LSWI``, indexed by ``Date``
    (``datetime.date`` values parsed from the ``Date`` column).
  * ``'weather_data'`` - builds ``Date`` from ``YEAR``/``MONTH``/``DAY`` and
    keeps the weather variables, indexed by ``Date``.
  * ``'P20_ER'``       - builds ``Timestamp`` from ``Year``/``DoY``/``Hour``
    and keeps the flux variables, indexed by ``Timestamp``.

  Any other name prints a message and returns the frame as read.

  Args:
    base_path: directory containing the workbook.
    filename: workbook file name (with extension).
    length: number of rows to preview when ``print_flg`` is truthy.
    print_flg: when truthy, preview the result with ``print_head``.

  Returns:
    The formatted DataFrame; its ``name`` attribute is ``filename`` up to
    the first dot.
  """
  file_path = os.path.join(base_path, filename)
  df = pd.read_excel(file_path)
  file_str = filename.split('.', 1)[0]

  if 'EVI_LSWI' in filename:
    df['Date'] = pd.to_datetime(df['Date']).dt.date
    # Select and index in one chain so set_index never runs in place on a slice.
    df = df[['Date', 'EVI', 'LSWI']].set_index('Date')
  elif 'weather_data' in filename:
    df['Date'] = pd.to_datetime(df[['YEAR', 'MONTH', 'DAY']]).dt.date
    weather_cols = ['TMAX', 'TMIN', 'TAVG', 'HAVG', 'VDEF', 'HDEG', 'CDEG',
                    'WCMN', 'WSPD', 'ATOT', 'RAIN', 'SAVG', 'BAVG',
                    'TR05', 'TR25', 'TR60']
    # The column selection already leaves YEAR/MONTH/DAY behind.
    df = df[['Date'] + weather_cols].set_index('Date')
  elif 'P20_ER' in filename:
    # Zero-pad the day of year so '%Y%j' always sees YYYYDDD.
    year_doy = df['Year'].astype(str) + df['DoY'].astype(str).str.zfill(3)
    df['Timestamp'] = (pd.to_datetime(year_doy, format='%Y%j')
                       + pd.to_timedelta(df['Hour'], unit='h'))
    flux_cols = ['Rg', 'Tair', 'Tsoil', 'rH', 'VPD', 'Ustar',
                 'SWC', 'GPP', 'NEE', 'ET', 'ER']
    # Fix: this branch only has a 'Timestamp' column; indexing by 'Date'
    # raised KeyError.
    df = df[['Timestamp'] + flux_cols].set_index('Timestamp')
  else:
    print(f'Invalid file name: {filename}')

  # print_head / get_start_end_dates read this attribute.
  df.name = file_str
  if print_flg:
    print_head(df, length)
  return df

In [None]:
# Report the span of index values (start and end date) covered by a df
def get_start_end_dates(df):
  """Print ``<df.name>: <smallest index value> to <largest index value>``."""
  first, last = df.index.min(), df.index.max()
  print(f'{df.name}: {first!s} to {last!s}')

In [None]:
# get_start_end_dates(weather_df)
#get_start_end_dates(p13_evi_lswi_df)
#get_start_end_dates(p14_evi_lswi_df)
#get_start_end_dates(p16_evi_lswi_df)

In [None]:
def data_interpolation(df, mtd):
  """Fill gaps in the ``EVI`` and ``LSWI`` columns and round to 4 decimals.

  Args:
    df: DataFrame containing ``EVI`` and ``LSWI`` columns. It is not
      modified; the work is done on a copy.
    mtd: ``'polynomial'`` for second-order polynomial interpolation (the
      index is converted to datetimes first); any other value falls back
      to linear interpolation and leaves the index as it is.

  Returns:
    A new DataFrame with both columns interpolated and every numeric
    column rounded to 4 decimals.
  """
  # Copy first: the original assigned columns (and, on the polynomial path,
  # the index) directly on the caller's frame.
  result = df.copy()

  if mtd == 'polynomial':
    result.index = pd.to_datetime(result.index) # convert to datetime index
    # NOTE(review): pandas documents 'polynomial' as SciPy-backed - confirm
    # SciPy is available in the runtime.
    interp_kwargs = {'method': 'polynomial', 'order': 2}
  else:
    interp_kwargs = {'method': 'linear'}

  for column in ('EVI', 'LSWI'):
    result[column] = result[column].interpolate(**interp_kwargs)

  # One frame-wide round covers EVI/LSWI as well as the other columns.
  return result.round(4)

In [None]:
def merge_and_interpolate(weather, sat, mtd):
  """Join weather and satellite frames on their index, then gap-fill.

  Args:
    weather: weather DataFrame.
    sat: satellite DataFrame; its index range bounds the result.
    mtd: interpolation method name forwarded to ``data_interpolation``.

  Returns:
    Whatever ``data_interpolation`` returns for the clipped, combined frame.
  """
  # Column-wise concat aligns the two frames on their index.
  combined = pd.concat([weather, sat], axis=1)

  # Keep only the rows inside the satellite record's index range.
  first_obs, last_obs = sat.index.min(), sat.index.max()
  in_window = (combined.index >= first_obs) & (combined.index <= last_obs)

  return data_interpolation(combined[in_window], mtd)

In [None]:
def save_pickle(df, base_dir, filename):
  """Pickle ``df`` to ``<base_dir>/pickle files/<filename>``.

  Args:
    df: object to serialise (any picklable object).
    base_dir: directory under which the ``pickle files`` folder lives.
    filename: name of the output file, used exactly as given (no extension
      is appended).
  """
  pickle_dir = os.path.join(base_dir, 'pickle files')
  # makedirs(exist_ok=True) also creates missing parent directories and does
  # not fail if the folder appears between a check and the creation, unlike
  # the previous exists()/mkdir() pair.
  os.makedirs(pickle_dir, exist_ok=True)

  file_path = os.path.join(pickle_dir, filename)

  # save/dump to pickle
  with open(file_path, 'wb') as handle:
    pickle.dump(df, handle, protocol=pickle.HIGHEST_PROTOCOL)

  print(f'successfully saved {filename} to a pickle file at {pickle_dir}')


In [None]:
# Load and format datasets
print_len = 10
print_flg = True
# s1_filenames order: weather workbook first, then plots P16, P14, P13.
(weather_df,
 p16_evi_lswi_df,
 p14_evi_lswi_df,
 p13_evi_lswi_df) = [
    read_format_date(s1_data_path, workbook, print_len, print_flg)
    for workbook in s1_filenames[:4]
]

# Merge each plot's satellite series with the weather data and interpolate
p13_final_df, p14_final_df, p16_final_df = [
    merge_and_interpolate(weather_df, sat_df, 'polynomial')
    for sat_df in (p13_evi_lswi_df, p14_evi_lswi_df, p16_evi_lswi_df)
]

# Save the final df's (weather + satellite readings) to pickle files.
# NOTE(review): these names carry no '.pkl' extension - confirm that is
# what downstream loaders expect.
for _pickle_name, _merged_df in (
    ('p13_final_df', p13_final_df),
    ('p14_final_df', p14_final_df),
    ('p16_final_df', p16_final_df),
):
  save_pickle(_merged_df, s1_data_path, _pickle_name)

weather_data: 
             TMAX   TMIN   TAVG   HAVG  VDEF   HDEG  CDEG   WCMN   WSPD  \
Date                                                                      
2000-01-01  69.04  38.58  53.69  60.74  6.76  11.19   0.0  31.63  15.82   
2000-01-02  60.58  31.12  48.24  58.04  5.47  19.15   0.0  25.76   6.21   
2000-01-03  55.06  26.92  40.14  86.71  1.39  24.01   0.0  12.03  13.38   
2000-01-04  44.77  17.64  29.75  64.11  2.28  33.80   0.0   6.09  11.32   
2000-01-05  53.69  26.63  38.42  48.98  4.76  24.84   0.0  15.09  15.63   
2000-01-06  53.60  22.12  35.21  72.81  2.59  27.14   0.0  13.88   8.24   
2000-01-07  54.63  20.32  37.34  74.73  2.79  27.53   0.0  12.80   6.10   
2000-01-08  51.22  39.95  46.06  95.26  0.55  19.41   0.0  33.15  11.96   
2000-01-09  53.76  30.67  40.52  72.60  2.87  22.78   0.0  23.28   9.52   
2000-01-10  64.90  31.75  45.12  47.59  6.31  16.67   0.0  24.01  10.57   

             ATOT  RAIN   SAVG   BAVG    TR05    TR25    TR60  
Date                

In [None]:
p13_final_df

Unnamed: 0_level_0,TMAX,TMIN,TAVG,HAVG,VDEF,HDEG,CDEG,WCMN,WSPD,ATOT,RAIN,SAVG,BAVG,TR05,TR25,TR60,EVI,LSWI
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2000-02-18,64.42,33.20,41.05,76.88,1.95,16.19,0.0,23.75,16.76,6.46,0.00,47.77,47.91,1.9138,1.5788,1.5071,0.2540,-0.1110
2000-02-19,50.34,27.70,36.51,75.42,2.18,25.98,0.0,22.60,5.27,15.68,0.00,45.89,45.79,1.9507,1.5793,1.5031,0.2436,-0.1204
2000-02-20,62.10,23.98,45.21,64.41,5.32,21.96,0.0,20.70,8.83,15.87,0.00,45.68,45.75,2.0070,1.5821,1.5054,0.2346,-0.1281
2000-02-21,69.71,46.28,58.62,61.23,6.77,7.00,0.0,41.75,16.38,13.73,0.00,49.57,50.79,2.0394,1.5840,1.5050,0.2270,-0.1339
2000-02-22,68.67,52.00,59.82,91.37,1.70,4.67,0.0,-996.00,19.15,4.33,0.48,52.96,54.28,2.0139,1.5848,1.5052,0.2209,-0.1379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-23,62.96,35.38,48.06,75.90,3.46,15.83,0.0,28.77,11.00,9.35,0.01,48.02,47.67,2.3404,1.9822,2.5208,0.2145,-0.0925
2021-12-24,78.84,50.56,65.62,38.61,14.70,0.30,0.0,-996.00,18.00,7.44,0.00,49.79,51.71,2.3299,1.9822,2.5161,0.2287,-0.0706
2021-12-25,71.31,45.97,58.67,48.67,9.56,6.36,0.0,41.55,7.34,9.60,0.00,51.51,55.04,2.3254,1.9832,2.5116,0.2484,-0.0417
2021-12-26,76.24,38.72,59.83,61.17,8.73,7.52,0.0,34.80,12.92,6.94,0.00,51.31,53.17,2.3360,1.9868,2.5046,0.2735,-0.0059


In [None]:
# Project root on Google Drive and its 'Data' / 'Code' sub-folders.
main_path = '/content/drive/MyDrive/DS for Ag - Alfalfa/Fall 2023'
data_path, code_path = (
    os.path.join(main_path, folder) for folder in ('Data', 'Code')
)

In [None]:
# Folder that receives the combined alfalfa data pickles.
pickle_dir = os.path.join(data_path, 'pickle files')
# Create it if needed; exist_ok avoids the separate existence check and
# makedirs also creates any missing parent directory.
os.makedirs(pickle_dir, exist_ok=True)

In [None]:
# Write the merged P13 frame into the shared pickle folder.
file_name = 'p13_final_df.pkl'
file_path = os.path.join(pickle_dir, file_name)
with open(file_path, mode='wb') as handle:
    pickle.dump(p13_final_df, file=handle, protocol=pickle.HIGHEST_PROTOCOL)