## Reading and merging drought dataset
---

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [7]:
# ! pip install geopandas==0.3.0
# ! pip install pyshp==1.2.10
# ! pip install shapely==1.6.3

### drought states dataset

#### merged 
---

In [11]:
# Drought tables by population, by area, and by DSCI value — one CSV each,
# read in that order
populat_grought, area_drought, values_drought = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/drought/population.csv',
        '../data/drought/areas.csv',
        '../data/drought/values_DSCI.csv',
    )
)

In [12]:
# Reverse the row order inside each 'Name' group of the DSCI table.
# NOTE(review): presumably this lines the DSCI rows up with the row order of the
# area/population tables, which are joined by position further down — confirm.
cities = values_drought['Name'].unique()  # distinct names, in order of first appearance

# Collect the reversed groups and concatenate once: DataFrame.append was removed
# in pandas 2.0, and growing a frame inside a loop re-copies it on every iteration.
reversed_groups = [
    values_drought[values_drought['Name'] == name].iloc[::-1]
    for name in cities
]

if reversed_groups:
    # ignore_index=True produces a fresh 0..n-1 index, so no reset_index/drop is needed
    dfr = pd.concat(reversed_groups, ignore_index=True)
else:
    # pd.concat rejects an empty list; fall back to a frame with the same columns and no rows
    dfr = values_drought.iloc[0:0].copy()


In [13]:
# Copy every drought-level column under a 'population_' name ahead of the merge
population_names = {
    'None': 'population_none',
    'D0': 'population_d0',
    'D1': 'population_d1',
    'D2': 'population_d2',
    'D3': 'population_d3',
    'D4': 'population_d4',
}
for source_column, target_column in population_names.items():
    populat_grought[target_column] = populat_grought[source_column]

# Drop, in place, the columns found in the first 11 positions
# (the ones the notebook treats as duplicated across the tables)
leading_columns = list(populat_grought.columns[:11])
populat_grought.drop(columns=leading_columns, inplace=True)

In [14]:
# Put the three tables side by side; pd.concat aligns their rows on the index

merged_parts = [area_drought, populat_grought, dfr]
drought_state = pd.concat(merged_parts, axis='columns')
print(drought_state.shape)

(56888, 20)


In [15]:
# Strip the comma separators from the text columns and cast them to float

numeric_text_columns = [
    'None', 'D0', 'D1', 'D2', 'D3', 'D4',
    'population_none', 'population_d0', 'population_d1',
    'population_d2', 'population_d3', 'population_d4',
]

for column_name in numeric_text_columns:
    without_commas = drought_state[column_name].str.replace(",", "")
    drought_state[column_name] = without_commas.astype(float)

In [16]:
# Parse the validity dates into datetime64[ns] values set to midnight.
# A datetime -> '%m/%d/%Y' string -> datetime round trip is not needed for that:
# .dt.normalize() drops any time-of-day part directly, without formatting the
# dates as text and having pandas infer the format again when re-parsing.
for date_column in ('ValidStart', 'ValidEnd'):
    parsed_dates = pd.to_datetime(drought_state[date_column])
    # astype pins nanosecond resolution regardless of the pandas version's default unit
    drought_state[date_column] = parsed_dates.dt.normalize().astype('datetime64[ns]')

In [17]:
# Remove the columns the notebook does not need any further
unused_columns = ['MapDate', 'StatisticFormatID']
drought_state.drop(columns=unused_columns, inplace=True)

In [18]:
# Final column names: 'state' for the abbreviation, 'area_' prefix for the drought levels

area_columns = ['area_none', 'area_d0', 'area_d1', 'area_d2', 'area_d3', 'area_d4']
population_columns = ['population_none', 'population_d0', 'population_d1',
                      'population_d2', 'population_d3', 'population_d4']

new_names = {'StateAbbreviation': 'state'}
new_names.update(zip(['None', 'D0', 'D1', 'D2', 'D3', 'D4'], area_columns))
drought_state = drought_state.rename(columns=new_names)


# Fix the column order: identifiers, dates, areas, populations, DSCI

ordered_columns = (
    ['Name', 'state', 'ValidStart', 'ValidEnd']
    + area_columns
    + population_columns
    + ['DSCI']
)
drought_state = drought_state[ordered_columns]

# Row-wise totals over the six drought levels; the columns are added left to right,
# starting from the 'none' column

drought_state['total_area'] = sum(
    (drought_state[column] for column in area_columns[1:]),
    drought_state[area_columns[0]],
)
drought_state['total_population'] = sum(
    (drought_state[column] for column in population_columns[1:]),
    drought_state[population_columns[0]],
)

#### Data info
---

In [19]:
# Preview the first rows of the merged table
drought_state.head()

Unnamed: 0,Name,state,ValidStart,ValidEnd,area_none,area_d0,area_d1,area_d2,area_d3,area_d4,population_none,population_d0,population_d1,population_d2,population_d3,population_d4,DSCI,total_area,total_population
0,Alaska,AK,2021-12-07,2021-12-13,582568.29,0.0,0.0,0.0,0.0,0.0,709629.99,0.0,0.0,0.0,0.0,0.0,0,582568.29,709629.99
1,Alaska,AK,2021-11-30,2021-12-06,582568.29,0.0,0.0,0.0,0.0,0.0,709629.99,0.0,0.0,0.0,0.0,0.0,0,582568.29,709629.99
2,Alaska,AK,2021-11-23,2021-11-29,582568.29,0.0,0.0,0.0,0.0,0.0,709629.99,0.0,0.0,0.0,0.0,0.0,0,582568.29,709629.99
3,Alaska,AK,2021-11-16,2021-11-22,582568.29,0.0,0.0,0.0,0.0,0.0,709629.99,0.0,0.0,0.0,0.0,0.0,0,582568.29,709629.99
4,Alaska,AK,2021-11-09,2021-11-15,582568.29,0.0,0.0,0.0,0.0,0.0,709629.99,0.0,0.0,0.0,0.0,0.0,0,582568.29,709629.99


In [20]:
# Column dtypes and non-null counts
drought_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56888 entries, 0 to 56887
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              56888 non-null  object        
 1   state             56888 non-null  object        
 2   ValidStart        56888 non-null  datetime64[ns]
 3   ValidEnd          56888 non-null  datetime64[ns]
 4   area_none         56888 non-null  float64       
 5   area_d0           56888 non-null  float64       
 6   area_d1           56888 non-null  float64       
 7   area_d2           56888 non-null  float64       
 8   area_d3           56888 non-null  float64       
 9   area_d4           56888 non-null  float64       
 10  population_none   56888 non-null  float64       
 11  population_d0     56888 non-null  float64       
 12  population_d1     56888 non-null  float64       
 13  population_d2     56888 non-null  float64       
 14  population_d3     5688

In [21]:
# Summary statistics for DSCI and the two total columns
drought_state[['DSCI','total_area','total_population']].describe()

Unnamed: 0,DSCI,total_area,total_population
count,56888.0,56888.0,56888.0
mean,75.988556,69226.00545,5989620.0
std,100.787834,85895.250749,6692471.0
min,0.0,60.2,563626.0
25%,0.0,32077.54,1761651.0
50%,29.0,54605.605,4085220.0
75%,117.0,82607.67,6591857.0
max,484.0,582568.3,37253960.0


In [22]:
# List the column names of the merged table
drought_state.columns

Index(['Name', 'state', 'ValidStart', 'ValidEnd', 'area_none', 'area_d0',
       'area_d1', 'area_d2', 'area_d3', 'area_d4', 'population_none',
       'population_d0', 'population_d1', 'population_d2', 'population_d3',
       'population_d4', 'DSCI', 'total_area', 'total_population'],
      dtype='object')

In [23]:
# Number of rows and columns
drought_state.shape

(56888, 19)

In [24]:
# Check how many rows each state has

drought_state['state'].value_counts()

OR    1094
WI    1094
IN    1094
KY    1094
NE    1094
RI    1094
WV    1094
AL    1094
VT    1094
CA    1094
MT    1094
NV    1094
WY    1094
CT    1094
FL    1094
MD    1094
VA    1094
SD    1094
AZ    1094
AK    1094
GA    1094
TX    1094
NY    1094
OH    1094
NM    1094
MA    1094
HI    1094
DE    1094
NJ    1094
MN    1094
TN    1094
KS    1094
CO    1094
MI    1094
MS    1094
SC    1094
WA    1094
IA    1094
PR    1094
DC    1094
NH    1094
AR    1094
MO    1094
UT    1094
OK    1094
IL    1094
ID    1094
NC    1094
ND    1094
ME    1094
PA    1094
LA    1094
Name: state, dtype: int64

# creating a Region column for 9 climate regions 

In [28]:
def state_region_dict(list_states, string_region):
    """Map every state in ``list_states`` to the same region label.

    Parameters
    ----------
    list_states : iterable of hashable
        State names that belong to the region.
    string_region : str
        Region label stored as the value for every state.

    Returns
    -------
    dict
        ``{state: string_region}`` with the states in their input order.
    """
    return {state_name: string_region for state_name in list_states}

In [75]:
# For each climate region: its label, its member states, and a state -> label dictionary

# Ohio Valley
oh = 'Ohio Valley'
oh_list = [
    'Illinois', 'Indiana', 'Kentucky', 'Missouri',
    'Ohio', 'Tennessee', 'West Virginia',
]
ohio_valley = state_region_dict(oh_list, oh)

# Upper Midwest
um_region = 'Upper Midwest'
um_list = ['Iowa', 'Michigan', 'Minnesota', 'Wisconsin']
upper_midwest = state_region_dict(um_list, um_region)

# Northeast — 'Rhode Island' and 'Connecticut' are left out of this list
ne_region = 'Northeast'
ne_list = [
    'Delaware', 'Maine', 'Maryland', 'Massachusetts', 'New Hampshire',
    'New Jersey', 'New York', 'Pennsylvania', 'Vermont',
]
north_east = state_region_dict(ne_list, ne_region)

# Northwest
nw_region = 'Northwest'
nw_list = ['Idaho', 'Oregon', 'Washington']
north_west = state_region_dict(nw_list, nw_region)

# South
s_region = 'South'
s_list = [
    'Arkansas', 'Kansas', 'Louisiana',
    'Mississippi', 'Oklahoma', 'Texas',
]
south = state_region_dict(s_list, s_region)

# Southeast
se_region = 'Southeast'
se_list = [
    'Alabama', 'Florida', 'Georgia',
    'North Carolina', 'South Carolina', 'Virginia',
]
south_east = state_region_dict(se_list, se_region)

# Southwest
sw_region = 'Southwest'
sw_list = ['Arizona', 'Colorado', 'New Mexico', 'Utah']
south_west = state_region_dict(sw_list, sw_region)

# West
w_region = 'West'
w_list = ['California', 'Nevada']
west = state_region_dict(w_list, w_region)

# Northern Rockies and Plains
wnc_region = 'Northern Rockies and Plains'
wnc_list = ['Montana', 'Nebraska', 'North Dakota', 'South Dakota', 'Wyoming']
northern_rockies = state_region_dict(wnc_list, wnc_region)

# Reference: https://stackoverflow.com/questions/11977730/creating-a-dictionary-with-same-values

In [76]:
# Merge the nine per-region dictionaries into one state -> region lookup
# Reference: https://towardsdatascience.com/merge-dictionaries-in-python-d4e9ce137374
final_dict = {}
for region_lookup in (ohio_valley, upper_midwest, north_east, north_west, south,
                      south_east, south_west, west, northern_rockies):
    final_dict.update(region_lookup)
final_dict

{'Illinois': 'Ohio Valley',
 'Indiana': 'Ohio Valley',
 'Kentucky': 'Ohio Valley',
 'Missouri': 'Ohio Valley',
 'Ohio': 'Ohio Valley',
 'Tennessee': 'Ohio Valley',
 'West Virginia': 'Ohio Valley',
 'Iowa': 'Upper Midwest',
 'Michigan': 'Upper Midwest',
 'Minnesota': 'Upper Midwest',
 'Wisconsin': 'Upper Midwest',
 'Delaware': 'Northeast',
 'Maine': 'Northeast',
 'Maryland': 'Northeast',
 'Massachusetts': 'Northeast',
 'New Hampshire': 'Northeast',
 'New Jersey': 'Northeast',
 'New York': 'Northeast',
 'Pennsylvania': 'Northeast',
 'Vermont': 'Northeast',
 'Idaho': 'Northwest',
 'Oregon': 'Northwest',
 'Washington': 'Northwest',
 'Arkansas': 'South',
 'Kansas': 'South',
 'Louisiana': 'South',
 'Mississippi': 'South',
 'Oklahoma': 'South',
 'Texas': 'South',
 'Alabama': 'Southeast',
 'Florida': 'Southeast',
 'Georgia': 'Southeast',
 'North Carolina': 'Southeast',
 'South Carolina': 'Southeast',
 'Virginia': 'Southeast',
 'Arizona': 'Southwest',
 'Colorado': 'Southwest',
 'New Mexico': 'S

In [46]:
# Print how many states have a region assigned
print(len(final_dict))

46


In [48]:
# Build 'climate_regions' from the state name: names that are keys of final_dict are
# swapped for their region label, all other names are kept as they are
# Reference: https://sparkbyexamples.com/pandas/pandas-remap-values-in-column-with-a-dictionary-dict/
drought_state['climate_regions'] = drought_state['Name'].replace(final_dict)

In [49]:
# Preview the table, now including the climate_regions column
drought_state.head()

Unnamed: 0,Name,state,ValidStart,ValidEnd,area_none,area_d0,area_d1,area_d2,area_d3,area_d4,population_none,population_d0,population_d1,population_d2,population_d3,population_d4,DSCI,total_area,total_population,climate_regions
1094,Alabama,AL,2021-12-07,2021-12-13,24009.42,26435.31,1159.61,0.0,0.0,0.0,3011792.4,1749463.44,18480.18,0.0,0.0,0.0,56,51604.34,4779736.02,Southeast
1095,Alabama,AL,2021-11-30,2021-12-06,26482.82,25121.53,0.0,0.0,0.0,0.0,3153556.89,1626179.13,0.0,0.0,0.0,0.0,49,51604.35,4779736.02,Southeast
1096,Alabama,AL,2021-11-23,2021-11-29,37734.5,13869.85,0.0,0.0,0.0,0.0,3917434.59,862301.43,0.0,0.0,0.0,0.0,27,51604.35,4779736.02,Southeast
1097,Alabama,AL,2021-11-16,2021-11-22,50440.75,1163.6,0.0,0.0,0.0,0.0,4752055.32,27680.71,0.0,0.0,0.0,0.0,2,51604.35,4779736.03,Southeast
1098,Alabama,AL,2021-11-09,2021-11-15,51604.35,0.0,0.0,0.0,0.0,0.0,4779736.02,0.0,0.0,0.0,0.0,0.0,0,51604.35,4779736.02,Southeast


In [52]:
# Number of rows per climate region
drought_state['climate_regions'].value_counts()

Northeast                      9846
Ohio Valley                    7658
South                          6564
Southeast                      6564
Northern Rockies and Plains    5470
Southwest                      4376
Upper Midwest                  4376
Northwest                      3282
West                           2188
Name: climate_regions, dtype: int64

In [51]:
# Drop the rows whose 'climate_regions' value contains one of these names,
# one name at a time
excluded_names = [
    'Alaska', 'District of Columbia', 'Hawaii',
    'Puerto Rico', 'Rhode Island', 'Connecticut',
]
for excluded_name in excluded_names:
    is_excluded = drought_state['climate_regions'].str.contains(excluded_name)
    # '== False' (not '~') so that rows with a missing match result are dropped as well
    drought_state = drought_state[is_excluded == False]


drought_state['climate_regions'].value_counts()

Northeast                      9846
Ohio Valley                    7658
South                          6564
Southeast                      6564
Northern Rockies and Plains    5470
Southwest                      4376
Upper Midwest                  4376
Northwest                      3282
West                           2188
Name: climate_regions, dtype: int64

## Creating 9 dataframes for every region

In [64]:
def nine_regions(state_list, frame=None):
    """Return the rows whose 'Name' value is one of ``state_list``.

    Parameters
    ----------
    state_list : list-like
        State names to keep.
    frame : pandas.DataFrame, optional
        Table to filter; it must have a 'Name' column. When omitted, the
        notebook-level ``drought_state`` is used, looked up at call time, so
        existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``frame`` with their original index labels.
    """
    if frame is None:
        # Fall back to the merged table defined earlier in the notebook
        frame = drought_state
    return frame[frame['Name'].isin(state_list)]

In [65]:
# One frame per climate region, built from the state lists in the same order
oh_df, um_df, ne_df, nw_df, s_df, se_df, sw_df, w_df, wnc_df = [
    nine_regions(region_states)
    for region_states in (
        oh_list, um_list, ne_list, nw_list, s_list,
        se_list, sw_list, w_list, wnc_list,
    )
]

In [89]:
# Save the merged table and one CSV per climate region.
# Every frame needs its own file name: two frames written to the same path
# silently lose the first one, so the Northeast frame goes to drought_Northeast.csv.
output_files = [
    (drought_state, '../data/drought/drought_state_merge.csv'),
    (oh_df, '../data/drought/drought_ohio_valley.csv'),
    (um_df, '../data/drought/drought_Upper_Midwest.csv'),
    (ne_df, '../data/drought/drought_Northeast.csv'),
    (nw_df, '../data/drought/drought_Northwest.csv'),
    (s_df, '../data/drought/drought_South.csv'),
    (se_df, '../data/drought/drought_Southeast.csv'),
    (sw_df, '../data/drought/drought_Southwest.csv'),
    (w_df, '../data/drought/drought_West.csv'),
    (wnc_df, '../data/drought/drought_Northern_Rockies_Plains.csv'),
]

for region_frame, csv_path in output_files:
    # index=False keeps the row index out of the file
    region_frame.to_csv(csv_path, index=False)