# DS 2010 Final Project

## Import Packages

In [1]:
import pandas as pd
import math
import numpy as np
import datetime as dt

## Import Data

In [2]:
covid_19_data = pd.read_csv("Raw_Data/all-states-history-correct-range.csv")

# Starts Apr 23, Ends Nov 9
anxiety_depression_data = pd.read_csv("Raw_Data/Indicators_of_Anxiety_or_Depression_Based_on_Reported_Frequency_of_Symptoms_During_Last_7_Days.csv")

trips_data = pd.read_csv("Raw_Data/Trips_by_Distance.csv")


## Clean trip data

In [3]:
td = trips_data.groupby(by = ['Date', 'State Postal Code']).sum().drop(columns= ['State FIPS', 'County FIPS']).reset_index()
td['Period'] = np.floor(td.index / 51 / 7)
td2 = td.groupby(by = ['Period', 'State Postal Code']).sum().reset_index()

td2["Period"] = td2["Period"] - 1
td2.loc[td2["Period"] == -1,"Period"] = 0
# Index 1-11 are 1 week
# Index 12-16 are break
for i in range(12, 17):
    td2.loc[td2["Period"] == i,"Period"] = 12

# Index 17|18, 19|20, 21|22, 23|24, 25|26, 27|28 are one time period
period = 13
for i in range(0, 12, 2):
    week = i + 17
    td2.loc[td2["Period"] == week,"Period"] = period
    td2.loc[td2["Period"] == week + 1,"Period"] = period
    period += 1

td2 = td2.groupby(by= ["Period", "State Postal Code"]).sum().reset_index()
td2 = td2[td2['Period'] != 12.0]

td2['Period'] = td2.apply(lambda x:
                        x['Period']+1
                        if x['Period'] < 12 
                        else
                        x['Period'], axis=1)

trip_df = td2.copy()

trip_df['State'] = trip_df['State Postal Code']
trip_df = trip_df.drop('State Postal Code', axis=1)



In [4]:
trip_df

Unnamed: 0,Period,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500,State
0,1.0,6307163.0,14287547.0,5.850097e+07,17576261.0,16495524.0,7496209.0,7903468.0,5709323.0,2317707.0,704279.0,187326.0,40494.0,70378.0,AK
1,1.0,28041066.0,108819322.0,4.022133e+08,90141352.0,101341754.0,54003092.0,64762466.0,60915550.0,20594042.0,7174644.0,2636396.0,545014.0,98992.0,AL
2,1.0,17585588.0,66801512.0,2.502319e+08,58626768.0,65947140.0,31781426.0,38396070.0,35735626.0,12471888.0,4908546.0,1917546.0,379184.0,67724.0,AR
3,1.0,56746120.0,144059968.0,5.123756e+08,146357458.0,124083598.0,58454178.0,70934148.0,74330274.0,24290644.0,8272018.0,4563512.0,755708.0,334094.0,AZ
4,1.0,317664842.0,789932418.0,2.593218e+09,688818389.0,684687597.0,296317707.0,356772431.0,357125194.0,137654612.0,48995443.0,17590719.0,3890374.0,1365278.0,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,18.0,1710704.0,4552286.0,1.891188e+07,3580023.0,4869895.0,2370965.0,2802106.0,3220018.0,1481427.0,425442.0,130816.0,25535.0,5657.0,VT
965,18.0,22115955.0,53239955.0,2.079825e+08,49296205.0,51810385.0,24247099.0,31877740.0,33102683.0,12089961.0,3757935.0,1387603.0,252533.0,160383.0,WA
966,18.0,16270289.0,41865391.0,1.694443e+08,36086818.0,44652004.0,21050210.0,25669560.0,24945273.0,9845169.0,4641455.0,2139162.0,341154.0,73481.0,WI
967,18.0,5046710.0,13011610.0,5.215250e+07,10282657.0,14230649.0,6453695.0,7797154.0,7812684.0,3254869.0,1687553.0,540366.0,81466.0,11407.0,WV


## Clean COVID-19 Data

In [5]:
# KEEP:
# DC: District of Columbia

# REMOVE:
# AS: American Samoa
# PR: Puerto Rico
# GU: Guam
# MP: Northern Mariana Islands
# VI: US Virgin Islands
remove_states = ['AS', 'PR', 'GU', 'MP', 'VI']

### prepare covid data

In [6]:
valuable_cols = ['state', 'date', 'death', 'deathConfirmed', 
                 'hospitalizedCurrently', 'positiveCasesViral', 
                 'positiveIncrease', 'totalTestsPeopleViral', 
                 'totalTestsPeopleViralIncrease', 'totalTestsViral']

In [7]:
# Remove undesired columns
covid_data = covid_19_data.copy()[valuable_cols]
# Convert string to datetime
covid_data['date'] = covid_data['date'].astype('datetime64[ns]')
# Convert target_ranges to list of 

### create Time Period column

In [8]:
ranges_list = ['4/23/2020-5/5/2020', '5/7/2020-5/12/2020', '5/14/2020-5/19/2020', '5/21/2020-5/26/2020',
               '5/28/2020-6/2/2020', '6/4/2020-6/9/2020', '6/11/2020-6/16/2020', '6/18/2020-6/23/2020',
               '6/25/2020-6/30/2020', '7/2/2020-7/7/2020', '7/9/2020-7/14/2020', '7/16/2020-7/21/2020',                  
               '8/19/2020-8/31/2020', '9/2/2020-9/14/2020', '9/16/2020-9/28/2020', '9/30/2020-10/12/2020', 
               '10/14/2020-10/26/2020', '10/28/2020-11/9/2020']

# Convert ranges_list to a list of lists of start and end datetimes
period_list = []
for r in ranges_list:
    r_list = []
    for date_str in r.split('-'):
        date = dt.datetime.strptime(date_str, '%m/%d/%Y').date()
        r_list.append(date)
    period_list.append(r_list)

# Create Time Period column
time_periods = [] # -1: before first period | -2: after last period | 0-n: respective period
for index, row in covid_data.iterrows():
    true_period = np.NaN
    if row['date'] < period_list[0][0]:
        true_period = -1
    elif row['date'] > period_list[-1][-1]:
        true_period = -2
    else:
        for period in period_list:
            if period[0] <= row['date'] <= period[1]:
                true_period = period_list.index(period) + 1
    time_periods.append(true_period)

# Add Time Period column to covid_data
covid_data['Period'] = time_periods

# Make a copy before cleaning
data = covid_data.copy()

# Remove rows with invalid time periods
data.drop(data.loc[data['Period'] == -1].index, inplace=True)
data.drop(data.loc[data['Period'] == -2].index, inplace=True)
data.drop(data.loc[data['Period'] == np.NaN].index, inplace=True)

In [9]:
states = list(set(data['state'].values))
final_states = [item for item in states if item not in remove_states]
len(final_states)

51

### Fill NaN and groupby Time Period for each state

In [10]:
clean_data = []

data["death"] = data[["death", "deathConfirmed"]].max(axis=1)
data["positiveCasesViral"] = data[["positiveCasesViral", "totalTestsViral", "totalTestsPeopleViral"]].max(axis=1)
clean_df = data.drop(columns=["deathConfirmed", "totalTestsViral", "totalTestsPeopleViral"])

for state in final_states:
    state_data = clean_df.loc[clean_df['state'] == state]
    state_data = state_data.groupby(['Period', 'state']).mean().reset_index()
    clean_data.append(state_data)
    
clean_df = pd.concat(clean_data)
clean_df = clean_df.dropna()
covid_df = clean_df.copy()

covid_df['State'] = covid_df['state']
covid_df = covid_df.drop('state', axis=1)

In [11]:
covid_df.to_csv("Clean_Data/clean_covid_data.csv", index= False)

In [12]:
covid_df

Unnamed: 0,Period,death,hospitalizedCurrently,positiveCasesViral,positiveIncrease,totalTestsPeopleViralIncrease,State
0,1.0,4300.076923,2523.153846,212768.000000,753.230769,0.000000,MI
1,2.0,5059.333333,1518.333333,324097.166667,751.666667,0.000000,MI
2,3.0,5375.500000,1185.000000,425699.000000,519.666667,0.000000,MI
3,4.0,5629.000000,970.166667,530422.833333,320.666667,0.000000,MI
4,5.0,5810.166667,755.000000,637220.833333,273.666667,0.000000,MI
...,...,...,...,...,...,...,...
13,14.0,1180.538462,304.000000,661130.461538,689.000000,4853.769231,IA
14,15.0,1283.461538,301.230769,729203.538462,825.000000,5055.692308,IA
15,16.0,1406.461538,421.307692,795515.846154,871.769231,4909.076923,IA
16,17.0,1568.615385,506.692308,844263.692308,1082.000000,3191.153846,IA


## Clean Anxiety/Depression Data

In [13]:
data = anxiety_depression_data.copy()

In [14]:
# Remove uneeded rows and columns
clean_data = data.loc[data['State'] != 'United States']
clean_data = clean_data.drop(['Phase', 'Group', 'Subgroup', 'Time Period Label',
                             'Low CI', 'High CI', 'Confidence Interval', 'Quartile range'], axis=1)

# Break each target into a dataframe
depression_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Depressive Disorder']
anxiety_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Anxiety Disorder']
both_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Anxiety Disorder or Depressive Disorder']

# Merge each target into one dataframe on State and Time Period  
merged_data = pd.merge(depression_data, anxiety_data, on=['State', 'Time Period'])
merged_data = pd.merge(merged_data, both_data, on=['State', 'Time Period'])

# Clean final dataframe
merged_data = merged_data.drop(['Indicator_x', 'Indicator_y', 'Indicator'], axis=1)
merged_data.columns = ['State', 'Period', 'Depression_Score', 'Anxiety_Score', 'Mix_Score']


In [15]:
# Dictionary of states, used to change name -> acronym
states_hash ={
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Federated States Of Micronesia': 'FM',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Marshall Islands': 'MH',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'}

In [16]:
# Change state name into acronym
merged_data['State'] = merged_data.apply(lambda x: states_hash[x['State']], axis=1)

In [17]:
label_df = merged_data.copy()

In [18]:
label_df.to_csv("Clean_Data/clean_label_data.csv", index= False)

# Merge datasets

In [19]:
# covid_df.join(trip_df, on=['Period', 'State'], how='left')
final_df = pd.merge(label_df, covid_df, on=['Period', 'State'])
final_df = pd.merge(final_df, trip_df, on=['Period', 'State'])


In [20]:
columns = ['State', 'Period', 'Depression_Score', 'Anxiety_Score', 'Mix_Score',
       'death', 'hospitalizedCurrently', 'positiveCasesViral',
       'positiveIncrease', 'totalTestsPeopleViralIncrease',
       'Population Staying at Home', 'Population Not Staying at Home',
       'Number of Trips', 'Number of Trips <1', 'Number of Trips 1-3',
       'Number of Trips 3-5', 'Number of Trips 5-10', 'Number of Trips 10-25',
       'Number of Trips 25-50', 'Number of Trips 50-100',
       'Number of Trips 100-250', 'Number of Trips 250-500',
       'Number of Trips >=500']

In [21]:
final_df = final_df[columns]

In [22]:
final_df

Unnamed: 0,State,Period,Depression_Score,Anxiety_Score,Mix_Score,death,hospitalizedCurrently,positiveCasesViral,positiveIncrease,totalTestsPeopleViralIncrease,...,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500
0,AL,1,18.6,25.6,30.3,251.230769,452.615385,8.109423e+04,216.923077,0.000000,...,90141352.0,101341754.0,54003092.0,64762466.0,60915550.0,20594042.0,7174644.0,2636396.0,545014.0,98992.0
1,AK,1,19.2,27.7,31.5,9.000000,19.615385,1.812700e+04,2.769231,0.000000,...,17576261.0,16495524.0,7496209.0,7903468.0,5709323.0,2317707.0,704279.0,187326.0,40494.0,70378.0
2,AZ,1,22.4,32.3,36.4,311.153846,707.000000,7.144754e+04,295.846154,2435.307692,...,146357458.0,124083598.0,58454178.0,70934148.0,74330274.0,24290644.0,8272018.0,4563512.0,755708.0,334094.0
3,AR,1,26.6,33.7,38.0,60.384615,98.538462,4.483623e+04,93.846154,0.000000,...,58626768.0,65947140.0,31781426.0,38396070.0,35735626.0,12471888.0,4908546.0,1917546.0,379184.0,67724.0
4,CA,1,25.4,30.8,37.4,1911.923077,4833.615385,6.116494e+05,1601.230769,0.000000,...,688818389.0,684687597.0,296317707.0,356772431.0,357125194.0,137654612.0,48995443.0,17590719.0,3890374.0,1365278.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
866,VA,18,24.0,30.5,36.7,3669.153846,1057.769231,1.706804e+05,1389.846154,0.000000,...,63938981.0,65908481.0,33943285.0,41564237.0,41468061.0,14788220.0,6389743.0,2730127.0,389428.0,117265.0
867,WA,18,24.4,34.7,38.8,2391.461538,378.384615,1.098226e+05,1063.923077,0.000000,...,49296205.0,51810385.0,24247099.0,31877740.0,33102683.0,12089961.0,3757935.0,1387603.0,252533.0,160383.0
868,WV,18,25.7,35.7,39.3,472.615385,261.538462,8.004042e+05,469.153846,0.000000,...,10282657.0,14230649.0,6453695.0,7797154.0,7812684.0,3254869.0,1687553.0,540366.0,81466.0,11407.0
869,WI,18,27.0,36.0,39.2,2190.384615,1665.769231,2.101263e+06,5303.923077,15601.461538,...,36086818.0,44652004.0,21050210.0,25669560.0,24945273.0,9845169.0,4641455.0,2139162.0,341154.0,73481.0


In [23]:
final_df.to_csv("Clean_Data/final_data.csv", index= False)