# DS 2010 Final Project

## Import Packages

In [1]:
import pandas as pd
import math
import numpy as np
import datetime as dt

## Import Data

In [2]:
covid_19_data = pd.read_csv("Raw_Data/all-states-history-correct-range.csv")

# Starts Apr 23, Ends Nov 9
anxiety_depression_data = pd.read_csv("Raw_Data/Indicators_of_Anxiety_or_Depression_Based_on_Reported_Frequency_of_Symptoms_During_Last_7_Days.csv")

trips_data = pd.read_csv("Raw_Data/Trips_by_Distance.csv")


## Clean trip data

In [3]:
td = trips_data.groupby(by = ['Date', 'State Postal Code']).sum().drop(columns= ['State FIPS', 'County FIPS']).reset_index()
td['Period'] = np.floor(td.index / 51 / 7)
td2 = td.groupby(by = ['Period', 'State Postal Code']).sum().reset_index()

td2["Period"] = td2["Period"] - 1
td2.loc[td2["Period"] == -1,"Period"] = 0
# Index 1-11 are 1 week
# Index 12-16 are break
for i in range(12, 17):
    td2.loc[td2["Period"] == i,"Period"] = 12

# Index 17|18, 19|20, 21|22, 23|24, 25|26, 27|28 are one time period
period = 13
for i in range(0, 12, 2):
    week = i + 17
    td2.loc[td2["Period"] == week,"Period"] = period
    td2.loc[td2["Period"] == week + 1,"Period"] = period
    period += 1

td2 = td2.groupby(by= ["Period", "State Postal Code"]).sum().reset_index()
td2 = td2[td2['Period'] != 12.0]

td2['Period'] = td2.apply(lambda x:
                        x['Period']+1
                        if x['Period'] < 12 
                        else
                        x['Period'], axis=1)

trip_df = td2.copy()

trip_df['State'] = trip_df['State Postal Code']
trip_df = trip_df.drop('State Postal Code', axis=1)



In [4]:
trip_df

Unnamed: 0,Period,Population Staying at Home,Population Not Staying at Home,Number of Trips,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500,State
0,1.0,6307163.0,14287547.0,5.850097e+07,17576261.0,16495524.0,7496209.0,7903468.0,5709323.0,2317707.0,704279.0,187326.0,40494.0,70378.0,AK
1,1.0,28041066.0,108819322.0,4.022133e+08,90141352.0,101341754.0,54003092.0,64762466.0,60915550.0,20594042.0,7174644.0,2636396.0,545014.0,98992.0,AL
2,1.0,17585588.0,66801512.0,2.502319e+08,58626768.0,65947140.0,31781426.0,38396070.0,35735626.0,12471888.0,4908546.0,1917546.0,379184.0,67724.0,AR
3,1.0,56746120.0,144059968.0,5.123756e+08,146357458.0,124083598.0,58454178.0,70934148.0,74330274.0,24290644.0,8272018.0,4563512.0,755708.0,334094.0,AZ
4,1.0,317664842.0,789932418.0,2.593218e+09,688818389.0,684687597.0,296317707.0,356772431.0,357125194.0,137654612.0,48995443.0,17590719.0,3890374.0,1365278.0,CA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,18.0,1710704.0,4552286.0,1.891188e+07,3580023.0,4869895.0,2370965.0,2802106.0,3220018.0,1481427.0,425442.0,130816.0,25535.0,5657.0,VT
965,18.0,22115955.0,53239955.0,2.079825e+08,49296205.0,51810385.0,24247099.0,31877740.0,33102683.0,12089961.0,3757935.0,1387603.0,252533.0,160383.0,WA
966,18.0,16270289.0,41865391.0,1.694443e+08,36086818.0,44652004.0,21050210.0,25669560.0,24945273.0,9845169.0,4641455.0,2139162.0,341154.0,73481.0,WI
967,18.0,5046710.0,13011610.0,5.215250e+07,10282657.0,14230649.0,6453695.0,7797154.0,7812684.0,3254869.0,1687553.0,540366.0,81466.0,11407.0,WV


## Clean COVID-19 Data

In [5]:
# KEEP:
# DC: District of Columbia

# REMOVE:
# AS: American Samoa
# PR: Puerto Rico
# GU: Guam
# MP: Northern Mariana Islands
# VI: US Virgin Islands
remove_states = ['AS', 'PR', 'GU', 'MP', 'VI']

### prepare covid data

In [6]:
valuable_cols = ['state', 'date', 'death', 'deathConfirmed', 
                 'hospitalizedCurrently', 'positiveCasesViral', 
                 'positiveIncrease', 'totalTestsPeopleViral', 
                 'totalTestsPeopleViralIncrease', 'totalTestsViral']

In [7]:
# Remove undesired columns
covid_data = covid_19_data.copy()[valuable_cols]
# Convert string to datetime
covid_data['date'] = covid_data['date'].astype('datetime64[ns]')
# Convert target_ranges to list of 

### create Time Period column

In [8]:
ranges_list = ['4/23/2020-5/5/2020', '5/7/2020-5/12/2020', '5/14/2020-5/19/2020', '5/21/2020-5/26/2020',
               '5/28/2020-6/2/2020', '6/4/2020-6/9/2020', '6/11/2020-6/16/2020', '6/18/2020-6/23/2020',
               '6/25/2020-6/30/2020', '7/2/2020-7/7/2020', '7/9/2020-7/14/2020', '7/16/2020-7/21/2020',                  
               '8/19/2020-8/31/2020', '9/2/2020-9/14/2020', '9/16/2020-9/28/2020', '9/30/2020-10/12/2020', 
               '10/14/2020-10/26/2020', '10/28/2020-11/9/2020']

# Convert ranges_list to a list of lists of start and end datetimes
period_list = []
for r in ranges_list:
    r_list = []
    for date_str in r.split('-'):
        date = dt.datetime.strptime(date_str, '%m/%d/%Y').date()
        r_list.append(date)
    period_list.append(r_list)

# Create Time Period column
time_periods = [] # -1: before first period | -2: after last period | 0-n: respective period
for index, row in covid_data.iterrows():
    true_period = np.NaN
    if row['date'] < period_list[0][0]:
        true_period = -1
    elif row['date'] > period_list[-1][-1]:
        true_period = -2
    else:
        for period in period_list:
            if period[0] <= row['date'] <= period[1]:
                true_period = period_list.index(period) + 1
    time_periods.append(true_period)

# Add Time Period column to covid_data
covid_data['Period'] = time_periods

# Make a copy before cleaning
data = covid_data.copy()

# Remove rows with invalid time periods
data.drop(data.loc[data['Period'] == -1].index, inplace=True)
data.drop(data.loc[data['Period'] == -2].index, inplace=True)
data.drop(data.loc[data['Period'] == np.NaN].index, inplace=True)

In [9]:
states = list(set(data['state'].values))
final_states = [item for item in states if item not in remove_states]
len(final_states)

51

In [10]:
data = data.reset_index(drop=True)

### Fill NaN and groupby Time Period for each state

In [11]:
clean_data = []

data["death"] = data[["death", "deathConfirmed"]].max(axis=1)
data["positiveCasesViral"] = data[["positiveCasesViral", "totalTestsViral", "totalTestsPeopleViral"]].max(axis=1)
clean_df = data.drop(columns=["deathConfirmed", "totalTestsViral", "totalTestsPeopleViral"])

for state in final_states:
    state_data = clean_df.loc[clean_df['state'] == state].reset_index(drop=True)
    state_data['deathIncrease'] = state_data['death'].diff(+1)
    state_data['positiveCasesIncrease'] = state_data['positiveCasesViral'].diff(+1)

    state_data = state_data.groupby(['Period', 'state']).mean().reset_index()
    clean_data.append(state_data)
    
clean_df = pd.concat(clean_data)
clean_df = clean_df.dropna()
covid_df = clean_df.copy()

covid_df['State'] = covid_df['state']
covid_df = covid_df.drop('state', axis=1)

In [12]:
covid_df.to_csv("Clean_Data/clean_covid_data.csv", index= False)

In [13]:
covid_df

Unnamed: 0,Period,death,hospitalizedCurrently,positiveCasesViral,positiveIncrease,totalTestsPeopleViralIncrease,deathIncrease,positiveCasesIncrease,State
0,1.0,739.384615,1658.384615,2.744331e+04,946.153846,0.0,28.750000,952.083333,TX
1,2.0,1057.833333,1682.500000,3.827417e+04,1104.333333,0.0,30.833333,1104.333333,TX
2,3.0,1307.500000,1658.333333,6.443380e+05,1251.500000,0.0,35.166667,23895.000000,TX
3,4.0,1499.166667,1593.833333,7.721102e+05,872.833333,0.0,19.500000,16717.500000,TX
4,5.0,1650.166667,1726.333333,9.338550e+05,1441.166667,0.0,22.666667,21758.333333,TX
...,...,...,...,...,...,...,...,...,...
13,14.0,4312.692308,689.230769,2.445710e+06,1067.230769,0.0,19.538462,27146.153846,OH
14,15.0,4660.846154,613.461538,2.885383e+06,947.461538,0.0,18.461538,34268.538462,OH
15,16.0,4938.615385,777.307692,3.415677e+06,1328.615385,0.0,17.076923,39948.538462,OH
16,17.0,5118.461538,1208.384615,3.967684e+06,2200.384615,0.0,15.384615,41182.846154,OH


## Clean Anxiety/Depression Data

In [14]:
data = anxiety_depression_data.copy()

In [15]:
# Remove uneeded rows and columns
clean_data = data.loc[data['State'] != 'United States']
clean_data = clean_data.drop(['Phase', 'Group', 'Subgroup', 'Time Period Label',
                             'Low CI', 'High CI', 'Confidence Interval', 'Quartile range'], axis=1)

# Break each target into a dataframe
depression_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Depressive Disorder']
anxiety_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Anxiety Disorder']
both_data = clean_data.loc[clean_data['Indicator'] == 'Symptoms of Anxiety Disorder or Depressive Disorder']

# Merge each target into one dataframe on State and Time Period  
merged_data = pd.merge(depression_data, anxiety_data, on=['State', 'Time Period'])
merged_data = pd.merge(merged_data, both_data, on=['State', 'Time Period'])

# Clean final dataframe
merged_data = merged_data.drop(['Indicator_x', 'Indicator_y', 'Indicator'], axis=1)
merged_data.columns = ['State', 'Period', 'Depression_Score', 'Anxiety_Score', 'Mix_Score']


In [16]:
# Dictionary of states, used to change name -> acronym
states_hash = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Federated States Of Micronesia': 'FM',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Marshall Islands': 'MH',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': ' ',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands': 'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Palau': 'PW',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'}

In [17]:
# Change state name into acronym
merged_data['State'] = merged_data.apply(lambda x: states_hash[x['State']], axis=1)

In [18]:
label_df = merged_data.copy()

In [19]:
label_df.to_csv("Clean_Data/clean_label_data.csv", index= False)

# Merge datasets

In [20]:
# covid_df.join(trip_df, on=['Period', 'State'], how='left')
final_df = pd.merge(label_df, covid_df, on=['Period', 'State'])
final_df = pd.merge(final_df, trip_df, on=['Period', 'State'])


In [21]:
columns = ['State', 'Period', 'Depression_Score', 'Anxiety_Score', 'Mix_Score',
       'deathIncrease', 'death', 'hospitalizedCurrently', 'positiveCasesIncrease',
        'positiveCasesViral', 'totalTestsPeopleViralIncrease',
       'Population Staying at Home', 'Population Not Staying at Home',
       'Number of Trips', 'Number of Trips <1', 'Number of Trips 1-3',
       'Number of Trips 3-5', 'Number of Trips 5-10', 'Number of Trips 10-25',
       'Number of Trips 25-50', 'Number of Trips 50-100',
       'Number of Trips 100-250', 'Number of Trips 250-500',
       'Number of Trips >=500']


In [22]:
final_df = final_df[columns]

In [23]:
statePop = pd.read_csv("StatePop.csv")
statePop['State'] = statePop.apply(lambda x: states_hash[x['State']], axis=1)

In [24]:
final_df = pd.merge(final_df, statePop, how='inner', on=['State'])

In [25]:
features = final_df.loc[:,"deathIncrease":"Pop"]

In [26]:
for feature in features:
    final_df.loc[:,feature] = final_df.loc[:,feature]/final_df['Pop']

In [27]:
final_df = final_df.drop(columns=["Pop", "density"], axis=1)


In [28]:
final_df

Unnamed: 0,State,Period,Depression_Score,Anxiety_Score,Mix_Score,deathIncrease,death,hospitalizedCurrently,positiveCasesIncrease,positiveCasesViral,...,Number of Trips <1,Number of Trips 1-3,Number of Trips 3-5,Number of Trips 5-10,Number of Trips 10-25,Number of Trips 25-50,Number of Trips 50-100,Number of Trips 100-250,Number of Trips 250-500,Number of Trips >=500
0,AL,1,18.6,25.6,30.3,0.000002,0.000051,0.000092,0.000919,0.016521,...,18.363889,20.645671,11.001685,13.193620,12.409914,4.195485,1.461642,0.537095,0.111032,0.020167
1,AL,2,22.5,27.2,30.6,0.000003,0.000079,0.000096,0.000805,0.025495,...,9.633588,10.701244,5.832369,7.063924,6.676520,2.289233,0.827562,0.311962,0.069180,0.010925
2,AL,3,19.6,20.7,25.2,0.000002,0.000099,0.000101,0.000827,0.030795,...,10.222031,11.008505,5.970319,7.238895,6.895172,2.429710,0.879017,0.338662,0.079815,0.011400
3,AL,4,20.9,25.2,28.8,0.000002,0.000112,0.000104,0.000995,0.037385,...,9.582299,10.575117,5.755132,7.097441,6.701813,2.295237,0.882371,0.396369,0.097929,0.018569
4,AL,5,28.4,31.6,37.5,0.000002,0.000127,0.000115,0.000951,0.043797,...,9.757588,10.538668,5.664647,6.900937,6.577336,2.276705,0.864323,0.405511,0.085880,0.011446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
848,KS,14,19.3,28.7,32.5,0.000002,0.000169,0.000085,0.001193,0.150166,...,21.054919,22.569022,9.641422,11.786638,12.316341,5.014209,2.146498,1.231252,0.294709,0.130185
849,KS,15,22.9,30.6,35.2,0.000003,0.000210,0.000100,0.001468,0.167277,...,20.674147,22.558298,9.746595,11.716151,12.151250,4.928880,2.111289,1.027574,0.212464,0.053351
850,KS,16,22.7,32.3,36.2,0.000004,0.000248,0.000131,0.001538,0.186863,...,19.287248,21.236292,9.165584,11.180588,11.768170,4.839855,2.191240,1.056929,0.217324,0.044964
851,KS,17,26.2,34.8,39.6,0.000005,0.000312,0.000136,0.001423,0.206418,...,20.284037,23.066331,9.835879,11.610815,11.836634,4.795822,2.141347,1.034027,0.186494,0.046568


In [29]:
final_df.to_csv("Clean_Data/final_data.csv", index= False)