In [9]:
# import dependency
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [3]:
# Load the raw clinical data set from the Data directory into `df`.
df = pd.read_csv(filepath_or_buffer='../Data/covid_analytics_clinical_data.csv')

In [4]:
# Look at the first five rows (the trailing semicolon hides the cell output).
df.head(n=5);

In [5]:
# Keep only the rows that have a value in both 'Mortality' and '% Male';
# a row missing either one of them is dropped.
required_columns = ['Mortality', '% Male']
wo_mortality = df.dropna(subset=required_columns, axis='index')
wo_mortality;

In [6]:
# Drop the rows in which every one of the columns listed below is missing;
# a row with at least one of them filled in is kept (how='all').
symptom_columns = [
    'Fever (temperature ≥37·3°C)',
    'Cough',
    'Shortness of Breath (dyspnoea)',
    'Headache',
    'Sputum (/Expectoration)',
    'Myalgia (Muscle Pain)',
    'Fatigue',
    'Upper air-way congestion',
    'Diarrhoea',
    'Nausea or Vomiting',
    'Loss of Appetite/Anorexia',
    'Sore Throat/Stuffy Nose',
    'Chills',
    'Chest Pain',
    'Loss of smell/taste',
    'Disease Severity Asymptomatic',
]
dropped_no_symptoms = wo_mortality.dropna(how='all', subset=symptom_columns)
dropped_no_symptoms;

In [7]:
# Create a new dataframe restricted to the selected columns, in this order.
#
# NOTE: DataFrame.reindex() does not raise on a label that is not in the frame;
# it silently adds an all-NaN column instead. Every entry below must therefore
# be exactly one column label: no two labels fused into a single string
# (e.g. 'Hypertension,Diabetes') and no stray leading space inside the quotes.
columns = [
    'ID', 'Country', 'Province/State', 'Study Pop Size (N)', 'Positive/negative cases',
    'Mean Age', '% Male', '% White or European', '% African American', '% Asian',
    '% Hispanic or Latino', '% Multiple ethnicities or other',
    'Smoking history', 'Hypertension', 'Diabetes',
    'Cardiovascular Disease (incl. CAD)', 'Chronic obstructive lung (COPD)', 'Cancer (Any)',
    'Liver Disease (any)', 'Cerebrovascular Disease', 'Chronic kidney/renal disease', 'Other',
    'Fever (temperature ≥37·3°C)', 'Respiratory rate > 24 breaths per min', 'Cough',
    'Shortness of Breath (dyspnoea)', 'Headache', 'Sputum (/Expectoration)',
    'Myalgia (Muscle Pain)', 'Fatigue', 'Upper air-way congestion', 'Diarrhoea',
    'Nausea or Vomiting', 'Loss of Appetite/Anorexia', 'Sore Throat/Stuffy Nose',
    'Chills', 'Chest Pain', 'Loss of smell/taste', 'Disease Severity Asymptomatic',
    'Mortality',
]

new_df = dropped_no_symptoms.reindex(columns=columns)
new_df;

In [8]:
# Write the master data to CSV, leaving out the index column.
new_df.to_csv(path_or_buf='../Data/raw_data_data.csv', index=False)

In [9]:
# Calculated for mean age based on median age, upper and lower quartiles on excel 
# Dropped if inadequate data to calculate mean age
# File renamed as raw_data.csv

In [15]:
# Load the edited file; from here on `df` refers to this frame.
df = pd.read_csv(filepath_or_buffer='../Data/raw_data.csv')

In [16]:
# Collect every column label of the edited file in a plain Python list.
my_list = df.columns.tolist()
my_list;

In [17]:
# Boolean mask marking the missing (NaN) cells; the output is suppressed.
df.isna();

In [18]:
# Replace every missing value in the frame with 0.0 (applies to all columns).
new_df = df.fillna(value=0.0)

In [20]:
# Inspect the dtype of each column (the trailing semicolon suppresses the output).
new_df.dtypes;

In [22]:
# Total and statistical summary of the 'Mortality' column.
# Neither result is displayed: the sum is not the last expression of the cell,
# and the summary is silenced by the trailing semicolon.
mortality_col = new_df['Mortality']
mortality_col.sum()
mortality_col.describe();

In [23]:
# Convert mortality to binary values for logistic regression.
# A row gets 1 when its 'Mortality' is strictly greater than the threshold
# and 0 otherwise.
# The threshold is a hard-coded constant (described by the original author as
# the mean of the column); it is not recomputed from the data here.
MORTALITY_THRESHOLD = 0.123

# Vectorized comparison instead of a Python loop: the boolean mask is cast to
# int64, giving the same 0/1 values the element-by-element loop produced.
mortality_bin = (new_df['Mortality'] > MORTALITY_THRESHOLD).astype('int64')
new_df['Mortality_bin'] = mortality_bin

In [24]:
# Number of rows flagged 1 in 'Mortality_bin', for comparison with the sum of
# the original 'Mortality' column.
new_df.loc[:, 'Mortality_bin'].sum()

288

In [26]:
# Evaluate the frame that now includes 'Mortality_bin'; the trailing semicolon suppresses the output.
new_df;

In [31]:
# Pick the columns needed for the data analysis, in the order they should
# appear in the cleaned data set.
column_names = [
    'ID', 'Country', 'Province/State', 'Study Pop Size (N)', 'Positive/negative cases',
    'Mean Age', '% Male', '% White or European', '% African American', '% Asian',
    '% Hispanic or Latino', '% Multiple ethnicities or other',
    'Smoking history', 'Hypertension', 'Diabetes', 'Cardiovascular Disease (incl. CAD)',
    'Chronic obstructive lung (COPD)', 'Cancer (Any)', 'Liver Disease (any)',
    'Cerebrovascular Disease', 'Chronic kidney/renal disease', 'Other',
    'Fever (temperature ≥37·3°C)', 'Respiratory rate > 24 breaths per min',
    'Cough', 'Shortness of Breath (dyspnoea)', 'Headache', 'Sputum (/Expectoration)',
    'Myalgia (Muscle Pain)', 'Fatigue', 'Upper air-way congestion', 'Diarrhoea',
    'Nausea or Vomiting', 'Loss of Appetite/Anorexia', 'Sore Throat/Stuffy Nose',
    'Chills', 'Chest Pain', 'Loss of smell/taste', 'Disease Severity Asymptomatic',
    'Mortality', 'Mortality_bin',
]
data = new_df.reindex(column_names, axis='columns')
data;

In [28]:
# Write the cleaned data (ready for the database) to CSV, leaving out the index column.
data.to_csv(path_or_buf='../Data/cleaned_data.csv', index=False)