In [173]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from geopy.geocoders import Nominatim

### load the datasets

In [2]:
df_location = pd.read_csv("dataset/location.csv")

In [3]:
df_cases = pd.read_csv("dataset/individual.csv")

### info about individul cases dataset

In [4]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 10 columns):
age                       260490 non-null object
sex                       263630 non-null object
province                  550796 non-null object
country                   557340 non-null object
latitude                  557362 non-null float64
longitude                 557362 non-null float64
date_confirmation         556902 non-null object
additional_information    34395 non-null object
source                    348173 non-null object
outcome                   557364 non-null object
dtypes: float64(2), object(8)
memory usage: 42.5+ MB


In [5]:
df_cases.describe()

Unnamed: 0,latitude,longitude
count,557362.0,557362.0
mean,18.138385,27.285771
std,20.455801,67.577194
min,-54.80803,-159.727596
25%,11.04285,-58.47308
50%,19.03681,72.83483
75%,28.456,77.2091
max,70.0718,174.74


In [6]:
# out of total 557,364 values 260,490 values are not NULL in age column => 296,874 missing age values

### Data cleaning for age column

In [7]:
age_col = df_cases['age']
age_col = age_col.to_frame()
# age_col = age_col[age_col['age'].notna()]
# age_col.head()

In [8]:
age_col['format']= age_col['age'].str.extract(r'([0-9][0-9]-[0-9][0-9])')
a = age_col['format'].str.extract(r'([0-9][0-9])')
b = age_col['format'].str.extract(r'(-[0-9][0-9])')
b = b[0].str.extract(r'([0-9][0-9])')
b[0] = pd.to_numeric(b[0])
a[0] = pd.to_numeric(a[0])

new = round((a[0]+b[0])/2)
new = new.to_frame()
# new = new.dropna()
age_col['format'] = new[0]

In [9]:
age_col.head()

Unnamed: 0,age,format
0,,
1,21.0,
2,94.0,
3,,
4,2.0,


In [10]:
age_col['format2'] = age_col['age'].str.extract(r'([0-9]+)')
# here merge format2 into format and made format = age

age_col.format[age_col.format.isnull()] = age_col.format2
age_col.age = age_col.format
# age_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [11]:
type(age_col.age)
age_col.age= pd.to_numeric(age_col.age)

In [12]:
age_col.age.describe()

count    260490.000000
mean         43.701194
std          19.922561
min           0.000000
25%          28.000000
50%          44.000000
75%          57.000000
max         121.000000
Name: age, dtype: float64

In [13]:
total_na_values = len(age_col) - age_col.count()
total_na_values

age        296874
format     296874
format2    296874
dtype: int64

### Replace orignal age column with the new formatted age column

In [14]:
df_cases.age = age_col.age 
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,,https://gujcovid19.gujarat.gov.in/uploads/pres...,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,,,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,,,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,,https://www.deshgujarat.com/2020/05/22/gujarat...,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,,,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,,,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,,https://twitter.com/ANI/status/126746073002384...,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,,,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,"As per MohFW update,",https://www.mohfw.gov.in/,hospitalized


### Explored Decision Tree Classifier to impute misssing age and sex values

In [15]:
df = df_cases
# df

In [16]:
df = df.drop(['additional_information'], axis=1)

In [17]:
df = df.drop(['source'], axis =1)

In [18]:
df

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


In [19]:
df_nan = df[df.isnull().any(axis=1)] #dataset age and sex to be predicted 
featutre_cols = ['latitide', 'longitude']

In [21]:

df_nan
# X_ = df_nan[feature_cols]
# X_ = X_.fillna(0)
# X_

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized
13,,,,Turkey,39.102050,35.173550,05.05.2020,nonhospitalized
14,,,Gujarat,India,23.027760,72.600270,13.05.2020,recovered
17,,,Madhya Pradesh,India,22.716220,75.865120,16.04.2020,hospitalized
18,,,Bihar,India,25.424420,86.133670,28.05.2020,recovered
19,,,Maharashtra,India,18.940170,72.834830,29.05.2020,recovered


In [22]:
df = df.dropna()
# df

In [23]:
le = preprocessing.LabelEncoder()
data = df.apply(le.fit_transform)
feature_cols = ['latitude', 'longitude']
# data

In [24]:
X = data[feature_cols] # Features
y = data.sex #target variable
X.head()

Unnamed: 0,latitude,longitude
1,1261,753
2,814,733
4,1069,1299
5,580,1039
8,4737,2681


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# df_nan

In [26]:
clf = tree.DecisionTreeClassifier()

In [27]:
clf = clf.fit(X_train,y_train)

In [28]:
# data_ = df_nan.apply(le.fit_transform)
# data_
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5522655191019517


In [29]:
# Y_ = clf.predict(X_)
# Y_

In [30]:
y_pred

array([1, 0, 1, ..., 0, 1, 1])

### Another approach to impute values

In [31]:
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,,https://gujcovid19.gujarat.gov.in/uploads/pres...,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,,,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,,,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,,https://www.deshgujarat.com/2020/05/22/gujarat...,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,,,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,,,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,,https://twitter.com/ANI/status/126746073002384...,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,,,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,"As per MohFW update,",https://www.mohfw.gov.in/,hospitalized


In [32]:
df_cases = df_cases.drop(['additional_information', 'source'], axis=1)

In [33]:
df_cases.head()

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,,,Gujarat,India,23.02776,72.60027,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.03271,-76.9723,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.04318,-77.02824,15.04.2020,nonhospitalized
3,,,Gujarat,India,23.02776,72.60027,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.40921,-74.55572,30.04.2020,nonhospitalized


In [34]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 8 columns):
age                  260490 non-null float64
sex                  263630 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557364 non-null object
dtypes: float64(3), object(5)
memory usage: 34.0+ MB


### Age imputed with existing population proportion 

In [35]:
age_col['category_children'] = age_col.age < 15 
age_col['category_young_adults'] = age_col.age < 30
age_col['adults_below_50'] = age_col.age < 50
age_col['elderly'] = age_col.age > 50


In [36]:
children = age_col.age[age_col.age < 15].count()
children # 5.6% --> put 5.6% values as 8

14705

In [37]:
young_adults = age_col.age[age_col.age <30].count()
young_adults = young_adults - children
young_adults # 22% --> 22% values as 23

57526

In [38]:
adults_below_50 = age_col.age[age_col.age < 50].count()
adults_below_50 = adults_below_50 - young_adults - children
adults_below_50 # 37.5% --> 37.5% values as 40

97790

In [39]:
elderly_below_80 = age_col.age[age_col.age < 80].count()
elderly_below_80 = elderly_below_80 - adults_below_50 - young_adults - children
elderly_below_80 # 28.9% --> 29% values as 67

75375

In [40]:
very_elderly = age_col.age[age_col.age >= 80].count()
very_elderly # 5.7% --> values as 90

15094

In [49]:
# age_col

In [43]:
# age_col = age_col.drop(['format', 'format2', 'category', 'category_children', 'category_young_adults', 'adults_below_50', 'elderly'], axis=1)
# age_col

In [44]:
# 296,874 values to be imputed
# 16,625 - children
# 65,321 - young_adults
# 112,014 - adults < 50
# 86,093 - adults > 50 < 80
# 16,821 - elderly 

In [45]:
new_age = age_col.dropna()
# new_age

In [52]:
df_with_nan = age_col[age_col.isnull().any(axis=1)] # all data rows with NaN 
df_with_nan.head()
age_col = age_col.drop(['format', 'format2', 'category_children', 'category_young_adults', 'adults_below_50', 'elderly'], axis=1)
age_col.head()

Unnamed: 0,age
0,
1,21.0
2,94.0
3,
4,2.0


### Imputes age

In [53]:
count = 0
for index, row in age_col.iterrows():
        if (np.isnan(row.age)):
            count += 1
            if (count <= 16625):
                row['age'] = 8
            elif (count <= 81946):
                row['age'] = 23
            elif (count <= 193960):
                row['age'] = 40
            elif (count <= 280053):
                row['age'] = 67
            else:
                row['age'] = 90
            
# count 8 23 40 67 90
age_col

Unnamed: 0,age
0,8.0
1,21.0
2,94.0
3,8.0
4,2.0
5,29.0
6,8.0
7,8.0
8,47.0
9,8.0


In [69]:
# 16,625 - children
# 65,321 - young_adults
# 112,014 - adults < 50
# 86,093 - adults > 50 < 80
# 16,821 - elderly 
df_cases.age = age_col.age
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,8.0,,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,8.0,,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,8.0,,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,8.0,,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,8.0,,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


### Imputing sex column 

In [113]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 8 columns):
age                  557364 non-null float64
sex                  263630 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557364 non-null object
dtypes: float64(3), object(5)
memory usage: 34.0+ MB


In [114]:
sex_col = df_cases['sex']
type(sex_col)

pandas.core.series.Series

In [115]:
sex_col = sex_col.to_frame()

In [116]:
sex_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 1 columns):
sex    263630 non-null object
dtypes: object(1)
memory usage: 4.3+ MB


In [117]:
sex_col

Unnamed: 0,sex
0,
1,male
2,female
3,
4,female


In [118]:
num_male = sex_col.sex[sex_col.sex == 'male'].count()
num_male

145583

In [119]:
num_female = sex_col.sex[sex_col.sex == 'female'].count()
num_female

118047

In [120]:
145583/263630 # 55% males and 45% females ---> 161,553 (males) and 132,181 (females)

0.5522247088722831

In [121]:
# total nan values in sex field = 293,734
print(sex_col.sex[0])

nan


In [123]:
count = 0
for index, row in sex_col.iterrows():
    if (pd.isnull(row['sex'])):
        count += 1
        if (count <= 161553):
            row['sex'] = 'male'
        else:
            row['sex'] = 'female'
sex_col

Unnamed: 0,sex
0,male
1,male
2,female
3,male
4,female
5,female
6,male
7,male
8,female
9,male


In [128]:
sex_col.describe()
# count

Unnamed: 0,sex
count,557364
unique,2
top,male
freq,307136


In [125]:
num_female = sex_col.sex[sex_col.sex == 'female'].count()
num_female

250228

In [126]:
num_male = sex_col.sex[sex_col.sex == 'male'].count()

In [127]:
num_male

307136

In [129]:
sex_col.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 1 columns):
sex    557364 non-null object
dtypes: object(1)
memory usage: 4.3+ MB


In [131]:
df_cases.sex = sex_col.sex
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,8.0,male,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,8.0,male,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,8.0,male,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,8.0,male,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,8.0,male,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


In [140]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 8 columns):
age                  557364 non-null float64
sex                  557364 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557364 non-null object
dtypes: float64(3), object(5)
memory usage: 34.0+ MB


In [149]:
province_column = df_cases['province']
province_column

0                  Gujarat
1                Moyobamba
2                     Lima
3                  Gujarat
4         Coronel Portillo
                ...       
557359              Callao
557360         Maharashtra
557361         Maharashtra
557362          Tamil Nadu
557363           Rajasthan
Name: province, Length: 557364, dtype: object

In [153]:
province_column['country'] = df_cases['country']

Unnamed: 0,province,country
0,Gujarat,India
1,Moyobamba,Peru
2,Lima,Peru
3,Gujarat,India
4,Coronel Portillo,Peru
5,Ica,Peru
6,Delhi,India
7,Maharashtra,India
8,Mecklenburg-Vorpommern,Germany
9,West Bengal,India


In [156]:
province_column['latitude'] = df_cases['latitude']
province_column

Unnamed: 0,province,country,latitude
0,Gujarat,India,23.027760
1,Moyobamba,Peru,-6.032710
2,Lima,Peru,-12.043180
3,Gujarat,India,23.027760
4,Coronel Portillo,Peru,-8.409210
5,Ica,Peru,-14.094020
6,Delhi,India,28.614740
7,Maharashtra,India,18.940170
8,Mecklenburg-Vorpommern,Germany,53.792330
9,West Bengal,India,23.797760


In [157]:
province_column['longitude'] = df_cases['longitude']
province_column

Unnamed: 0,province,country,latitude,longitude
0,Gujarat,India,23.027760,72.600270
1,Moyobamba,Peru,-6.032710,-76.972300
2,Lima,Peru,-12.043180,-77.028240
3,Gujarat,India,23.027760,72.600270
4,Coronel Portillo,Peru,-8.409210,-74.555720
5,Ica,Peru,-14.094020,-75.702840
6,Delhi,India,28.614740,77.209100
7,Maharashtra,India,18.940170,72.834830
8,Mecklenburg-Vorpommern,Germany,53.792330,13.801800
9,West Bengal,India,23.797760,87.986406


In [161]:
df_nan_province = province_column[province_column.province.isnull()]


In [162]:
df_nan_province

Unnamed: 0,province,country,latitude,longitude
13,,Turkey,39.102050,35.173550
64,,Sudan,15.551770,32.532410
105,,Ghana,7.977254,-1.210600
281,,Philippines,11.816130,122.848400
321,,Philippines,11.816130,122.848400
568,,Philippines,11.816130,122.848400
774,,Philippines,11.816130,122.848400
816,,Philippines,11.816130,122.848400
916,,North Macedonia,41.595840,21.692690
953,,Singapore,1.353460,103.815100


In [174]:
geolocator = Nominatim(user_agent="myGeocoder")

In [180]:
temp = province_column[province_column.latitude.isnull()]
temp

Unnamed: 0,province,country,latitude,longitude
340945,,,,
365285,,,,


In [184]:
df_cases.iloc[340945]
df_cases.loc[365285]

age                               67
sex                           female
province                         NaN
country                          NaN
latitude                         NaN
longitude                        NaN
date_confirmation                NaN
outcome              nonhospitalized
Name: 365285, dtype: object

In [193]:
df_cases = df_cases.drop(340945, axis =0 )

In [198]:
df_cases = df_cases.drop(365285)

In [200]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 557362 entries, 0 to 557363
Data columns (total 8 columns):
age                  557362 non-null float64
sex                  557362 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557362 non-null object
dtypes: float64(3), object(5)
memory usage: 38.3+ MB


In [205]:
df_cases.to_csv("individual_dataset.csv")

In [208]:
df = pd.read_csv("individual_dataset.csv")
df = df.drop(columns=['Unnamed: 0'])
df

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,8.0,male,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,8.0,male,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,8.0,male,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,8.0,male,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,8.0,male,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


In [209]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557362 entries, 0 to 557361
Data columns (total 8 columns):
age                  557362 non-null float64
sex                  557362 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557362 non-null object
dtypes: float64(3), object(5)
memory usage: 34.0+ MB
