In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing

### load the datasets

In [2]:
df_location = pd.read_csv("dataset/location.csv")

In [3]:
df_cases = pd.read_csv("dataset/individual.csv")

### info about individul cases dataset

In [4]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 10 columns):
age                       260490 non-null object
sex                       263630 non-null object
province                  550796 non-null object
country                   557340 non-null object
latitude                  557362 non-null float64
longitude                 557362 non-null float64
date_confirmation         556902 non-null object
additional_information    34395 non-null object
source                    348173 non-null object
outcome                   557364 non-null object
dtypes: float64(2), object(8)
memory usage: 42.5+ MB


In [5]:
df_cases.describe()

Unnamed: 0,latitude,longitude
count,557362.0,557362.0
mean,18.138385,27.285771
std,20.455801,67.577194
min,-54.80803,-159.727596
25%,11.04285,-58.47308
50%,19.03681,72.83483
75%,28.456,77.2091
max,70.0718,174.74


In [6]:
# out of total 557,364 values 260,490 values are not NULL in age column => 296,874 missing age values

### Data cleaning for age column

In [7]:
age_col = df_cases['age']
age_col = age_col.to_frame()
# age_col = age_col[age_col['age'].notna()]
# age_col.head()

In [8]:
age_col['format']= age_col['age'].str.extract(r'([0-9][0-9]-[0-9][0-9])')
a = age_col['format'].str.extract(r'([0-9][0-9])')
b = age_col['format'].str.extract(r'(-[0-9][0-9])')
b = b[0].str.extract(r'([0-9][0-9])')
b[0] = pd.to_numeric(b[0])
a[0] = pd.to_numeric(a[0])

new = round((a[0]+b[0])/2)
new = new.to_frame()
# new = new.dropna()
age_col['format'] = new[0]

In [9]:
age_col.head()

Unnamed: 0,age,format
0,,
1,21.0,
2,94.0,
3,,
4,2.0,


In [10]:
age_col['format2'] = age_col['age'].str.extract(r'([0-9]+)')
# here merge format2 into format and made format = age

age_col.format[age_col.format.isnull()] = age_col.format2
age_col.age = age_col.format
# age_col

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [11]:
type(age_col.age)
age_col.age= pd.to_numeric(age_col.age)

In [12]:
age_col.age.describe()

count    260490.000000
mean         43.701194
std          19.922561
min           0.000000
25%          28.000000
50%          44.000000
75%          57.000000
max         121.000000
Name: age, dtype: float64

In [13]:
total_na_values = len(age_col) - age_col.count()
total_na_values

age        296874
format     296874
format2    296874
dtype: int64

### Replace orignal age column with the new formatted age column

In [14]:
df_cases.age = age_col.age 
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,,https://gujcovid19.gujarat.gov.in/uploads/pres...,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,,,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,,,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,,https://www.deshgujarat.com/2020/05/22/gujarat...,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,,,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,,,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,,https://twitter.com/ANI/status/126746073002384...,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,,,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,"As per MohFW update,",https://www.mohfw.gov.in/,hospitalized


### Explored Decision Tree Classifier to impute misssing age and sex values

In [15]:
df = df_cases
# df

In [16]:
df = df.drop(['additional_information'], axis=1)

In [17]:
df = df.drop(['source'], axis =1)

In [18]:
df

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


In [23]:
df_nan = df[df.isnull().any(axis=1)] #dataset age and sex to be predicted 
featutre_cols = ['latitide', 'longitude']

In [26]:

df_nan
# X_ = df_nan[feature_cols]
# X_ = X_.fillna(0)
X_

NameError: name 'X_' is not defined

In [27]:
df = df.dropna()
# df

In [30]:
le = preprocessing.LabelEncoder()
data = df.apply(le.fit_transform)
feature_cols = ['latitude', 'longitude']
# data

In [31]:
X = data[feature_cols] # Features
y = data.sex #target variable
X.head()

Unnamed: 0,latitude,longitude
1,1261,753
2,814,733
4,1069,1299
5,580,1039
8,4737,2681


In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
# df_nan

In [33]:
clf = tree.DecisionTreeClassifier()

In [34]:
clf = clf.fit(X_train,y_train)

In [35]:
# data_ = df_nan.apply(le.fit_transform)
# data_
y_pred = clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.5522260386645084


In [37]:
# Y_ = clf.predict(X_)
# Y_

In [38]:
y_pred

array([1, 0, 1, ..., 0, 1, 1])

### Another approach to impute values

In [39]:
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,additional_information,source,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,,https://gujcovid19.gujarat.gov.in/uploads/pres...,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,,,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,,,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,,https://www.deshgujarat.com/2020/05/22/gujarat...,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,,,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,,,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,,https://twitter.com/ANI/status/126746073002384...,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,,https://arogya.maharashtra.gov.in/pdf/ncovidep...,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,,,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,"As per MohFW update,",https://www.mohfw.gov.in/,hospitalized


In [41]:
df_cases = df_cases.drop(['additional_information', 'source'], axis=1)

In [42]:
df_cases

Unnamed: 0,age,sex,province,country,latitude,longitude,date_confirmation,outcome
0,,,Gujarat,India,23.027760,72.600270,15.04.2020,hospitalized
1,21.0,male,Moyobamba,Peru,-6.032710,-76.972300,09.05.2020,nonhospitalized
2,94.0,female,Lima,Peru,-12.043180,-77.028240,15.04.2020,nonhospitalized
3,,,Gujarat,India,23.027760,72.600270,22.05.2020,hospitalized
4,2.0,female,Coronel Portillo,Peru,-8.409210,-74.555720,30.04.2020,nonhospitalized
5,29.0,female,Ica,Peru,-14.094020,-75.702840,28.04.2020,nonhospitalized
6,,,Delhi,India,28.614740,77.209100,01.06.2020,hospitalized
7,,,Maharashtra,India,18.940170,72.834830,31.05.2020,hospitalized
8,47.0,female,Mecklenburg-Vorpommern,Germany,53.792330,13.801800,19.03.2020,nonhospitalized
9,,,West Bengal,India,23.797760,87.986406,11.04.2020,hospitalized


In [43]:
df_cases.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 8 columns):
age                  260490 non-null float64
sex                  263630 non-null object
province             550796 non-null object
country              557340 non-null object
latitude             557362 non-null float64
longitude            557362 non-null float64
date_confirmation    556902 non-null object
outcome              557364 non-null object
dtypes: float64(3), object(5)
memory usage: 34.0+ MB


In [59]:
age_col['category_children'] = age_col.age < 15 
age_col['category_young_adults'] = age_col.age < 30
age_col['adults_below_50'] = age_col.age < 50
age_col['elderly'] = age_col.age > 50


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 557364 entries, 0 to 557363
Data columns (total 8 columns):
age                      260490 non-null float64
format                   260490 non-null object
format2                  260490 non-null object
category                 557364 non-null bool
category_children        557364 non-null bool
category_young_adults    557364 non-null bool
adults_below_50          557364 non-null bool
elderly                  557364 non-null bool
dtypes: bool(5), float64(1), object(2)
memory usage: 15.4+ MB


In [81]:
children = age_col.age[age_col.age < 15].count()
children # 5.6% --> put 5.6% values as 8

14705

In [82]:
young_adults = age_col.age[age_col.age <30].count()
young_adults = young_adults - children
young_adults # 22% --> 22% values as 23

57526

In [83]:
adults_below_50 = age_col.age[age_col.age < 50].count()
adults_below_50 = adults_below_50 - young_adults - children
adults_below_50 # 37.5% --> 37.5% values as 40

97790

In [84]:
elderly_below_80 = age_col.age[age_col.age < 80].count()
elderly_below_80 = elderly_below_80 - adults_below_50 - young_adults - children
elderly_below_80 # 28.9% --> 29% values as 67

75375

In [86]:
very_elderly = age_col.age[age_col.age >= 80].count()
very_elderly # 5.7% --> values as 90

15094

In [87]:
age_col

Unnamed: 0,age,format,format2,category,category_children,category_young_adults,adults_below_50,elderly
0,,,,False,False,False,False,False
1,21.0,21,21,False,False,True,True,False
2,94.0,94,94,False,False,False,False,True
3,,,,False,False,False,False,False
4,2.0,2,2,True,True,True,True,False
5,29.0,29,29,False,False,True,True,False
6,,,,False,False,False,False,False
7,,,,False,False,False,False,False
8,47.0,47,35,False,False,False,True,False
9,,,,False,False,False,False,False


In [89]:
age_col = age_col.drop(['format', 'format2', 'category', 'category_children', 'category_young_adults', 'adults_below_50', 'elderly'], axis=1)
age_col

Unnamed: 0,age
0,
1,21.0
2,94.0
3,
4,2.0
5,29.0
6,
7,
8,47.0
9,


In [None]:
# 296,874 values to be imputed
# 16,625 - children
# 65,321 - young_adults
# 112,014 - adults < 50
# 86,093 - adults > 50 < 80
# 16,821 - elderly 

296874

In [None]:
df_with_nan = df[df.isnull().any(axis=1)] # all data rows with NaN 

In [None]:
df_with_nan
df = df.dropna()
df #<------ data with all existing values