# Task for Today  

***

## Predicting COVID-19 Mortality Based on Diet  

Given *data about various countries and their daily intakes of different food groups*, let's try to predict whether a given country will have a **high or low COVID-19 mortality rate**.

We will use a TensorFlow artificial neural network (ANN) to make our predictions.

# Getting Started

In [1]:
! pip install --upgrade numpy

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf

In [3]:
# Load the food-supply CSV (path is relative to the working directory) into a DataFrame.
data = pd.read_csv('Food_Supply_Quantity_kg_Data.csv')

In [4]:
# Display the freshly loaded frame as this cell's output.
data

Unnamed: 0,Country,Alcoholic Beverages,Animal fats,Animal Products,"Aquatic Products, Other",Cereals - Excluding Beer,Eggs,"Fish, Seafood",Fruits - Excluding Wine,Meat,...,Vegetables,Vegetal Products,Obesity,Undernourished,Confirmed,Deaths,Recovered,Active,Population,Unit (all except Population)
0,Afghanistan,0.0014,0.1973,9.4341,0.0000,24.8097,0.2099,0.0350,5.3495,1.2020,...,6.7642,40.5645,4.5,29.8,0.142134,0.006186,0.123374,0.012574,38928000.0,%
1,Albania,1.6719,0.1357,18.7684,0.0000,5.7817,0.5815,0.2126,6.7861,1.8845,...,11.7753,31.2304,22.3,6.2,2.967301,0.050951,1.792636,1.123714,2838000.0,%
2,Algeria,0.2711,0.0282,9.6334,0.0000,13.6816,0.5277,0.2416,6.3801,1.1305,...,11.6484,40.3651,26.6,3.9,0.244897,0.006558,0.167572,0.070767,44357000.0,%
3,Angola,5.8087,0.0560,4.9278,0.0000,9.1085,0.0587,1.7707,6.0005,2.0571,...,2.3041,45.0722,6.8,25,0.061687,0.001461,0.056808,0.003419,32522000.0,%
4,Antigua and Barbuda,3.5764,0.0087,16.6613,0.0000,5.9960,0.2274,4.1489,10.7451,5.6888,...,5.4495,33.3233,19.1,,0.293878,0.007143,0.190816,0.095918,98000.0,%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,Venezuela (Bolivarian Republic of),2.5952,0.0403,14.7565,0.0000,12.9253,0.3389,0.9456,7.6460,3.8328,...,4.1474,35.2416,25.2,21.2,0.452585,0.004287,0.424399,0.023899,28645000.0,%
166,Vietnam,1.4591,0.1640,8.5765,0.0042,16.8740,0.3077,2.6392,5.9029,4.4382,...,11.9508,41.4232,2.1,9.3,0.002063,0.000036,0.001526,0.000501,96209000.0,%
167,Yemen,0.0364,0.0446,5.7874,0.0000,27.2077,0.2579,0.5240,5.1344,2.7871,...,3.2135,44.2126,14.1,38.9,0.007131,0.002062,0.004788,0.000282,29826000.0,%
168,Zambia,5.7360,0.0829,6.0197,0.0000,21.1938,0.3399,1.6924,1.0183,1.8427,...,3.4649,43.9789,6.5,46.7,0.334133,0.004564,0.290524,0.039045,18384000.0,%


In [5]:
# Print column names, non-null counts and dtypes for a quick schema check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170 entries, 0 to 169
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country                       170 non-null    object 
 1   Alcoholic Beverages           170 non-null    float64
 2   Animal fats                   170 non-null    float64
 3   Animal Products               170 non-null    float64
 4   Aquatic Products, Other       170 non-null    float64
 5   Cereals - Excluding Beer      170 non-null    float64
 6   Eggs                          170 non-null    float64
 7   Fish, Seafood                 170 non-null    float64
 8   Fruits - Excluding Wine       170 non-null    float64
 9   Meat                          170 non-null    float64
 10  Milk - Excluding Butter       170 non-null    float64
 11  Miscellaneous                 170 non-null    float64
 12  Offals                        170 non-null    float64
 13  Oilcr

# Preprocessing

In [6]:
# Discard the unit-marker column before any further preprocessing.
data = data.drop(columns=['Unit (all except Population)'])

## Missing Values

In [7]:
# Count the missing values in each column.
data.isna().sum()

Country                     0
Alcoholic Beverages         0
Animal fats                 0
Animal Products             0
Aquatic Products, Other     0
Cereals - Excluding Beer    0
Eggs                        0
Fish, Seafood               0
Fruits - Excluding Wine     0
Meat                        0
Milk - Excluding Butter     0
Miscellaneous               0
Offals                      0
Oilcrops                    0
Pulses                      0
Spices                      0
Starchy Roots               0
Stimulants                  0
Sugar & Sweeteners          0
Sugar Crops                 0
Treenuts                    0
Vegetable Oils              0
Vegetables                  0
Vegetal Products            0
Obesity                     3
Undernourished              7
Confirmed                   6
Deaths                      6
Recovered                   6
Active                      8
Population                  0
dtype: int64

In [8]:
# Mean-impute every non-object column that contains missing values.
# The per-column NaN counts are computed once up front: filling one column
# never changes the count of a column that has not been visited yet, so the
# original per-iteration recomputation over the whole frame was redundant.
missing_counts = data.isna().sum()

for column in data.columns:
    # Object columns (e.g. strings) are skipped because a mean is not defined for them.
    if data[column].dtype != 'object' and missing_counts[column] > 0:
        data[column] = data[column].fillna(data[column].mean())

### Dealing with the Undernourished column

In [9]:
# Frequency of each distinct value in the Undernourished column.
data['Undernourished'].value_counts()

<2.5    44
2.7      3
16.5     3
7.1      3
6.2      3
        ..
7        1
2.5      1
41       1
29.4     1
30.7     1
Name: Undernourished, Length: 98, dtype: int64

In [10]:
# Select the rows whose Undernourished value is not the '<2.5' marker and cast
# them to float (missing entries become NaN).
# The builtin float replaces the np.float alias, which was deprecated in
# NumPy 1.20 and is no longer available in newer NumPy releases.
undernourished_numeric = data.loc[data['Undernourished'] != '<2.5', 'Undernourished'].astype(float)
undernourished_numeric

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.


0      29.8
1       6.2
2       3.9
3      25.0
4       NaN
       ... 
165    21.2
166     9.3
167    38.9
168    46.7
169    51.3
Name: Undernourished, Length: 126, dtype: float64

In [11]:
# Replace the remaining NaNs with the subset mean, then cut the values into
# three equal-frequency bins labelled 1 (lowest) to 3 (highest).
undernourished_numeric = pd.qcut(
    undernourished_numeric.fillna(undernourished_numeric.mean()),
    q=3,
    labels=[1, 2, 3],
)
undernourished_numeric

0      3
1      1
2      1
3      3
4      2
      ..
165    3
166    2
167    3
168    3
169    3
Name: Undernourished, Length: 126, dtype: category
Categories (3, int64): [1 < 2 < 3]

In [12]:
# Write the bin labels back into the frame, targeting only the rows of the numeric subset.
data.loc[undernourished_numeric.index, 'Undernourished'] = undernourished_numeric

In [13]:
# Encode the '<2.5' marker as level 0; every other entry is passed through unchanged.
data['Undernourished'] = data['Undernourished'].map(
    lambda level: 0 if level == '<2.5' else level
)

In [14]:
# Check how the rows are distributed across the encoded Undernourished levels.
data['Undernourished'].value_counts()

0    44
1    42
2    42
3    42
Name: Undernourished, dtype: int64

In [15]:
# Display the frame after the Undernourished column has been encoded.
data

Unnamed: 0,Country,Alcoholic Beverages,Animal fats,Animal Products,"Aquatic Products, Other",Cereals - Excluding Beer,Eggs,"Fish, Seafood",Fruits - Excluding Wine,Meat,...,Vegetable Oils,Vegetables,Vegetal Products,Obesity,Undernourished,Confirmed,Deaths,Recovered,Active,Population
0,Afghanistan,0.0014,0.1973,9.4341,0.0000,24.8097,0.2099,0.0350,5.3495,1.2020,...,0.5345,6.7642,40.5645,4.5,3,0.142134,0.006186,0.123374,0.012574,38928000.0
1,Albania,1.6719,0.1357,18.7684,0.0000,5.7817,0.5815,0.2126,6.7861,1.8845,...,0.3261,11.7753,31.2304,22.3,1,2.967301,0.050951,1.792636,1.123714,2838000.0
2,Algeria,0.2711,0.0282,9.6334,0.0000,13.6816,0.5277,0.2416,6.3801,1.1305,...,1.0310,11.6484,40.3651,26.6,1,0.244897,0.006558,0.167572,0.070767,44357000.0
3,Angola,5.8087,0.0560,4.9278,0.0000,9.1085,0.0587,1.7707,6.0005,2.0571,...,0.6463,2.3041,45.0722,6.8,3,0.061687,0.001461,0.056808,0.003419,32522000.0
4,Antigua and Barbuda,3.5764,0.0087,16.6613,0.0000,5.9960,0.2274,4.1489,10.7451,5.6888,...,0.8102,5.4495,33.3233,19.1,2,0.293878,0.007143,0.190816,0.095918,98000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,Venezuela (Bolivarian Republic of),2.5952,0.0403,14.7565,0.0000,12.9253,0.3389,0.9456,7.6460,3.8328,...,1.3734,4.1474,35.2416,25.2,3,0.452585,0.004287,0.424399,0.023899,28645000.0
166,Vietnam,1.4591,0.1640,8.5765,0.0042,16.8740,0.3077,2.6392,5.9029,4.4382,...,0.2201,11.9508,41.4232,2.1,2,0.002063,0.000036,0.001526,0.000501,96209000.0
167,Yemen,0.0364,0.0446,5.7874,0.0000,27.2077,0.2579,0.5240,5.1344,2.7871,...,1.0811,3.2135,44.2126,14.1,3,0.007131,0.002062,0.004788,0.000282,29826000.0
168,Zambia,5.7360,0.0829,6.0197,0.0000,21.1938,0.3399,1.6924,1.0183,1.8427,...,0.6657,3.4649,43.9789,6.5,3,0.334133,0.004564,0.290524,0.039045,18384000.0


## Feature and Target Selection

In [16]:
# Remove the country name together with the Confirmed / Recovered / Active
# case columns, leaving Deaths as the only COVID-19 outcome in the frame.
data = data.drop(columns=['Country', 'Confirmed', 'Recovered', 'Active'])

In [17]:
# Preview the class balance of a two-quantile split of Deaths (0 = lower bin, 1 = upper bin).
pd.qcut(data['Deaths'], q=2, labels=[0, 1]).value_counts()

0    85
1    85
Name: Deaths, dtype: int64

In [18]:
# Turn Deaths into a binary target: 0 for the lower quantile bin, 1 for the upper one.
data = data.assign(Deaths=pd.qcut(data['Deaths'], q=2, labels=[0, 1]))

## Splitting and Scaling

In [19]:
# Display the frame that is about to be separated into features and target.
data

Unnamed: 0,Alcoholic Beverages,Animal fats,Animal Products,"Aquatic Products, Other",Cereals - Excluding Beer,Eggs,"Fish, Seafood",Fruits - Excluding Wine,Meat,Milk - Excluding Butter,...,Sugar & Sweeteners,Sugar Crops,Treenuts,Vegetable Oils,Vegetables,Vegetal Products,Obesity,Undernourished,Deaths,Population
0,0.0014,0.1973,9.4341,0.0000,24.8097,0.2099,0.0350,5.3495,1.2020,7.5828,...,1.3489,0.000,0.0770,0.5345,6.7642,40.5645,4.5,3,0,38928000.0
1,1.6719,0.1357,18.7684,0.0000,5.7817,0.5815,0.2126,6.7861,1.8845,15.7213,...,1.5367,0.000,0.1515,0.3261,11.7753,31.2304,22.3,1,1,2838000.0
2,0.2711,0.0282,9.6334,0.0000,13.6816,0.5277,0.2416,6.3801,1.1305,7.6189,...,1.8342,0.000,0.1152,1.0310,11.6484,40.3651,26.6,1,0,44357000.0
3,5.8087,0.0560,4.9278,0.0000,9.1085,0.0587,1.7707,6.0005,2.0571,0.8311,...,1.8495,0.000,0.0061,0.6463,2.3041,45.0722,6.8,3,0,32522000.0
4,3.5764,0.0087,16.6613,0.0000,5.9960,0.2274,4.1489,10.7451,5.6888,6.3663,...,3.8749,0.000,0.0253,0.8102,5.4495,33.3233,19.1,2,0,98000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,2.5952,0.0403,14.7565,0.0000,12.9253,0.3389,0.9456,7.6460,3.8328,9.3920,...,3.4106,0.000,0.0009,1.3734,4.1474,35.2416,25.2,3,0,28645000.0
166,1.4591,0.1640,8.5765,0.0042,16.8740,0.3077,2.6392,5.9029,4.4382,0.6069,...,1.2846,0.815,0.3070,0.2201,11.9508,41.4232,2.1,2,0,96209000.0
167,0.0364,0.0446,5.7874,0.0000,27.2077,0.2579,0.5240,5.1344,2.7871,1.8911,...,5.0468,0.000,0.0017,1.0811,3.2135,44.2126,14.1,3,0,29826000.0
168,5.7360,0.0829,6.0197,0.0000,21.1938,0.3399,1.6924,1.0183,1.8427,1.7570,...,1.5632,0.000,0.0014,0.6657,3.4649,43.9789,6.5,3,0,18384000.0


In [20]:
# Features are every column except the binary Deaths label; the label itself is the target.
X = data.drop(columns=['Deaths'])
y = data['Deaths']

In [22]:
# Standardize every feature column with StandardScaler.
# NOTE(review): the scaler is fit on all rows here, before the train/test split
# that follows, so test-set statistics influence the scaling. Consider fitting
# on the training split only.
scaler = StandardScaler()

# X is rebound to the scaler's output (a NumPy array by default, no longer a DataFrame).
X = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore the membership of each split, repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Training

In [25]:
# Check the feature-matrix dimensions (rows, columns).
X.shape

(170, 26)

In [31]:
# Binary classifier: two 64-unit ReLU hidden layers feeding one sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded,
# so this cell keeps working if preprocessing adds or removes feature columns.
inputs = tf.keras.Input(shape=(X_train.shape[1],))
x = tf.keras.layers.Dense(64, activation='relu')(inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)


# Binary cross-entropy matches the single sigmoid output; accuracy and AUC are
# tracked alongside the loss during training and evaluation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        'accuracy',
        tf.keras.metrics.AUC(name='auc')
    ]
)


batch_size = 64
epochs = 14

# validation_split=0.2 holds part of the training data out for per-epoch
# validation metrics; the test split is not touched here.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=batch_size,
    epochs=epochs,
    verbose=0
)

# Results

In [59]:
# Plot the per-epoch training and validation loss recorded by model.fit.
fig = px.line(
    data_frame=history.history,
    y=['loss', 'val_loss'],
    labels=dict(index="Epoch", value="Loss"),
    title="Training and Validation Loss",
)

fig.show()

In [33]:
# Zero-based index of the epoch with the lowest validation loss.
np.argmin(history.history['val_loss'])

13

In [34]:
# Score the model on the held-out test split; the result lists the loss
# followed by the compiled metrics (accuracy, then AUC).
model.evaluate(X_test, y_test)



[0.45731091499328613, 0.7647058963775635, 0.8526315689086914]

In [35]:
# Number of samples in the test split, for context when reading the metrics.
len(y_test)

34

In [57]:
# Save the loss figure to an HTML file in the working directory.
fig.write_html('im.html')