In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.impute import SimpleImputer
from scipy import stats

In [2]:
# Read the two input tables from CSV files located in the working directory.
# df1 <- "country-wise-average.csv", df2 <- "malnutrition-estimates.csv".
df1 = pd.read_csv(filepath_or_buffer="country-wise-average.csv")
df2 = pd.read_csv(filepath_or_buffer="malnutrition-estimates.csv")

In [3]:
# Preview the first five rows of each table as plain text.
# A single print call with a newline separator emits the same text as
# printing the four pieces one after another.
print(
    "First 5 lines of data \n\n",
    df1.head(),
    "\n",
    df2.head(),
    sep="\n",
)

First 5 lines of data 


       Country  Income Classification  Severe Wasting    Wasting  Overweight  \
0  AFGHANISTAN                    0.0        3.033333  10.350000    5.125000   
1      ALBANIA                    2.0        4.075000   7.760000   20.800000   
2      ALGERIA                    2.0        2.733333   5.942857   12.833333   
3       ANGOLA                    1.0        2.400000   6.933333    2.550000   
4    ARGENTINA                    2.0        0.200000   2.150000   11.125000   

    Stunting  Underweight  U5 Population ('000s)  
0  47.775000    30.375000            4918.561500  
1  24.160000     7.700000             232.859800  
2  19.571429     7.342857            3565.213143  
3  42.633333    23.600000            3980.054000  
4  10.025000     2.600000            3613.651750  


   Unnamed: 0 ISO code      Country Survey Year  Year  Income Classification  \
0           0      AFG  AFGHANISTAN        1997  1997                      0   
1           1      AFG  AF

In [4]:
# Report the dtype of every column in both tables.
# The pieces are joined by newlines, matching separate print calls.
print(
    "\n\n\nDatatype\n",
    df1.dtypes,
    "\n",
    df2.dtypes,
    sep="\n",
)




Datatype

Country                   object
Income Classification    float64
Severe Wasting           float64
Wasting                  float64
Overweight               float64
Stunting                 float64
Underweight              float64
U5 Population ('000s)    float64
dtype: object


Unnamed: 0                 int64
ISO code                  object
Country                   object
Survey Year               object
Year                       int64
Income Classification      int64
LDC                      float64
LIFD                     float64
LLDC or SID2             float64
Survey Sample (N)         object
Severe Wasting           float64
Wasting                  float64
Overweight               float64
Stunting                 float64
Underweight              float64
Notes                     object
Report Author             object
Source                    object
Short Source              object
U5 Population ('000s)    float64
dtype: object


In [5]:
# Count missing values per column in both tables.
# `isna` is the pandas alias of `isnull`, so the counts are the same.
print(
    "\n\n\nNumber of null values\n",
    df1.isna().sum(),
    "\n",
    df2.isna().sum(),
    sep="\n",
)




Number of null values

Country                   0
Income Classification     0
Severe Wasting           12
Wasting                   2
Overweight                3
Stunting                  1
Underweight               2
U5 Population ('000s)     0
dtype: int64


Unnamed: 0                 0
ISO code                   0
Country                    0
Survey Year                0
Year                       0
Income Classification      0
LDC                        0
LIFD                       0
LLDC or SID2               0
Survey Sample (N)         63
Severe Wasting           228
Wasting                   47
Overweight               136
Stunting                  37
Underweight               22
Notes                    597
Report Author              0
Source                     0
Short Source               0
U5 Population ('000s)      0
dtype: int64


In [6]:
# Summary statistics (describe) for the numeric columns of both tables.
# The text is assembled first and written with one print call.
print(
    "\n".join(
        [
            "\n\n\nData summary\n",
            str(df1.describe()),
            "\n",
            str(df2.describe()),
        ]
    )
)




Data summary

       Income Classification  Severe Wasting     Wasting  Overweight  \
count             152.000000      140.000000  150.000000  149.000000   
mean                1.427632        2.168650    6.599257    7.201638   
std                 0.967019        1.708939    4.481723    4.649144   
min                 0.000000        0.000000    0.000000    0.962500   
25%                 1.000000        0.900000    3.262500    3.850000   
50%                 1.000000        1.872500    5.710714    6.300000   
75%                 2.000000        2.822727    8.740476    9.080000   
max                 3.000000       11.400000   23.650000   26.500000   

         Stunting  Underweight  U5 Population ('000s)  
count  151.000000   150.000000             152.000000  
mean    25.814728    13.503047            4042.927052  
std     14.686807    10.895839           13164.191927  
min      1.000000     0.100000               1.000000  
25%     13.485000     4.305000             241.765813 

In [7]:
# Report the (rows, columns) dimensions of both tables.
# `shape` is a 2-tuple, so it is unpacked straight into the two placeholders.
print(
    "\n\n\ndf1 shape\n",
    "df1 has {} rows and {} columns".format(*df1.shape),
    "\n",
    "df2 has {} rows and {} columns".format(*df2.shape),
    sep="\n",
)




df1 shape

df1 has 152 rows and 8 columns


df2 has 924 rows and 20 columns


In [8]:
# Clean both tables so that neither frame contains NaN afterwards.
#
# Step 1 - drop rows that lack indicators which are not imputed below.
#   `.copy()` makes each result an independent frame, so the column
#   assignments further down cannot trigger SettingWithCopyWarning.
# Step 2 - keep rows whose 'Notes' value is missing. That object-dtype column
#   is null in 597 of df2's 924 rows; requiring it in `dropna` would discard
#   most surveys only because they carry no annotation. Missing notes are
#   stored as an empty string instead, which still leaves the column NaN-free.
# Step 3 - median-impute the remaining numeric gaps. `fit_transform` refits
#   the imputer on each call, so df1 and df2 each use their own medians.
df1 = df1.dropna(subset=['Wasting', 'Overweight', 'Stunting', 'Underweight']).copy()
df2 = df2.dropna(subset=['Stunting', 'Underweight', 'Survey Sample (N)']).copy()
df2['Notes'] = df2['Notes'].fillna('')

imputer = SimpleImputer(missing_values=np.nan, strategy='median')
df1[['Severe Wasting']] = imputer.fit_transform(df1[['Severe Wasting']])
df2[['Severe Wasting', 'Wasting', 'Overweight']] = imputer.fit_transform(
    df2[['Severe Wasting', 'Wasting', 'Overweight']]
)

In [9]:
# Verify the cleaning step: print the per-column count of missing values
# for each table (`isna` is the pandas alias of `isnull`).
print(df1.isna().sum(), df2.isna().sum(), sep="\n")

Country                  0
Income Classification    0
Severe Wasting           0
Wasting                  0
Overweight               0
Stunting                 0
Underweight              0
U5 Population ('000s)    0
dtype: int64
Unnamed: 0               0
ISO code                 0
Country                  0
Survey Year              0
Year                     0
Income Classification    0
LDC                      0
LIFD                     0
LLDC or SID2             0
Survey Sample (N)        0
Severe Wasting           0
Wasting                  0
Overweight               0
Stunting                 0
Underweight              0
Notes                    0
Report Author            0
Source                   0
Short Source             0
U5 Population ('000s)    0
dtype: int64
