# Table of Contents

- 1.0 Setup
- 2.0 Data Cleaning
- 3.0 Exporting Data

## 1.0 Setup

In [1]:
# Importing libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Defining the path to the project's data folder
# The FBS_DATA_DIR environment variable, when set, overrides the default
# location below, so the notebook can run on another machine without editing this cell

path = os.environ.get('FBS_DATA_DIR', r'E:\OneDrive\ANITA\DATA ANALYST\IMMERSION\Achievement 6\11-22 Food Balance Sheet Analysis\2.0 Data')

In [3]:
# Loading fbs_start.pkl from the original-data folder into fbs

fbs_start_file = os.path.join(path, '2.1 Original data', 'fbs_start.pkl')
fbs = pd.read_pickle(fbs_start_file)

## 2.0 Data Cleaning

In [4]:
# Previewing the first rows of fbs

fbs.head()

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,FBS,Food Balances (2010-),4,Afghanistan,511,Total Population - Both sexes,S2501,Population,2010,2010,1000 persons,29186.0,X,Figure from international organizations
1,FBS,Food Balances (2010-),4,Afghanistan,511,Total Population - Both sexes,S2501,Population,2011,2011,1000 persons,30117.0,X,Figure from international organizations
2,FBS,Food Balances (2010-),4,Afghanistan,511,Total Population - Both sexes,S2501,Population,2012,2012,1000 persons,31161.0,X,Figure from international organizations
3,FBS,Food Balances (2010-),4,Afghanistan,511,Total Population - Both sexes,S2501,Population,2013,2013,1000 persons,32270.0,X,Figure from international organizations
4,FBS,Food Balances (2010-),4,Afghanistan,511,Total Population - Both sexes,S2501,Population,2014,2014,1000 persons,33371.0,X,Figure from international organizations


In [5]:
# Checking the number of rows and columns

fbs.shape

(3000600, 14)

In [6]:
# Counting the distinct values in each column

fbs.nunique()

Domain Code             1
Domain                  1
Area Code (M49)       180
Area                  180
Element Code           18
Element                18
Item Code (CPC)        99
Item                   99
Year Code              10
Year                   10
Unit                    5
Value               29242
Flag                    3
Flag Description        3
dtype: int64

In [7]:
# Checking for mixed types
# A column is printed when its values do not all share one Python type
# NOTE(review): the columns reported are likely the object columns that also
# hold missing values (text mixed with NaN) - confirm against the null counts

for col in fbs.columns:
    # Series.map replaces the deprecated DataFrame.applymap and avoids building
    # a filtered copy of the whole frame just to test whether any row differs
    if fbs[col].map(type).nunique() > 1:
        print(col)


Unit
Flag
Flag Description


In [8]:
# Counting the missing values in each column of fbs

fbs.isna().sum()

Domain Code               0
Domain                    0
Area Code (M49)           0
Area                      0
Element Code              0
Element                   0
Item Code (CPC)           0
Item                      0
Year Code                 0
Year                      0
Unit                1184345
Value               1186000
Flag                1184345
Flag Description    1184345
dtype: int64

In [9]:
# Keeping only the rows of fbs whose "Value" is missing

value_is_missing = fbs['Value'].isna()
fbs_nan = fbs[value_is_missing]

In [10]:
# Displaying the rows with a missing "Value"
# Missing values were left as is: no cell shown in this notebook fills or drops them

fbs_nan

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description
40,FBS,Food Balances (2010-),4,Afghanistan,5911,Export Quantity,S2511,Wheat and products,2010,2010,,,,
41,FBS,Food Balances (2010-),4,Afghanistan,5911,Export Quantity,S2511,Wheat and products,2011,2011,,,,
42,FBS,Food Balances (2010-),4,Afghanistan,5911,Export Quantity,S2511,Wheat and products,2012,2012,,,,
43,FBS,Food Balances (2010-),4,Afghanistan,5911,Export Quantity,S2511,Wheat and products,2013,2013,,,,
94,FBS,Food Balances (2010-),4,Afghanistan,5131,Processing,S2511,Wheat and products,2014,2014,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199975,FBS,Food Balances (2010-),716,Zimbabwe,5171,Tourist consumption,S2899,Miscellaneous,2015,2015,,,,
199976,FBS,Food Balances (2010-),716,Zimbabwe,5171,Tourist consumption,S2899,Miscellaneous,2016,2016,,,,
199977,FBS,Food Balances (2010-),716,Zimbabwe,5171,Tourist consumption,S2899,Miscellaneous,2017,2017,,,,
199978,FBS,Food Balances (2010-),716,Zimbabwe,5171,Tourist consumption,S2899,Miscellaneous,2018,2018,,,,


In [11]:
# Collecting the rows of fbs that repeat an earlier row across all columns

is_repeated_row = fbs.duplicated()
fbs_duplicates = fbs[is_repeated_row]

In [12]:
# Displaying the duplicated rows
# No duplicates were found: the subset is empty

fbs_duplicates

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description


In [13]:
# Checking the data type of each column

fbs.dtypes

Domain Code          object
Domain               object
Area Code (M49)       int64
Area                 object
Element Code          int64
Element              object
Item Code (CPC)      object
Item                 object
Year Code             int64
Year                  int64
Unit                 object
Value               float64
Flag                 object
Flag Description     object
dtype: object

In [14]:
# Removing the columns that are not needed; all rows and the other columns are kept

fbs = fbs.drop(
    columns = [
        'Domain Code',
        'Domain',
        'Area Code (M49)',
        'Item Code (CPC)',
        'Year Code',
        'Flag',
        'Flag Description',
    ]
)

In [15]:
# Confirming that 7 of the 14 columns remain and that no rows were lost

fbs.shape

(3000600, 7)

In [16]:
# Checking the data types of the remaining columns

fbs.dtypes

Area             object
Element Code      int64
Element          object
Item             object
Year              int64
Unit             object
Value           float64
dtype: object

In [17]:
# Changing the data type of the column "Element Code" from int64 to string (object),
# so that it is no longer handled as a numeric column

fbs['Element Code'] = fbs['Element Code'].astype('str')

In [18]:
# Confirming that "Element Code" is now stored as object

fbs.dtypes

Area             object
Element Code     object
Element          object
Item             object
Year              int64
Unit             object
Value           float64
dtype: object

In [19]:
# Summary statistics of the numeric columns ("Year" and "Value")

fbs.describe()

Unnamed: 0,Year,Value
count,3000600.0,1814600.0
mean,2014.5,235.5983
std,2.872282,6620.04
min,2010.0,-19471.0
25%,2012.0,0.0
50%,2014.5,0.1
75%,2017.0,8.0
max,2019.0,1465634.0


In [20]:
# Previewing the first rows of the cleaned fbs

fbs.head()

Unnamed: 0,Area,Element Code,Element,Item,Year,Unit,Value
0,Afghanistan,511,Total Population - Both sexes,Population,2010,1000 persons,29186.0
1,Afghanistan,511,Total Population - Both sexes,Population,2011,1000 persons,30117.0
2,Afghanistan,511,Total Population - Both sexes,Population,2012,1000 persons,31161.0
3,Afghanistan,511,Total Population - Both sexes,Population,2013,1000 persons,32270.0
4,Afghanistan,511,Total Population - Both sexes,Population,2014,1000 persons,33371.0


In [21]:
# Checking the final number of rows and columns before exporting

fbs.shape

(3000600, 7)

## 3.0 Exporting Data

In [22]:
# Exporting the fbs to fbs.pkl
# The output folder is created first if it does not exist yet, so that
# to_pickle does not fail on a missing directory

prepared_dir = os.path.join(path, '2.2 Prepared data')
os.makedirs(prepared_dir, exist_ok = True)
fbs.to_pickle(os.path.join(prepared_dir, 'fbs.pkl'))