## CA2 ##
## IRELAND LIVE STOCK AND MEAT COMPARED TO OTHER EU COUNTRIES

1. Step 1 : Exploratory Data Analysis
2. Step 2 : Statistics
3. Step 3 : ML Model around our data
4. Step 4 : Optimization with Python

In [1]:
#import of library for exploration of data
import pandas as pd
import numpy as np
import seaborn as sns
import math
import matplotlib.pyplot as plt

# We can suppress the warnings for a better reading
import warnings
warnings.filterwarnings('ignore')



# 1. Exploratory Data Analysis

In [2]:
#Source data generated from the link below
#https://www.fao.org/faostat/en/#data/QV

# Applied filter
# Country = Ireland, Year Only last 5 years including 2016,2017,2018,2019,2020 (2021,2022 data are not available yet)

#Read files from Eurostat
ir_production_value = "faostat/ireland_agriculture_production_value.csv"
ir_prod_value_df = pd.read_csv(ir_production_value)

#Quick Overview of the data
ir_prod_value_df.head()

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Unit,Value,Flag,Flag Description
0,QV,Value of Agricultural Production,372,Ireland,152,Gross Production Value (constant 2014-2016 tho...,1341,Apples,2016,2016,1000 Int. $,10290,E,Estimated value
1,QV,Value of Agricultural Production,372,Ireland,152,Gross Production Value (constant 2014-2016 tho...,1341,Apples,2017,2017,1000 Int. $,10086,E,Estimated value
2,QV,Value of Agricultural Production,372,Ireland,152,Gross Production Value (constant 2014-2016 tho...,1341,Apples,2018,2018,1000 Int. $,9506,E,Estimated value
3,QV,Value of Agricultural Production,372,Ireland,152,Gross Production Value (constant 2014-2016 tho...,1341,Apples,2019,2019,1000 Int. $,9383,E,Estimated value
4,QV,Value of Agricultural Production,372,Ireland,152,Gross Production Value (constant 2014-2016 tho...,1341,Apples,2020,2020,1000 Int. $,9190,E,Estimated value


In [3]:
ir_prod_value_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608 entries, 0 to 607
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Domain Code       608 non-null    object
 1   Domain            608 non-null    object
 2   Area Code (M49)   608 non-null    int64 
 3   Area              608 non-null    object
 4   Element Code      608 non-null    int64 
 5   Element           608 non-null    object
 6   Item Code (CPC)   608 non-null    object
 7   Item              608 non-null    object
 8   Year Code         608 non-null    int64 
 9   Year              608 non-null    int64 
 10  Unit              608 non-null    object
 11  Value             608 non-null    int64 
 12  Flag              608 non-null    object
 13  Flag Description  608 non-null    object
dtypes: int64(5), object(9)
memory usage: 66.6+ KB


In [4]:
#View rows and column count
ir_prod_value_df.shape

(608, 14)

In [5]:
# Check columns name if there is any irrelevant spelling
ir_prod_value_df.columns.values

array(['Domain Code', 'Domain', 'Area Code (M49)', 'Area', 'Element Code',
       'Element', 'Item Code (CPC)', 'Item', 'Year Code', 'Year', 'Unit',
       'Value', 'Flag', 'Flag Description'], dtype=object)

In [6]:
# Standardisation of column name -> Rename column name to keep a single uppercase to make easier our work with analysis. 
ir_prod_value_df.rename(columns = {"Domain Code" :"Domain_code", 
                                  "Domain": "Domain",
                                  "Area Code (M49)":"Area_code_m49",
                                  "Area":"Area",
                                  "Element Code":"Element_code",
                                  "Element": "Element",
                                  "Item Code (CPC)":"Item_code_cpc",
                                  "Item":"Item",
                                  "Year Code":"Year_code",
                                  "Year":"Year",
                                    "Unit": "Unit",
                                    "Value":"Value",
                                    "Flag":"Flag",
                                    "Flag Description" : "Flag_description"}, inplace = True)

In [7]:
# Let's view Unique Value in axes 0 -> In each Column
ir_prod_value_df.nunique(axis = 0)

Domain_code           1
Domain                1
Area_code_m49         1
Area                  1
Element_code          5
Element               5
Item_code_cpc        50
Item                 50
Year_code             5
Year                  5
Unit                  3
Value               542
Flag                  1
Flag_description      1
dtype: int64

In [8]:
#Removing column having a single value as these one wont help us to much for our analysis
#These columns are : Domain_code, Domain, Area_code_m49, Area, Flag, Flag_description
#First lets see differenet values that they contain

print("##### Domain_code: #####")
print(pd.unique(ir_prod_value_df['Domain_code']))
print("\n##### Domain : #####")
print(pd.unique(ir_prod_value_df['Domain']))
print("\n##### Area_code_m49 : #####")
print(pd.unique(ir_prod_value_df['Area_code_m49']))

print("##### Area: #####")
print(pd.unique(ir_prod_value_df['Area']))
print("\n##### Flag : #####")
print(pd.unique(ir_prod_value_df['Flag']))
print("\n##### Flag_descritpion : #####")
print(pd.unique(ir_prod_value_df['Flag_description']))


##### Domain_code: #####
['QV']

##### Domain : #####
['Value of Agricultural Production']

##### Area_code_m49 : #####
[372]
##### Area: #####
['Ireland']

##### Flag : #####
['E']

##### Flag_descritpion : #####
['Estimated value']


In [9]:
#Removal of columns
ir_prod_value_df = ir_prod_value_df.drop(['Domain_code','Domain','Area_code_m49','Area','Flag','Flag_description'], axis=1)

In [10]:
#Check also the value in Element and Element_code
print("\n##### Element : #####")
print(pd.unique(ir_prod_value_df['Element']))
print("\n##### Element_code : #####")
print(pd.unique(ir_prod_value_df['Element_code']))
print("\n##### Unit : #####")
print(pd.unique(ir_prod_value_df['Unit']))


# Definitions and standards used in FAOSTAT 
# I$ = international dollar
# SLC = standard local currency



##### Element : #####
['Gross Production Value (constant 2014-2016 thousand I$)'
 'Gross Production Value (current thousand SLC)'
 'Gross Production Value (constant 2014-2016 thousand SLC)'
 'Gross Production Value (current thousand US$)'
 'Gross Production Value (constant 2014-2016 thousand US$)']

##### Element_code : #####
[152  56  55  57  58]

##### Unit : #####
['1000 Int. $' '1000 SLC' '1000 US$']


In [11]:
## For simplification purpose we consider that the Unit is the US dollar, and SLC for Ireland is 1 Euro = 1 Dollar.
## Lets Remove unnecessary Column: Element, Element_code, Year_code
ir_prod_value_df = ir_prod_value_df.drop(['Element','Element_code','Year_code'], axis=1)
ir_prod_value_df.head()

Unnamed: 0,Item_code_cpc,Item,Year,Unit,Value
0,1341,Apples,2016,1000 Int. $,10290
1,1341,Apples,2017,1000 Int. $,10086
2,1341,Apples,2018,1000 Int. $,9506
3,1341,Apples,2019,1000 Int. $,9383
4,1341,Apples,2020,1000 Int. $,9190


In [12]:
## Check values in Item
print("\n##### Item : #####")
print(pd.unique(ir_prod_value_df['Item']))


##### Item : #####
['Apples' 'Barley' 'Broad beans and horse beans, dry'
 'Broad beans and horse beans, green' 'Cabbages' 'Carrots and turnips'
 'Cauliflowers and broccoli' 'Cereals n.e.c.'
 'Chillies and peppers, green (Capsicum spp. and Pimenta spp.)'
 'Cucumbers and gherkins' 'Currants' 'Hen eggs in shell, fresh'
 'Hop cones' 'Horse meat, fresh or chilled (indigenous)'
 'Leeks and other alliaceous vegetables' 'Lettuce and chicory'
 'Meat of cattle with the bone, fresh or chilled'
 'Meat of cattle with the bone, fresh or chilled (indigenous)'
 'Meat of chickens, fresh or chilled'
 'Meat of chickens, fresh or chilled (indigenous)'
 'Meat of ducks, fresh or chilled (indigenous)'
 'Meat of geese, fresh or chilled (indigenous)'
 'Meat of pig with the bone, fresh or chilled'
 'Meat of pig with the bone, fresh or chilled (indigenous)'
 'Meat of sheep, fresh or chilled'
 'Meat of sheep, fresh or chilled (indigenous)'
 'Meat of turkeys, fresh or chilled (indigenous)' 'Mushrooms and truffles

In [13]:
#Check if we need to remove rows with nan values 
print(ir_prod_value_df.isnull().sum())

Item_code_cpc    0
Item             0
Year             0
Unit             0
Value            0
dtype: int64


In [14]:
#Last Check to view if we are ready to plot all these and start our analysis
ir_prod_value_df.count()

Item_code_cpc    608
Item             608
Year             608
Unit             608
Value            608
dtype: int64

In [15]:
# Import interactive Altair library
import altair as alt

# First Overview of the Interactive View for 2020
ir_prod_2020 = ir_prod_value_df.loc[ir_prod_value_df['Year']==2020]

alt.Chart(ir_prod_2020).mark_point().encode(
    alt.X('Item_code_cpc'),
    alt.Y('Value')
)



In [19]:
select_year = alt.selection_single(
    name='select', fields=['Year'], init={'Year': 2016},
    bind=alt.binding_range(min=2016, max=2020, step=5)
)
alt.Chart(ir_prod_value_df).mark_point(filled=True).encode(
    alt.X('Item_code_cpc', scale=alt.Scale(zero=False)),
    alt.Y('Value', scale=alt.Scale(zero=False)),
    
).add_selection(select_year).transform_filter(select_year)

In [17]:
#Quick Overview of different value contained in each column 
# Visualize different value we have for some attributes to understand how we can progress in our anlysis.


print("\n##### Movement values Are : #####")
print(pd.unique(df['Movement']))
print("\n##### Date values Are : #####")
print(pd.unique(df['Date']))
print("\n##### Time values Are : #####")
print(pd.unique(df['Time']))
print("\n##### Count_type values Are : #####")
print(pd.unique(df['Count_type']))
print("\n##### Arm_crossed values Are : #####")
print(pd.unique(df['Arm_crossed']))
print("\n##### Arm_location values Are : #####")
print(pd.unique(df['Arm_location']))



##### Movement values Are : #####


NameError: name 'df' is not defined

In [None]:
#Plot production of Agriculture for the last 5 Years. 
#Display data per year.
plt.rcParams["figure.dpi"] = 164 #Customizing sizing of display
ir_prod_count = ir_prod_value_df.groupby("Item_code_cpc").sum()
ir_prod_count.plot(kind="bar", title="PRODUCTION", color ="r", xlabel="ITEM_CPC_CODE", ylabel="VALUE")


