In [6]:
pip install missingno

Collecting missingno
  Downloading missingno-0.5.1-py3-none-any.whl (8.7 kB)
Installing collected packages: missingno
Successfully installed missingno-0.5.1
Note: you may need to restart the kernel to use updated packages.


# Project
## Analysis and comparison of the Irish Beef Sector with leading EU countries

# Importing libraries and configuring warnings and display options

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import warnings
import glob
import requests
from countrygroups import EUROPEAN_UNION
from countryinfo import CountryInfo
from functools import partial, reduce 
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
import fancyimpute
from scipy.stats import ks_2samp
import pycountry

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

# Treat +/-inf as missing values in pandas null handling.
# NOTE: this option is deprecated as of pandas 2.1; on newer pandas, convert
# infinities explicitly instead (e.g. df.replace([np.inf, -np.inf], np.nan)).
pd.options.mode.use_inf_as_na = True

# Managing warnings (ignoring them mostly).
# `np.warnings` was an accidental alias that was removed in NumPy 1.24, so the
# filter is registered through the standard-library `warnings` module instead.
# The warning class is looked up in `np.exceptions` when that namespace exists
# and in the top-level namespace otherwise.
_visible_deprecation = getattr(
    getattr(np, 'exceptions', np), 'VisibleDeprecationWarning', DeprecationWarning
)
warnings.filterwarnings('ignore', category=_visible_deprecation)
warnings.filterwarnings('ignore')

# Setting display options
pd.set_option('display.max_columns', 200)
plt.rcParams['figure.figsize'] = (25, 14)
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the old name only when the new one is not available.
_grid_style = 'seaborn-v0_8-whitegrid'
if _grid_style not in plt.style.available:
    _grid_style = 'seaborn-whitegrid'
plt.style.use(_grid_style)

from matplotlib.pyplot import cm
color = 'tab20c'

ModuleNotFoundError: No module named 'fancyimpute'

## Helper functions: `files_to_df`, `process` and `reformat`

In [None]:
def files_to_df(path, col_name):
    """Stack every CSV file in a folder into a single DataFrame.

    Each ``*.csv`` file directly inside ``path`` is passed to ``process`` and
    the resulting frames are concatenated row-wise with a fresh integer index.

    Parameters
    ----------
    path : str
        Folder to search (non-recursively) for ``*.csv`` files.
    col_name : str
        Name given to the value column of every processed file
        (e.g. ``'Rainfall_mm/yr'``).

    Returns
    -------
    pandas.DataFrame
        The concatenation of ``process(file, col_name)`` for every file.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV files.
    """
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the row order of the result reproducible between runs.
    csv_files = sorted(glob.glob(path + "/*.csv"))
    if not csv_files:
        # pd.concat would also raise ValueError here, but without naming the folder.
        raise ValueError(f"No CSV files found in '{path}'")
    frames = [process(filename, col_name) for filename in csv_files]
    return pd.concat(frames, ignore_index=True)

def process(file, col_name):
    """Read one CSV file and reduce it to a ``key`` column and a value column.

    The first line of the file is skipped. The unnamed first column is treated
    as the year, and the header of the second column (a country code, according
    to the original author's note) is prepended to the year to form ``key``.
    The second column itself is renamed to ``col_name``.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    col_name : str
        New name for the second column.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns ``key`` and ``col_name``.
    """
    frame = pd.read_csv(file, skiprows=1).rename(columns={'Unnamed: 0': 'Year'})
    # Header of the second column; reused both as key prefix and rename source.
    source_label = frame.columns[1]
    frame['key'] = source_label + frame['Year'].astype(str)
    frame = frame.rename(columns={source_label: col_name})
    return frame.filter(items=['key', col_name])

def reformat(dataframe, column, areas=None):
    """Spread a long-format frame into one value column per unique item.

    A more transparent alternative to a pivot: for every unique value in
    ``dataframe[column]`` the matching rows (restricted to ``areas``) become a
    small frame of ``key``, ``Area`` and a value column named after the item.
    These frames are then merged pairwise on ``key`` and ``Area``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Long-format data with at least the columns ``Area``, ``Year``,
        ``Value`` and ``column``.
    column : str
        Column whose unique values become the new value columns.
    areas : iterable of str, optional
        Areas to keep. When omitted, the notebook-level ``countries`` list is
        used, which must then be defined before this function is called.

    Returns
    -------
    pandas.DataFrame
        Columns ``key``, ``Area`` and one column per unique item.
        Because the merge is a right join, the rows of the result are those
        of the last item encountered; earlier items are NaN where missing.
    """
    if areas is None:
        areas = countries  # notebook-level list of areas of interest

    in_scope = dataframe[dataframe['Area'].isin(areas)]

    container = {}  # item name -> frame of key, Area and that item's values
    for item in dataframe[column].unique():
        subset = in_scope[in_scope[column] == item]
        # Build the frame with assign/rename so no column is ever set on a
        # filtered slice of the input (avoids chained-assignment problems).
        container[f'{item}'] = (
            subset.assign(key=subset['Area'] + subset['Year'].astype(str))
            [['key', 'Area', 'Value']]
            .rename(columns={'Value': f'{item}'})
        )

    # Fold the per-item frames together on the shared key columns.
    my_reduce = partial(pd.merge, on=['key', 'Area'], how='right')
    df = reduce(my_reduce, container.values())
    return df


## Preparation of cattle, land, price, export, manure, fertiliser, rain and temperature data

The CSV files in the `rawdata` folder were downloaded from FAOSTAT.org.

In [None]:
# Import and prepare the FAOSTAT cattle data.
cattle_global_df = pd.read_csv('rawdata/cattle_global.csv')

# Keep European Union countries only. The explicit .copy() makes cattle_df an
# independent frame, so adding the key column below is not a write to a slice
# of cattle_global_df (which triggers SettingWithCopyWarning).
cattle_df = cattle_global_df[cattle_global_df['Area'].isin(EUROPEAN_UNION.names)].copy()

# Area + Year key used to merge with the other datasets later.
cattle_df['key'] = cattle_df['Area'].astype(str) + cattle_df['Year'].astype(str)
cattle_df.to_csv('data/cattle.csv', index=False)  # full EU extract, all columns

cattle_df = cattle_df[['key', 'Area', 'Value']]
cattle_df.head()  # preview only, rather than rendering the whole frame

## Temporary and permanent pasture area (`land_df`) is likely to be an important predictor of beef production, so it is analysed, prepared and merged with the head-of-cattle data below.

In [None]:

# List of the EU countries present in the cattle data; reformat() reads this
# notebook-level name when selecting areas.
countries = cattle_df['Area'].unique().tolist()

lands_df = pd.read_csv('rawdata/lands.csv')  # raw land-use data

# One value column per land-use item, keyed by Area + Year.
land_df = reformat(lands_df, 'Item')

# A missing temporary-pasture figure is interpreted as zero. The result is
# assigned back to the column: fillna(inplace=True) on a column selection is
# chained assignment and does not update land_df under pandas Copy-on-Write.
land_df['Land under temp. meadows and pastures'] = (
    land_df['Land under temp. meadows and pastures'].fillna(0)
)

# Shorten the long FAOSTAT item names.
land_df = land_df.rename(columns={
    'Land under temp. meadows and pastures': 'Temporary',
    'Land under perm. meadows and pastures': 'Permanent',
})

land_df = land_df[land_df['Area'].isin(EUROPEAN_UNION.names)]  # filter to the EU
land_df.to_csv('data/land.csv', index=False)

# Keep only the key and the temporary / permanent land-usage columns.
land_df = land_df[['key', 'Temporary', 'Permanent']]
land_df.head()

In [None]:
# Join the cattle numbers with the pasture areas on the shared Area + Year key
# (default inner join: only keys present in both frames are kept).
merged_df = cattle_df.merge(land_df, on='key')
merged_df.to_csv('data/merged.csv', index=False)  # write csv to data folder
merged_df

In [None]:
#Common code commented out cell



## Price will also be an influential predictor variable, so clean, normalised pricing data is needed

In [10]:


price_df = pd.read_csv('rawdata/price.csv')

# Reduce to European Union countries. The explicit .copy() makes price_df an
# independent frame, so adding the key column is not a write to a slice.
price_df = price_df[price_df['Area'].isin(EUROPEAN_UNION.names)].copy()

# Area + Year key used to merge with the other datasets later.
price_df['key'] = price_df['Area'].astype(str) + price_df['Year'].astype(str)

# The original selection ended with an empty column name (''), which raises a
# KeyError. 'Value' is the column holding the price figures in this dataset.
price_df = price_df[['key', 'Area', 'Element Code', 'Element', 'Value']]

Unnamed: 0,Domain Code,Domain,Area Code (M49),Area,Element Code,Element,Item Code (CPC),Item,Year Code,Year,Months Code,Months,Unit,Value,Flag,Flag Description,key
519,PP,Producer Prices,40,Austria,5530,Producer Price (LCU/tonne),21111.01,"Meat of cattle with the bone, fresh or chilled",1991,1991,7021,Annual value,LCU,49338.00,A,Official figure,Austria1991
520,PP,Producer Prices,40,Austria,5530,Producer Price (LCU/tonne),21111.01,"Meat of cattle with the bone, fresh or chilled",1992,1992,7021,Annual value,LCU,48602.00,A,Official figure,Austria1992
521,PP,Producer Prices,40,Austria,5530,Producer Price (LCU/tonne),21111.01,"Meat of cattle with the bone, fresh or chilled",1993,1993,7021,Annual value,LCU,46536.00,A,Official figure,Austria1993
522,PP,Producer Prices,40,Austria,5530,Producer Price (LCU/tonne),21111.01,"Meat of cattle with the bone, fresh or chilled",1994,1994,7021,Annual value,LCU,46804.00,A,Official figure,Austria1994
523,PP,Producer Prices,40,Austria,5530,Producer Price (LCU/tonne),21111.01,"Meat of cattle with the bone, fresh or chilled",1995,1995,7021,Annual value,LCU,38625.00,A,Official figure,Austria1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8117,PP,Producer Prices,752,Sweden,5539,Producer Price Index (2014-2016 = 100),21111.01,"Meat of cattle with the bone, fresh or chilled",2017,2017,7021,Annual value,,111.11,I,Imputed value,Sweden2017
8118,PP,Producer Prices,752,Sweden,5539,Producer Price Index (2014-2016 = 100),21111.01,"Meat of cattle with the bone, fresh or chilled",2018,2018,7021,Annual value,,103.68,I,Imputed value,Sweden2018
8119,PP,Producer Prices,752,Sweden,5539,Producer Price Index (2014-2016 = 100),21111.01,"Meat of cattle with the bone, fresh or chilled",2019,2019,7021,Annual value,,102.33,I,Imputed value,Sweden2019
8120,PP,Producer Prices,752,Sweden,5539,Producer Price Index (2014-2016 = 100),21111.01,"Meat of cattle with the bone, fresh or chilled",2020,2020,7021,Annual value,,100.97,I,Imputed value,Sweden2020


In [None]:
# Remaining raw inputs, loaded as-is for preparation in later cells.
export_df = pd.read_csv('rawdata/export.csv')
manure_df = pd.read_csv('rawdata/manure.csv')
fertiliser_df = pd.read_csv('rawdata/fertilizersnutrient.csv')

# Climate Change Knowledge Portal data: every CSV in each folder is processed
# and stacked into a single frame (rainfall first, then temperature).
rain_df = files_to_df('rawdata/Eu_rain_data', 'Rainfall_mm/yr')
temp_df = files_to_df('rawdata/Eu_temp_data', 'Temperature_C')

# Persist both combined climate frames to the data folder.
temp_df.to_csv('data/temp.csv', index=False)
rain_df.to_csv('data/rain.csv', index=False)

# EU countries, in order of first appearance in the cattle data.
countries = cattle_df['Area'].drop_duplicates().tolist()

price_df