## CA 2 - Review of Agriculture in Ireland

### Organic food production



In [1]:
# Importing the libraries

# libraries for graphics
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

# create tables
from tabulate import tabulate

#graphics for calculations
import numpy as np
import pandas as pd

# libraries for analysis
import datetime 
import statistics
import pylab

from scipy import stats
from scipy.stats import binom
from scipy.stats import poisson




###  Local Notebook Functions 

In [None]:
def fill_empty_cells(data_frame):
    # fill an na cells with 0
    data_frame = data_frame.fillna(0)

    
def change_type(data_frame, col_name):
    # change the type to an int
    data_frame.col_name.astype(int)



### Step 1 - Import the dataframes

Dataframes imported are the overall dataset for the European Economic Community (data includes Iceland / the UK / Norway etc), the datasets for Ireland and Spain.  Spain has been selected as the comparison country based on 2020 milk production.

In [2]:
org_milk_df = pd.read_csv("Datasets/milk_df.csv")


org_milk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   agriprod     742 non-null    object 
 1   Unit         742 non-null    object 
 2   Country      742 non-null    object 
 3   Year         742 non-null    int64  
 4   Value        722 non-null    float64
 5   Description  742 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 34.9+ KB


In [14]:

# convert the data type float to an int
# fill an na cells with 0
org_milk_df = org_milk_df.fillna(0)
#fill_empty_cells(data_frame)

#check on import
#org_milk_df.info()

In [15]:
irl_milk_df = pd.read_csv("Datasets/irl_milk_df.csv")

# convert the data type float to an int

# fill an na cells with 0
irl_milk_df = irl_milk_df.fillna(0)
#fill_empty_cells(data_frame)

#check on import
#irl_milk_df.info()

In [16]:
es_milk_df = pd.read_csv("Datasets/es_milk_df.csv")

# convert the data type float to an int

# fill an na cells with 0
es_milk_df = es_milk_df.fillna(0)
#fill_empty_cells(data_frame)

#check on import
#es_milk_df.info()

### Step 2 -  Descriptive Statistics in the Dataframe

#### REFERENCE - https://medium.com/analytics-vidhya/descriptive-statistics-in-data-science-with-illustrations-in-python-efd5ccc152f1

In [17]:
# show descriptive statistics relating
# to the EEC organic milk production dataframe

org_milk_df.describe()

Unnamed: 0,Year,Value
count,742.0,742.0
mean,2016.489218,116157.2
std,2.730201,214914.8
min,2012.0,0.0
25%,2014.0,2156.75
50%,2016.5,16922.5
75%,2019.0,98542.0
max,2021.0,1271853.0


In [18]:
# show descriptive statistics relating
# to the Irish organic milk production dataframe

irl_milk_df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Value
count,21.0,21.0,21.0
mean,364.190476,2017.190476,7406.380952
std,225.41531,2.676174,4706.356823
min,126.0,2013.0,394.0
25%,131.0,2015.0,3238.0
50%,380.0,2017.0,7100.0
75%,618.0,2019.0,10722.0
max,623.0,2021.0,17791.0


In [19]:
# show descriptive statistics relating
# to the Spanish organic milk production dataframe

es_milk_df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Value
count,30.0,30.0,30.0
mean,334.166667,2016.5,25662.9
std,204.815659,2.921384,19269.414558
min,79.0,2012.0,1071.0
25%,86.25,2014.0,12729.0
50%,342.5,2016.5,20719.5
75%,573.75,2019.0,28316.25
max,581.0,2021.0,77203.0


In [21]:
# calculate the variance of the 3 datasets

eec_variance = statistics.variance(org_milk_df['Value'])
irish_variance = statistics.variance(irl_milk_df['Value'])
spanish_variance = statistics.variance(es_milk_df['Value'])

print(eec_variance, irish_variance, spanish_variance)

46188364685.114784 22149794.54761905 371310337.4034483


### Test for a Normal Distribution

##### Reference - https://towardsdatascience.com/6-ways-to-test-for-a-normal-distribution-which-one-to-use-9dcf47d8fa93#:~:text=The%20Shapiro%20Wilk%20test%20is,for%20example%20the%20KS%20test.



TIME_PERIOD
OBS_VALUE
Dairy
Eggs
Honey
Meat
    

In [None]:
# graphic test for a normal distribution on the Irish data

stats.probplot(irl_milk_df['Value'], dist="norm", plot=pylab)
plt.title("Probability Plot - Ireland organic milk production")
pylab.show()


In [None]:
# graphic test for a normal distribution on the full EEC data

stats.probplot(org_milk_df['Value'], dist="norm", plot=pylab)
plt.title("Probability Plot - EEC organic milk production")
pylab.show()


In [None]:
# graphic test for a normal distribution on the Spanish data

stats.probplot(es_milk_df['Value'], dist="norm", plot=pylab)
plt.title("Probability Plot - Spanish organic milk production")
pylab.show()


### Shapiro Wilks Test 

In [None]:
# Normalacy test used is shapriro wilks test .. 
# both data samples are less than 50 rows, is the recommended test for this sample size
# https://statistics.laerd.com/spss-tutorials/testing-for-normality-using-spss-statistics.php


import statsmodels.api as sm
from statsmodels.formula.api import ols
from scipy import stats
import scipy as scipy

In [None]:
# Shapiro wilk test with 'Value' as target variable

irl_shapiro = stats.shapiro(es_milk_df['Value'])

#stats.shapiro(es_milk_df['Value'])

In [None]:
# Shapiro wilk test with 'Value' as target variable

#stats.shapiro(irl_milk_df['Value'])

es_shapiro = stats.shapiro(irl_milk_df['Value'])


## Step 3 - Inferential Statistics in the Dataframe

## Step 4 - Inferential Statistics in the Dataframe

## Step 5 -  Comparision using Inferential Statistics 

## STATISTICAL RESULTS

In [22]:
table = [['Statistics', 'EEC data', 'Irish data', 'Spanish data'], 
         ['Shapiro', 'TBC','irl_shapiro ', 'es_shapiro '], 
         ['Variance', 'eec_variance','irish_variance', 'spanish_variance'], 
         ['2 - TBC', 'TBC','TBC', 'TBC'], 
         ['3 - TBC', 'TBC','TBC', 'TBC'],
        ]