In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Read CSV and Display Options

In [None]:
# Load the full commodity price table from the Kaggle input dataset.
commodities_csv = "/kaggle/input/allenunger-global-commodity-prices/all_commodities.csv"
df = pd.read_csv(commodities_csv)

In [None]:
# Lift pandas' display caps so every column and every row is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
# Optional: also lift the column-width cap, which helps when reading long
# text columns such as the sources.
# pd.set_option('display.max_colwidth', None)

# Establish DataFrames for Wheat Prices


In [None]:
# Restrict the data to wheat prices in the 100 years 1801-1900.
# Each condition is a boolean mask; .loc keeps the rows where all of them hold.
is_wheat = df["Commodity"] == "Wheat"
after_1800 = df["Item Year"] > 1800
before_1901 = df["Item Year"] < 1901
wheat_df = df.loc[is_wheat & after_1800 & before_1901]
wheat_df.head()

In [None]:
# Rank locations by how many wheat observations they contribute in the period.
rows_per_location = wheat_df.groupby(["Location", "Commodity"]).size()
wheat_counts = (
    rows_per_location
    .reset_index(name="Wheat_Counts")
    .sort_values(by="Wheat_Counts", ascending=False)
)
wheat_counts.head()

In [None]:
#I have identified locations that have 85 or more Data points for wheat in this time period
#I want to take a look at if there will be potential issues with having different sources or varieties before proceeding.
#I also want to make sure the data is continuous if possible. If not I will have to fill in with appropriate method

# Porto, Portugal 1801-1854 two sets of data 2 sources, variety NA, too many missing data points,45, to continue with
#wheat_df[wheat_df.Location=="Porto"]

# Lwow, Ukraine 1801-1900 variety NA
#wheat_df[wheat_df.Location=="Lwow"]

# Tours, France 1801-1900 variety NA
#wheat_df[wheat_df.Location=="Tours"]

# england/southengland dataset 1800-1900 variety NA for both and I will investigate before combining
#wheat_df[wheat_df.Location=="England"]
#wheat_df[wheat_df.Location=="Southern England"]

# Arnhem, Netherlands 1800-1900 variety NA
#wheat_df[wheat_df.Location=="Arnhem"]

#Ghent, Belgium is NA variety and 1816-1900
#wheat_df[wheat_df.Location=="Ghent"]

#Krakow,Poland variety NA 1800-1900 2 different sources, so I will have to investigate before combining
#wheat_df[wheat_df.Location=="Krakow"]

In [None]:
# Build one frame per location. At this point the two England locations and
# the two Krakow sources are still kept separate.
# reset_index() renumbers each frame's rows from 0.
location = wheat_df["Location"]
sources = wheat_df["Sources"]

# England
england = wheat_df.loc[location == "England"].reset_index()
southern_england = wheat_df.loc[location == "Southern England"].reset_index()

# Poland: Krakow is split by source (Richard Unger vs. everything else)
in_krakow = location == "Krakow"
krakow_unger = wheat_df.loc[in_krakow & (sources == "(Richard Unger)")].reset_index()
krakow_gorkiewicz = wheat_df.loc[in_krakow & (sources != "(Richard Unger)")].reset_index()

# France
tours = wheat_df.loc[location == "Tours"].reset_index()

# Belgium
ghent = wheat_df.loc[location == "Ghent"].reset_index()

# Netherlands
arnhem = wheat_df.loc[location == "Arnhem"].reset_index()

# Ukraine
lwow = wheat_df.loc[location == "Lwow"].reset_index()

# Combine Data for England and Krakow

In [None]:
# Average the two Krakow series into one "Standard Value" column, using the
# Gorkiewicz rows as the template for every other column.
# NOTE(review): the two series are paired by their reset row index, not by
# "Item Year" -- confirm both sources cover the same years in the same order.
krakow_mean_price = krakow_gorkiewicz["Standard Value"].add(krakow_unger["Standard Value"]).div(2)
krakow_combined = krakow_gorkiewicz.copy()
krakow_combined["Standard Value"] = krakow_mean_price
krakow_combined.head()

In [None]:
# Average the England and Southern England series into one "Standard Value"
# column, using the England rows as the template for every other column.
# NOTE(review): the two series are paired by their reset row index, not by
# "Item Year" -- confirm both locations cover the same years in the same order.
england_mean_price = england["Standard Value"].add(southern_england["Standard Value"]).div(2)
england_combined = england.copy()
england_combined["Standard Value"] = england_mean_price
england_combined.head()

# Calculate the Percent Change Relative to 1816

In [None]:
# Express each location's wheat price as a fractional change from that
# location's own 1816 price. 1816 is the first year all locations have data
# for (Ghent has nothing earlier), so these columns can later be used to fill
# in Ghent's missing years. A relative measure can also be more resistant to
# volatility than a plain mean of prices in some cases.
pct_change_targets = [
    (england_combined, "england_pct_change"),
    (tours, "tours_pct_change"),
    (krakow_combined, "krakow_pct_change"),
    (lwow, "lwow_pct_change"),
    (arnhem, "arnhem_pct_change"),
    (ghent, "ghent_pct_change"),
]
for price_frame, pct_column in pct_change_targets:
    # .iat[0] takes the first row dated 1816 and raises IndexError if there is none.
    price_1816 = price_frame.loc[price_frame["Item Year"] == 1816, "Standard Value"].iat[0]
    # The new column is added to the frame in place.
    price_frame[pct_column] = price_frame["Standard Value"] / price_1816 - 1
england_combined.head()

In [None]:
# Keep only the percent-change column and the year from each frame, as
# independent copies, to streamline the aggregation that follows.
year_col = "Item Year"
england_combined2 = england_combined.loc[:, ["england_pct_change", year_col]].copy()
tours2 = tours.loc[:, ["tours_pct_change", year_col]].copy()
krakow_combined2 = krakow_combined.loc[:, ["krakow_pct_change", year_col]].copy()
lwow2 = lwow.loc[:, ["lwow_pct_change", year_col]].copy()
arnhem2 = arnhem.loc[:, ["arnhem_pct_change", year_col]].copy()
ghent2 = ghent.loc[:, ["ghent_pct_change", year_col]].copy()


In [None]:
# Line the six percent-change series up by year, one column per location.
dflist = [england_combined2, krakow_combined2, lwow2, tours2, arnhem2, ghent2]
# Indexing each frame by year makes concat align rows on the year.
dfs = [year_frame.set_index('Item Year') for year_frame in dflist]
Europe = pd.concat(dfs, axis=1)

# Row-wise mean across the location columns.
Europe["avg_pct_change"] = Europe.agg("mean", axis=1)

# Turn "Item Year" back into an ordinary column.
Europe = Europe.reset_index()
Europe.head()

# Impute Missing Values for Ghent

In [None]:
# Impute Ghent's missing years from the Europe-wide average price movement.
# The outer merge on "Item Year" keeps every year present in either frame, so
# years that exist in Europe but not in ghent appear with NaN in all of the
# ghent columns.
ghent3 = pd.merge(Europe[['Item Year', 'avg_pct_change']], ghent, how='outer', on=['Item Year'])

# Estimate each missing price as Ghent's 1816 price scaled by that year's
# average change relative to 1816.
ghent_price_1816 = ghent3.loc[ghent3['Item Year'] == 1816, 'Standard Value'].iat[0]
missing_price = ghent3['Standard Value'].isnull()
ghent3.loc[missing_price, 'Standard Value'] = (1 + ghent3.loc[missing_price, 'avg_pct_change']) * ghent_price_1816

# Recompute Ghent's percent change over the completed series, this time
# relative to 1801.
ghent_price_1801 = ghent3.loc[ghent3['Item Year'] == 1801, 'Standard Value'].iat[0]
ghent3["ghent_pct_change"] = ghent3["Standard Value"] / ghent_price_1801 - 1

# Fill the descriptive columns of the imputed rows with Ghent's constant values.
# Each column is reassigned explicitly: calling fillna(inplace=True) on a
# column selection is chained assignment, which warns in pandas 2.x and leaves
# the frame unchanged once Copy-on-Write is enabled.
ghent_fill_values = {
    "Commodity": "Wheat",
    "Location": "Ghent",
    "Original Currency": "Belgian Franc",
    "Standard Currency": "Silver",
    "Orignal Measure": "Kilogram",
    "Standard Measure": "Kilogram",
    "Sources": "(G. Avondts-P. Scholliers) (De Gentse Textielarbeiders in de 19e en 20e Eeuw dossier 5) (Brussels: Centrum voor Hedendaagse Sociale Geschiedenis-1977)",
    "Notes": "(gpih.ucdavis.edu)-(D.S.Jacks_2001-P.H. Lindert_2008-J.W.Ambrosin_2007)",
}
for fill_column, fill_value in ghent_fill_values.items():
    # A missing column still raises KeyError, as it did before.
    ghent3[fill_column] = ghent3[fill_column].fillna(fill_value)

ghent3.head()

# Build 95% Confidence Intervals

In [None]:
# Flag years whose price lies at or beyond two rolling standard deviations
# from the rolling mean, both computed over a 20-row window.
price = england_combined["Standard Value"]
window_20 = price.rolling(window=20)
england_combined['20_yr_rolling_avg_price'] = window_20.mean()
england_combined['20_yr_rolling_std_price'] = window_20.std()

# Lower and upper bands: rolling mean -/+ 2 rolling standard deviations.
rolling_avg = england_combined['20_yr_rolling_avg_price']
two_std = 2 * england_combined['20_yr_rolling_std_price']
england_combined['2_std_price_decline'] = rolling_avg - two_std
england_combined['2_std_price_rise'] = rolling_avg + two_std

# Label the rows outside the bands. The decrease label is written second, so
# it wins if a row satisfies both conditions.
above_band = price >= england_combined['2_std_price_rise']
below_band = price <= england_combined['2_std_price_decline']
england_combined.loc[above_band, '95_pct_sig_move'] = 'sig_price_increase'
england_combined.loc[below_band, '95_pct_sig_move'] = 'sig_price_decrease'

england_combined.head()

# Recalculate Avg Percent Change for Europe 1801-1900


In [None]:
# Take independent copies of each location frame for the 1801-based
# recalculation.
england_combined3, tours3, krakow_combined3, lwow3, arnhem3, ghent4 = (
    source_frame.copy()
    for source_frame in (england_combined, tours, krakow_combined, lwow, arnhem, ghent3)
)

In [None]:
# Same calculation as for 1816, but with each location's 1801 price as the
# base. Ghent is not listed here: ghent3 already carries an 1801-based
# "ghent_pct_change" column.
rebase_jobs = [
    (england_combined3, england_combined, "england_pct_change"),
    (tours3, tours, "tours_pct_change"),
    (krakow_combined3, krakow_combined, "krakow_pct_change"),
    (lwow3, lwow, "lwow_pct_change"),
    (arnhem3, arnhem, "arnhem_pct_change"),
]
for target_frame, source_frame, pct_column in rebase_jobs:
    # .iat[0] takes the first row dated 1801 and raises IndexError if there is none.
    price_1801 = source_frame.loc[source_frame['Item Year'] == 1801, 'Standard Value'].iat[0]
    # Prices are read from the source frame; the result is written to its copy.
    target_frame[pct_column] = source_frame["Standard Value"] / price_1801 - 1





In [None]:
# Reduce each working frame to its percent-change column plus the year.
# NOTE(review): the last line rebinds ghent3 itself, not the ghent4 copy made
# earlier; ghent3 is the name the following cell reads. Confirm this is intended.
year_col = "Item Year"
england_combined3 = england_combined3.loc[:, ["england_pct_change", year_col]]
tours3 = tours3.loc[:, ["tours_pct_change", year_col]]
krakow_combined3 = krakow_combined3.loc[:, ["krakow_pct_change", year_col]]
lwow3 = lwow3.loc[:, ["lwow_pct_change", year_col]]
arnhem3 = arnhem3.loc[:, ["arnhem_pct_change", year_col]]
ghent3 = ghent3.loc[:, ["ghent_pct_change", year_col]]


In [None]:
# Percent differences relative to 1801 for each location, plus their average
# across Europe.
dflist2 = [england_combined3, krakow_combined3, lwow3, tours3, arnhem3, ghent3]
# Indexing each frame by year makes concat align rows on the year.
dfs2 = [year_frame.set_index('Item Year') for year_frame in dflist2]
Europe_1801 = pd.concat(dfs2, axis=1)

# Row-wise mean across the location columns.
Europe_1801["avg_pct_dif"] = Europe_1801.agg("mean", axis=1)

# Reset the index of the 1801-based frame itself. Assigning
# Europe.reset_index() here would discard everything computed above and
# replace it with the 1816-based table.
Europe_1801 = Europe_1801.reset_index()
Europe_1801.head()