In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Start from seaborn's default theme (layered on top of the matplotlib
# defaults), then make every figure in this notebook wide by default.
sns.set()
plt.rcParams.update({'figure.figsize': (16, 8)})
%config InlineBackend.figure_format='retina'

In [None]:
# Path prefix for figure files saved from this notebook, so that a saved
# figure can be traced back to the notebook it came from.
# (The code that uses this prefix is not shown in this section.)
fig_prefix='../figures/jq-'

In [None]:
# from IPython.display import IFrame
# IFrame('https://dataviz.vam.wfp.org/economic_explorer/price-forecasts-alerts?adm0=205', width=700, height=400)

In [None]:
# Load the raw CSV into `df` and show its last five rows (tail's default).
df = pd.read_csv(filepath_or_buffer='../data/eac-ratin.csv')
df.tail()

In [None]:
# import qgrid
# qgrid.nbinstall(overwrite=True)
# qgrid.show_grid(df, remote_js=True)

Data cleaning: NaN, dtype conversion

In [None]:
# Lower-case every header, then give the third-from-last and second-from-last
# columns the short names 'retail' and 'wholesale'. The second renaming is
# positional, so it depends on the column order of the CSV.
df = df.rename(columns=str.lower)
cols = df.columns.tolist()
price_names = {cols[-3]: 'retail', cols[-2]: 'wholesale'}
df = df.rename(columns=price_names)
list(df.columns)

In [None]:
# Inspect the column dtypes before any conversion is applied.
df.dtypes

In [None]:
# List the distinct 'wholesale' values in sorted order, to spot entries that
# are not numbers before the column is cast to float.
df['wholesale'].sort_values().unique()

In [None]:
# Blank out every row whose 'wholesale' cell holds the literal text
# 'Wholesale'. The boolean-mask assignment sets ALL columns of those rows to
# NaN, not just 'wholesale'.
# NOTE(review): presumably these are header lines repeated inside the CSV -- confirm.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df[df['wholesale']=='Wholesale'] = np.nan
# With the text rows gone the column can be cast to a numeric dtype.
df['wholesale'] = df['wholesale'].astype('float')

In [None]:
# List the distinct 'retail' values in sorted order, to spot entries that are
# not numbers before the column is cast to float.
df['retail'].sort_values().unique()

In [None]:
# Rows whose 'retail' cell is the literal string 'NaN' are blanked in EVERY
# column (the boolean-mask assignment is row-wide), then 'retail' is cast to
# float.
# NOTE(review): if only the retail price should be cleared, restrict the
# assignment to that column with df.loc[mask, 'retail'] -- confirm intent.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df[df['retail']=='NaN'] = np.nan
df['retail'] = df['retail'].astype('float')
# Re-inspect the distinct values after the conversion.
df['retail'].sort_values().unique()

In [None]:
# Parse the 'date' column into pandas datetimes; the `.dt` accessors used
# further down require a datetime dtype.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Re-check the dtypes after the price and date conversions above.
df.dtypes

In [None]:
str_cols = ['market', 'product', 'country', 'currency']
# A bare expression in the middle of a cell is evaluated but never displayed,
# so the summary has to be printed explicitly to be seen.
print(df[str_cols].describe())

for item in str_cols:
    # 'category' keeps each distinct label once and stores per-row integer
    # codes, which suits these low-cardinality text columns.
    df[item] = df[item].astype('category')
    print(df[item].unique())

In [None]:
# Re-check the dtypes after the text columns were converted to 'category'.
df.dtypes

In [None]:
# Derive calendar components from the parsed 'date' column.
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is its replacement (ISO 8601 week number).
# isocalendar() returns a nullable UInt32 column, so cast it to float: the
# date column can contain missing values, and these derived columns are
# intentionally left as float so that missing dates appear as NaN.
df['week'] = df['date'].dt.isocalendar().week.astype('float')

In [None]:
# Preview the first three rows, which now include the derived
# year / month / day / week columns.
df.head(3)

In [None]:
# Scratch cell: evaluates to NumPy's NaN constant and does not modify `df`.
np.nan


In [None]:
# Treat a price of exactly zero as a missing value in both price columns.

cols = ['wholesale', 'retail']
df[cols] = df[cols].replace(to_replace=0, value=np.nan)

# cond = (df['wholesale']==0)
# df_zero = df[cond]
# df_zero['wholesale'] = np.NaN
# df['wholesale']=df_zero['wholesale']


In [None]:
# Sanity check: the product of the element-wise "not equal to zero" flags is
# 1 only when no 'wholesale' value equals zero (and 0 otherwise).
np.prod(df['wholesale'].ne(0))

In [None]:
# remove outliers:
def remove_outliers(x, lower=.05, upper=.95):
    '''Keep only the values of ``x`` that lie between two of its own quantiles.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to filter.
    lower : float, default 0.05
        Quantile level (between 0 and 1) used as the lower cut-off.
    upper : float, default 0.95
        Quantile level (between 0 and 1) used as the upper cut-off.

    Returns
    -------
    pandas.Series
        The entries of ``x`` whose value is inside the closed interval
        ``[x.quantile(lower), x.quantile(upper)]``. The surviving entries keep
        their original index labels; entries outside the interval, and NaN
        entries, are dropped. Assigning the result back to a DataFrame column
        therefore leaves NaN in the rows that were removed.
    '''
    # One quantile call returns both cut-offs, in the order requested.
    lower_bound, upper_bound = x.quantile([lower, upper])
    # `between` is inclusive on both ends and evaluates to False for NaN.
    return x[x.between(lower_bound, upper_bound)]

# Filter each price column in turn. The filtered Series is shorter than the
# frame; assigning it back aligns on the index, so rows whose value was
# removed end up holding NaN in that column.
for price_col in ('wholesale', 'retail'):
    df[price_col] = remove_outliers(df[price_col])

exploratory visualization:

In [None]:
# Persist the cleaned frame as CSV, leaving the row index out of the file.
df.to_csv(path_or_buf='../data/cleaned_data.csv', index=False)

In [None]:
# Write the frame to the table 'data' of a SQLite database file, replacing
# the table if it already exists; the frame's index is stored in column 'id'.
from sqlalchemy import create_engine
engine = create_engine('sqlite:///../data/mydb.db', echo=False)
df.to_sql(name='data', con=engine, index_label='id', if_exists='replace')