In [None]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

### Instructions 

1. This assignment relies on [pandas-datareader](https://pandas-datareader.readthedocs.io/en/latest/index.html) for the data.
2. If you are working from within the Wits network, make sure that your proxy settings are correct.

### Question 1
Below we construct a line plot showing the relationship between burglaries
and murders in South Africa in $2014/2015$ for each province. The data 
is read from the csv file ```vocs2014!2015house_F1.csv```. If you don't have 
the file then download it from Sakai and place it in the same directory as your
Jupyter notebook.

In [None]:
import pandas as pd

# Load the household survey responses and derive, per respondent, the province
# name plus boolean flags for a reported burglary / murder (answer code 1).
df_crime = pd.read_csv('vocs2014!2015house_F1.csv')
df_crime = df_crime.assign(
    Province=df_crime['PROV'].map({
        1: 'Western Cape',
        2: 'Eastern Cape',
        3: 'Northern Cape',
        4: 'Free State',
        5: 'KwaZulu-Natal',
        6: 'North West',
        7: 'Gauteng',
        8: 'Mpumalanga',
        9: 'Limpopo',
    }),
    Burglaries=df_crime['Q10bHous'].eq(1),
    Murders=df_crime['Q10fMurder'].eq(1),
    # Each row counts once, so the per-province sum is the sample size.
    Sample=1,
)

# Collapse to one row per province: flagged incidents and sample size.
df_crime = df_crime.groupby('Province')[['Burglaries', 'Murders', 'Sample']].sum()

# Express the incident counts as rates per 100,000 sampled respondents.
df_crime = df_crime.assign(
    Burglaries=df_crime['Burglaries'] / (df_crime['Sample'] / 100000),
    Murders=df_crime['Murders'] / (df_crime['Sample'] / 100000),
)

# Default pandas line plot: one line per column, provinces along the x-axis.
df_crime.plot()
plt.show()

#### Question 1.a [5]
The above plot is not very meaningful as a line plot. Try to replicate the results 
from slides [Multivariate Data Visualizations, slide $9$], for South Africa. 
Are the bubbles in your bubble plot sized correctly?

In [None]:
# YOUR CODE HERE
# Answer cell for Question 1.a: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()

### Question 2
Below we fetch data from the World Bank:
1. [Life Expectancy](http://data.worldbank.org/indicator/SP.DYN.LE00.IN?view=chart)
1. [Income Per Person](http://data.worldbank.org/indicator/NY.ADJ.NNTY.PC.CD?view=chart)
1. [Population](http://data.worldbank.org/indicator/SP.POP.TOTL?view=chart)

We also get the country codes from the file ```wb_country_codes.csv``` which
can be found on Sakai.

In [None]:
# Provenance: the commented-out code below shows how 'df_life_vs_gdp.csv' can be
# regenerated from the World Bank API with pandas-datareader. It is left disabled
# and the notebook reads the saved CSV instead.
# from pandas_datareader import wb
# df_tmp = wb.download(indicator=['SP.DYN.LE00.IN', 'NY.ADJ.NNTY.PC.CD','SP.POP.TOTL'], country='all', start=2012, end=2012)
# df_tmp = df_tmp.groupby(level=0).first().dropna()
# df_tmp.rename(columns={'SP.DYN.LE00.IN':'Life Expectancy',
#                                'SP.POP.TOTL':'Population',
#                                'NY.ADJ.NNTY.PC.CD':'Income Per Person' }, inplace=True)
# df_tmp.to_csv('df_life_vs_gdp.csv')

# Indicator values, indexed by country name.
df_tmp = pd.read_csv('df_life_vs_gdp.csv', index_col='country')
# Country metadata, indexed by the 'Economy' column.
df_country_codes = pd.read_csv('wb_country_codes.csv', index_col='Economy')

# Index-on-index left join, keep only the columns used by the charts, and drop
# every row that is missing any of them.
df_life_vs_gdp = (
    df_tmp
    .join(df_country_codes)
    .loc[:, ['Life Expectancy', 'Income Per Person', 'Population',
             'Code', 'Region', 'Income group']]
    .dropna()
)

In [None]:
# Show the first five rows of the joined country table.
df_life_vs_gdp.head(n=5)

We can use the above data to emulate the charts from slides 
[Multivariate Data Visualizations, slides $10$ and $12$].

In [None]:
# Bubble chart of life expectancy against income, one bubble per country.
sns.set(style="whitegrid", color_codes=True)
ax = df_life_vs_gdp.plot.scatter(y='Life Expectancy', x='Income Per Person',
                                 # Marker size is the population in millions.
                                 s=df_life_vs_gdp.Population/1e6,
                                 # Colour is just the row position, so it carries no meaning.
                                 cmap='tab20c', c=range(0, len(df_life_vs_gdp.index)),
                                 alpha=0.6, figsize=(16,8))
# Configure the axes through the returned object instead of pyplot's implicit
# "current axes", so the settings always land on the scatter axes.
ax.set_xscale('log')
# 'Income Per Person' is the World Bank indicator NY.ADJ.NNTY.PC.CD (adjusted net
# national income per capita, current US$), not PPP-adjusted GDP per capita.
ax.set_xlabel('Income per Person in current US Dollars (adjusted net national income per capita, log scale)')
ax.set_ylabel('Life Expectancy in years')
ax.set_title('Life Expectancy vs Income per Person by country (bubble size: population in millions)')
plt.show()

#### Question 2.a [5]
Unfortunately the chart lacks context. Mark the BRICS countries on the chart.

In [None]:
# YOUR CODE HERE
# Answer cell for Question 2.a: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()

#### Question 2.b [5]
The use of the color map is completely arbitrary. Take the BRICS marked plot and map the Region of the 
data point as the colour for each bubble. 

In [None]:
# YOUR CODE HERE
# Answer cell for Question 2.b: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()

### Question 3
Through some web scraping and copy-and-pasting we have been able to acquire some 
data for the PSL $2016/2017$ season. The data resides in the file ```'psl_player_stats.csv'```.
1. If you don't have the file, download it from Sakai.
1. Place the file in the same folder as this Jupyter Notebook.

In [None]:
# The CSV holds three player tables side by side; each is cut out by column
# position, de-duplicated and indexed by its own player-name column.
# Columns 5 and 11 are skipped (presumably separator columns - verify against the file).
df_tmp = pd.read_csv('psl_player_stats.csv')
df_goals = df_tmp.iloc[:, 0:5].drop_duplicates().set_index('Player_Goals')
df_cards = df_tmp.iloc[:, 6:11].drop_duplicates().set_index('Player_Cards')
df_assists = df_tmp.iloc[:, 12:].drop_duplicates().set_index('Player_Assists')

# Goals joined with assists, dropping rows with any missing value; cards are
# then joined on and the values still missing after that join become 0.
df_psl = (
    df_goals
    .join(df_assists)
    .dropna()
    .join(df_cards)
    .loc[:, ['Goals', 'Penalty', 'First Goal', 'Appearances',
             'Assists', 'Yellow card', 'Red card', 'Pts']]
    .fillna(0)
)
# Disabled alternative that divides every statistic by Appearances. It refers to
# `df_psl_tmp`, which this cell does not define.
#df_psl = (df_psl_tmp.T / df_psl_tmp.Appearances).T[[
#            'Goals', 'Penalty', 'First Goal', 
#            'Assists', 'Yellow card', 'Red card','Pts']]

In [None]:
# Show the first five rows of the combined player statistics.
df_psl.head(n=5)

#### Question 3.a [5]
Produce a heatmap similar to the one found in [Multivariate Data Visualizations, slide $16$]. 
The one in the slides is for basketball. The field ```Pts``` in our data is a points
calculation that gives $1$ point for a Yellow Card and $2$ points for a Red Card.
* Note: I would suggest you use Seaborn  for this task

In [None]:
# YOUR CODE HERE
# Answer cell for Question 3.a: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()

#### Question 3.b [5]
Another way to visualize the PSL data would be to use a parallel coordinates plot. Produce the plot below.

In [None]:
# YOUR CODE HERE
# Answer cell for Question 3.b: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()

### Question 4
We now have taken the entire crime data set for $2014/2015$. The data file is:
```
'vocs2014!2015.csv'
```
You can see the table of data below. Once again we have standardized the data for population size.

In [None]:
# Per-province share of survey respondents reporting each crime type.
df_crime = pd.read_csv('vocs2014!2015.csv')
df_crime['Province'] = df_crime.PROV.map({1:'Western Cape',
                                          2:'Eastern Cape',
                                          3:'Northern Cape',
                                          4:'Free State',
                                          5:'KwaZulu-Natal',
                                          6:'North West',
                                          7:'Gauteng',
                                          8:'Mpumalanga',
                                          9:'Limpopo'})
crime_columns = ['Q10aCar', 'Q10cHomeRob', 'Q10dLiveStck',
                 'Q10ecrops', 'Q10fMurder', 'Q10hDamageDU',
                 'Q10iDamageVeh']
# The Q10* columns hold answer codes, not incident counts: the Question 1 cell
# treats a value of 1 as "incident reported". Flag those answers before summing
# so the totals count incidents instead of adding up raw answer codes.
# NOTE(review): assumes this file uses the same answer coding as the Question 1
# file - confirm against the survey metadata.
df_crime = (
    df_crime[crime_columns].eq(1)
    # Each respondent counts once, so the per-province sum is the sample size.
    .assign(Population=1, Province=df_crime['Province'])
    .groupby('Province')
    .sum()
)
# Dividing every column by Population also turns Population itself into 1.0, so
# keep a copy of the sample sizes and restore them afterwards.
df_pop = df_crime.Population.copy()
df_crime = df_crime.div(df_crime['Population'], axis='index')
df_crime['Population'] = df_pop
df_crime

#### Question 4.a [5]
You should produce the best possible scatter matrix for this data. 

In [None]:
# YOUR CODE HERE
# Answer cell for Question 4.a: replace the line below with your solution.
# As written, running this cell raises NotImplementedError.
raise NotImplementedError()