In [None]:
import numpy as np
import pandas as pd
# Notebook-wide pandas display settings: display precision of 2 and at most 50 columns shown.
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', 50)

In [None]:
import re
import os
# Move the working directory two levels up; presumably this is the project root, so that the
# relative 'data/...' paths used by later cells resolve.
# NOTE(review): the path is relative, so re-running this cell climbs two more levels each
# time -- it must be executed exactly once per kernel session.
os.chdir('../../')

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Default size applied to every matplotlib figure created in this notebook.
plt.rc('figure', figsize=[15, 10])

In [None]:
#import seaborn as sns
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Initialise plotly's offline notebook mode before any iplot() call in the cells below.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of embedding
# it in the notebook -- confirm against the plotly offline documentation.
init_notebook_mode(connected=True)

# Read predictor data

In [None]:
# Load the commodity price table. skiprows=1 discards the first line of the file, so the
# column header is taken from the second line.
predictors_df = pd.read_csv('data/Primary_Commodity_Price_System_PCPS.csv', skiprows=1)

In [None]:
# Each Date string is split on a single space and its second token is taken as the integer
# year (so values are expected to look like '<period> <year>').
predictors_df['year'] = [int(stamp.split(' ')[1]) for stamp in predictors_df['Date']]

In [None]:
# Collapse the source rows to one row per year by averaging every numeric column.
# numeric_only=True keeps the text 'Date' column out of the mean: pandas >= 2.0 raises a
# TypeError when asked to average it, while older versions only dropped it silently.
predictors_yearly_df = predictors_df.groupby('year').mean(numeric_only=True)

# Names of the unshifted yearly columns; only these are lagged below.
cols = predictors_yearly_df.columns

# Add lagged copies of every original column. shift() moves values down by rows, so the
# "1y"/"2y" lags are true one- and two-year lags only when the year index has no gaps.
# The lagged columns clash with the unshifted names, so each join appends its suffix to them.
predictors_yearly_df = predictors_yearly_df.join(predictors_yearly_df[cols].shift(), rsuffix='_1y_shift')
predictors_yearly_df = predictors_yearly_df.join(predictors_yearly_df[cols].shift(2), rsuffix='_2y_shift')

# Restrict to 2003-2018 (label-based slice, both ends inclusive). This happens after the
# shifts so that the first retained years can still pick up lag values from earlier rows.
predictors_yearly_df = predictors_yearly_df.loc[2003:2018]

In [None]:
# Preview the first rows of the yearly predictor table (original columns plus their
# '_1y_shift' and '_2y_shift' lagged copies).
predictors_yearly_df.head()

# Read response data (Sponsored Student Counts)

In [None]:
# Load the wide table of counts: rows are indexed by country, and the remaining columns are
# treated as years (they are labelled 'year' and cast to int below).
wide_counts = pd.read_csv("data/sponsor_data_countrywise_backup.csv").set_index('country')
wide_counts.columns.name = 'year'

# Wide -> long: one row per (country, year) pair with the count in 'students_count'.
response_df = wide_counts.stack().rename('students_count').reset_index()
response_df['year'] = response_df['year'].astype(int)

# Save the long-format table (before any filtering) without the positional index.
response_df.to_csv('data/sponsored_students_counts.csv', index=False)

In [None]:
# Keep only rows whose country label does not contain 'Total' (the rows that do are
# presumably aggregates).
mask = response_df['country'].map(lambda label: re.search('.*Total.*', label) is None)
response_df = response_df.loc[mask]

In [None]:
# Total sponsored-student count per year (summed over all remaining countries), ordered by year.
years_df = response_df.groupby('year')['students_count'].sum().sort_index()

# A single semi-transparent bar series: one bar per year.
traces = [
    go.Bar(
        x=years_df.index.values,
        y=years_df.values,
        name='Sponsored Student Count',
        opacity=0.5,
    )
]

layout = {
    'title': 'Total Number of Sponsored Students By Year',
    'xaxis': {'title': 'Year'},
    'yaxis': {'title': 'Number of Sponsored Students'},
    # Horizontal legend anchored at x=0.3, y=1.1.
    'legend': {
        'x': 0.3,
        'y': 1.1,
        'traceorder': 'normal',
        'font': {'family': 'sans-serif', 'size': 12, 'color': '#000'},
        'orientation': 'h',
    },
}

fig = {'data': traces, 'layout': layout}
iplot(fig)

In [None]:
# Total count per country over all years, largest first; keep the first 30.
country_totals = response_df.groupby('country')['students_count'].sum()
top_countries_df = country_totals.sort_values(ascending=False).iloc[:30]

# Country names in ranking order, reused by later cells to select and order countries.
top_countries = top_countries_df.index.values

In [None]:
# Bar chart of the total sponsored-student count for each of the top 30 countries.
traces = [
    go.Bar(
        x=top_countries_df.index.values,
        y=top_countries_df.values,
        name='Sponsored Student Count',
        opacity=0.5,
    )
]

layout = dict(
    title='Total Number of Sponsored Students By Top 30 Countries',
    # The x values are country names, so the axis is labelled 'Country' (it previously
    # read 'Year', carried over from the per-year chart).
    xaxis=dict(title='Country'),
    yaxis=dict(title='Number of Sponsored Students'),
    # Horizontal legend anchored at x=0.3, y=1.1.
    legend=dict(
        x=0.3,
        y=1.1,
        traceorder='normal',
        font=dict(
            family='sans-serif',
            size=12,
            color='#000',
        ),
        orientation="h",
    ),
)

fig = dict(data=traces, layout=layout)
iplot(fig)

In [None]:
# Attach the yearly predictor columns to every response row by matching response_df's
# 'year' column against the index of predictors_yearly_df (DataFrame.join is a left join
# by default, so every response row is kept).
training_set = response_df.join(predictors_yearly_df, on='year')
# Persist the combined table. No index=False here, so the DataFrame index is written as
# the first CSV column.
training_set.to_csv('data/training_set.csv')

In [None]:
# Index the training table by (country, year).
# NOTE: like any set_index on existing columns, this cell fails if run a second time,
# because 'country' and 'year' are no longer columns afterwards.
training_set = training_set.set_index(['country', 'year'])

# Visualise

## Correlation

In [None]:
# Pivot the long response table to wide form: one row per year, one column per country.
df = response_df.set_index(['country', 'year'])['students_count'].unstack('country')

# Keep years up to and including 2018, and only the top countries in ranking order.
df = df.loc[:2018, top_countries]

In [None]:
# Display the yearly predictors joined, on their shared year index, with the per-country
# student counts held in df.
predictors_yearly_df.join(df)

In [None]:
# Correlate every pair of columns in the joined predictor/country table, then keep only
# the country-vs-predictor part: countries as rows, predictor columns as columns.
country_names = list(df.columns.values)
correlation_df = (
    predictors_yearly_df.join(df)
    .corr()
    .loc[country_names, predictors_yearly_df.columns]
)
correlation_df

## Heatmap

In [None]:
# Reverse the row order before plotting (presumably so the top-ranked country ends up at
# the top of the heatmap's y axis -- confirm visually).
df = correlation_df.iloc[::-1]

# Cell colour is the correlation value; predictors run along x, countries along y.
trace = go.Heatmap(
    x=df.columns.values,
    y=df.index.values,
    z=df.values,
)

# Fixed 900x900 figure with automatic sizing switched off.
layout = go.Layout(
    title="Correlation Between the Number of Sponsored Students and Commodity Indices",
    autosize=False,
    width=900,
    height=900,
)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

## Comparison Chart

In [None]:
# Predictor columns drawn as lines in the comparison chart.
# Other candidate columns, currently left out: 'Coal index', 'Natural gas index',
# 'APSP crude oil_USD_per_bbl', 'Propane', 'Industrial Materials index',
# 'Food and beverage index', 'Fertilizer'.
predictors = ['Index', 'Energy index', 'Non-Fuel index']

In [None]:
# Display the long-format response table (country, year, students_count) as it stands
# after the 'Total' rows were filtered out.
response_df

In [None]:
# Predictor lines: one line trace per selected predictor column, on the primary y axis.
traces = [
    go.Scatter(
        x=predictors_yearly_df.index.values,
        y=predictors_yearly_df[column].values,
        name=column,
        mode='lines',
    )
    for column in predictors
]

# Response bars: one bar trace per top country on the secondary y axis ('y2').
# Only the first country's bars start visible; the dropdown below switches between them.
countries = top_countries
for position, country in enumerate(countries):
    country_rows = response_df[response_df.country == country]
    traces.append(
        go.Bar(
            x=country_rows.year.values,
            y=country_rows.students_count,
            yaxis='y2',
            visible=(position == 0),
            name='Sponsored Student Count',
            opacity=0.5,
        )
    )

# Dropdown menu: every entry keeps all predictor lines visible and shows exactly one
# country's bar trace. The visibility list follows the order of `traces`
# (predictor lines first, then one bar trace per country).
pr_count = len(predictors)
buttons = [
    dict(
        label=country,
        method='update',
        args=[{'visible': [True] * pr_count
                          + [slot == position for slot in range(len(countries))]}],
    )
    for position, country in enumerate(countries)
]

layout = {
    'title': 'Trends in Commodity Prices and Number of Sponsored Students by Country',
    'xaxis': {'title': 'Year'},
    # Left axis: commodity index values, fixed to 0-250.
    'yaxis': {
        'title': 'Commodity Index Value',
        'range': [0, 250],
    },
    # Right axis: student counts, overlaid on the same plot and fixed to 0-800.
    'yaxis2': {
        'title': 'Sponsored Student Count',
        'overlaying': 'y',
        'side': 'right',
        'range': [0, 800],
    },
    # Country selector; active=0 matches the initially visible first country.
    'updatemenus': [
        {
            'active': 0,
            'pad': {'r': 0, 't': 0},
            'x': 0.0,
            'xanchor': 'left',
            'y': 1.1,
            'yanchor': 'top',
            'buttons': buttons,
        }
    ],
    # Horizontal legend anchored at x=0.3, y=1.1.
    'legend': {
        'x': 0.3,
        'y': 1.1,
        'traceorder': 'normal',
        'font': {'family': 'sans-serif', 'size': 12, 'color': '#000'},
        'orientation': 'h',
    },
}

fig = {'data': traces, 'layout': layout}

iplot(fig)