In [1]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
import math
import re
import os
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
!pwd

/Users/shijunyi/Desktop/学习资料/U of Melbourne/2019 Sem2/Data Science Project/[6] code/python


In [2]:
# Notebook-wide display / plotting configuration.
import warnings

# pandas rendering: two decimals of precision, at most 50 columns shown.
pd.set_option('display.precision', 2)
pd.set_option('display.max_columns', 50)

# Default size for every matplotlib figure created in this notebook.
matplotlib.rcParams.update({'figure.figsize': [15, 10]})

# Enable plotly's offline notebook mode so iplot() can render inline.
init_notebook_mode(connected=True)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
# Load the input table; it is filtered on its 'source_country' column later on.
dataset = pd.read_csv('data/for_uom_go8_corr.csv')

# Source countries that are analysed one at a time.
src_countries = [
    'China',
    'India',
    'Indonesia',
    'Malaysia',
]

# Empty long-format result table (one row per source / count / predictor / shift);
# it is populated by the per-country correlation loop.
final_result_df = pd.DataFrame(columns=[
    'Source',
    'Count Name',
    'Count Type',
    'Predictor',
    'Shift',
    'Correlation',
    'P-value',
    'Tested Corralation',
])

In [4]:
# Per-country lagged correlation analysis.
#
# For every source country this cell
#   1. builds a factor table holding every column at lags of 0..max_shift years,
#   2. derives the additional student-count columns,
#   3. computes the Pearson correlation (and its p-value) between every count
#      and every lagged predictor,
#   4. draws a heatmap of the correlations.
# The long-format results of all countries are written to a single CSV after
# the loop has finished.

max_shift = 5   # largest lag (in years) applied to each predictor
alpha = 0.05    # p-values above this level are treated as "not significant"

#counts, grouped by how they are obtained:
original_counts = ['UOM_count', 'GO8_count', 'Australia_count']
transformed_counts = ['AUS_no_GO8_count', 'GO8_no_UOM_count']
percentage_counts = ['UOM/GO8', 'GO8/AUS']
counts = original_counts + transformed_counts + percentage_counts

# Label stored in the 'Count Type' column of the result table.
count_types = {}
count_types.update({name: 'original' for name in original_counts})
count_types.update({name: 'transformed' for name in transformed_counts})
count_types.update({name: 'percentage' for name in percentage_counts})

#predictors (un-shifted names, in output column order):
base_predictors = [
    'year', 'src_population',
    'src_qoe_top20', 'src_qoe_top100', 'src_qoe_top200', 'src_qoe_top500',
    'src_gni', 'src_gdp', 'src_rate_of_one_usd', 'src_tertiary_enrolment',
]

# shift k -> column names of all predictors lagged by k years
# (k == 0 is the un-shifted column itself).
predictors = {
    k: [base if k == 0 else '{}_{}y_shift'.format(base, k)
        for base in base_predictors]
    for k in range(max_shift + 1)
}

# Flat, predictor-major list: base, base_1y_shift, ..., base_5y_shift, next base, ...
predictors_with_shift = [predictors[k][i]
                         for i in range(len(base_predictors))
                         for k in range(max_shift + 1)]

#column order of the saved factor table:
new_col = counts + predictors_with_shift

# Make sure the target directory of all to_csv() calls exists.
os.makedirs('output', exist_ok=True)

# Result rows are collected as plain dicts and turned into a DataFrame once,
# instead of growing a DataFrame row by row (DataFrame.append is quadratic and
# no longer exists in pandas >= 2.0).
result_rows = []

for src in src_countries:
    # filter rows from one source country (sorted copy, so the original
    # dataset is never modified through a slice):
    country_df = dataset.loc[dataset['source_country'] == src].sort_values('year')

    #shift predictors by years:
    shifted = [country_df.shift(k).add_suffix('_{}y_shift'.format(k))
               for k in range(1, max_shift + 1)]
    factors_df = pd.concat([country_df] + shifted, axis=1)

    #calculate additional student count
    factors_df['AUS_no_GO8_count'] = factors_df['Australia_count'] - factors_df['GO8_count']
    factors_df['GO8_no_UOM_count'] = factors_df['GO8_count'] - factors_df['UOM_count']
    factors_df['UOM/GO8'] = factors_df['UOM_count'] / factors_df['GO8_count']
    factors_df['GO8/AUS'] = factors_df['GO8_count'] / factors_df['Australia_count']

    #rearrange cols:
    factors_df = factors_df[new_col]
    #save the factor table:
    factors_df.to_csv(path_or_buf = 'output/UOM_GO8_factors_'+src+'.csv')

    #prepare empty tables (float dtype, so they can be plotted directly):
    pvalue_df = pd.DataFrame(index=predictors_with_shift, columns=counts, dtype=float)
    corr_df = pd.DataFrame(index=predictors_with_shift, columns=counts, dtype=float)
    tested_df = pd.DataFrame(index=predictors_with_shift, columns=counts, dtype=float)

    #calculating corr and pvalue:
    for cnt in counts:
        typ = count_types[cnt]
        for k in range(max_shift + 1):
            # The first k rows of a k-year-shifted column are NaN, so they are
            # dropped from both series before correlating.
            count_values = factors_df[cnt].iloc[k:]
            for pred_name, pred in zip(base_predictors, predictors[k]):
                pred_values = factors_df[pred].iloc[k:]

                #calculate:
                if len(count_values) < 2:
                    # pearsonr needs at least two observations
                    corr = pvalue = np.nan
                else:
                    corr, pvalue = pearsonr(count_values, pred_values)

                #cannot reject H0 -> the corr is not significant
                tested_corr = np.nan if pvalue > alpha else corr

                #fill corr table and pvalue table:
                pvalue_df.at[pred, cnt] = pvalue
                corr_df.at[pred, cnt] = corr
                tested_df.at[pred, cnt] = tested_corr

                #collect the row for final_result_df:
                result_rows.append({'Source': src,
                                    'Count Name': cnt,
                                    'Count Type': typ,
                                    'Predictor': pred_name,
                                    'Shift': k,
                                    'Correlation': corr,
                                    'P-value': pvalue,
                                    'Tested Corralation': tested_corr})

    #save the correlation table, p-value table:
    #corr_df.to_csv(path_or_buf = 'output/UOM_GO8_correlation_'+src+'.csv')
    #pvalue_df.to_csv(path_or_buf = 'output/UOM_GO8_pvalue_'+src+'.csv')

    #plot (one heatmap per source country):
    trace = go.Heatmap(z=corr_df.values,
                       x=corr_df.columns.values,
                       y=corr_df.index.values,
                       colorscale='spectral',
                       reversescale=True)

    layout = go.Layout(
        title="Correlation Between Predcitors and Student Count from "+src,
        autosize=False,
        width=1000,
        height=2000,
        yaxis = dict(visible  = True,
                     constraintoward = 'left')
    )

    fig = go.Figure(data=trace, layout=layout)
    iplot(fig)

#save final result table (built once, after every country has been processed;
#rebuilding it here also keeps the cell idempotent when it is re-run):
final_result_df = pd.DataFrame(result_rows, columns=final_result_df.columns)
final_result_df.to_csv(path_or_buf = 'output/correlation_table_UOM_GO8_AUS.csv')