# 1. Data collection

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#plotly
import plotly.plotly as py
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
from plotly import tools
import plotly.graph_objs as go


# matplotlib
import matplotlib.pyplot as plt

In [7]:
# Load the CWUR ranking table from disk and preview its first five rows.
cwur = pd.read_csv(filepath_or_buffer="world-university-rankings/cwurData.csv")
cwur.head(5)

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [None]:
# Slice the South Korean rows out of cwur, one frame per ranking year (2012-2015).
cwur12_kr = cwur[(cwur['country'] == 'South Korea') & (cwur['year'] == 2012)]
cwur13_kr = cwur[(cwur['country'] == 'South Korea') & (cwur['year'] == 2013)]
cwur14_kr = cwur[(cwur['country'] == 'South Korea') & (cwur['year'] == 2014)]
cwur15_kr = cwur[(cwur['country'] == 'South Korea') & (cwur['year'] == 2015)]

In [None]:
# Display the 2012 South Korea slice of cwur.
cwur12_kr

In [None]:
# Display the 2013 South Korea slice of cwur.
cwur13_kr

> Since the 2012 and 2013 data contain only one South Korean university each, I will drop those years from the analysis.

In [None]:
# Sort each year's South Korean universities by national rank, best (rank 1) first.
# sort_values returns a new DataFrame instead of sorting in place, so the result
# must be assigned back; ascending order keeps the best-ranked rows at the top,
# which is what the later .head(...) selections for the "Top 10" chart expect.
cwur14_kr = cwur14_kr.sort_values(by="national_rank", ascending=True)
cwur15_kr = cwur15_kr.sort_values(by="national_rank", ascending=True)
cwur15_kr.head()

# 2. Domestic measures

> ## Top 10 Universities in Korea during 2014 - 2015

In [None]:
# Grouped bar chart of national rank for the best-ranked South Korean
# universities, 2014 vs 2015.
TOP_N = 10  # the chart title and the section heading both promise a "Top 10"

# Pick the TOP_N best national ranks of each year once, so that x, y and text
# all come from the same rows (x/y used head(15) while text used the full,
# unsliced world_rank column, and no ordering had actually been applied).
top14_kr = cwur14_kr.sort_values(by="national_rank").head(TOP_N)
top15_kr = cwur15_kr.sort_values(by="national_rank").head(TOP_N)

# create trace1 (2014)
trace1 = go.Bar(
    x=top14_kr.institution,
    y=top14_kr.national_rank,
    name="2014",
    marker=dict(color='rgba(255, 174, 255, 0.5)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=top14_kr.world_rank,  # world rank attached to each bar
)
# create trace2 (2015)
trace2 = go.Bar(
    x=top15_kr.institution,
    y=top15_kr.national_rank,
    name="2015",
    marker=dict(color='rgba(255, 255, 128, 0.5)',
                line=dict(color='rgb(0,0,0)', width=1.5)),
    text=top15_kr.world_rank,  # world rank attached to each bar
)
layout = go.Layout(
    title='Top 10 universities in South Korea 2014-2015 by CWUR',
    xaxis=dict(
        tickfont=dict(
            size=11,
            color='rgb(107, 107, 107)'
        )
    ),
    yaxis=dict(
        title='National rank',
        titlefont=dict(
            size=14,
            color='rgb(107, 107, 107)'
        )
    ),
    barmode='group',  # draw the 2014 and 2015 bars side by side per institution
    bargap=0.2,
    bargroupgap=0.2
)

data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
iplot(fig)

Not much changed in the Top 10 between the two years, although EWU and CNU swapped places for the 10th position.<br/>
Let's look at the Top 10 universities of each year to see how competitive they are worldwide.

# 3. Comparative measures

> ## Worldwide

Let's count how many universities each country has in the world's top 400 to see where Korea stands.

In [None]:
# Keep, for each of 2014 and 2015, only the universities whose world rank is below 401.
top400_14 = cwur[cwur['world_rank'].lt(401) & cwur['year'].eq(2014)]
top400_15 = cwur[cwur['world_rank'].lt(401) & cwur['year'].eq(2015)]

In [None]:
# Number of top-400 universities per country, largest count first.

# 2014
count_by_country14 = (
    top400_14.groupby('country')['world_rank']
    .count()
    .sort_values(ascending=False, na_position='last')
)

# 2015
count_by_country15 = (
    top400_15.groupby('country')['world_rank']
    .count()
    .sort_values(ascending=False, na_position='last')
)

# Up to this point both results are pandas Series indexed by country.
count_by_country15.head()

Let's create multiple charts to make this easier to see!

In [None]:
# First ten entries of each (already sorted) per-country count Series.
count_14 = count_by_country14.iloc[:10]
count_15 = count_by_country15.iloc[:10]

# Split each Series into plain Python lists: country names (y) and counts (x).
y_country14 = list(count_14.index)
x_count14 = count_14.tolist()

y_country15 = list(count_15.index)
x_count15 = count_15.tolist()

In [None]:
# Side-by-side horizontal bar charts of the number of top-400 universities per
# country: 2014 in the left panel, 2015 in the right panel.

#trace0_2014
trace0 = go.Bar(
    x=x_count14,
    y=y_country14,
    marker=dict(color='rgba(171, 50, 96, 0.6)'),
    name='2014',
    orientation='h',
)

#trace1_2015
trace1 = go.Bar(
    x=x_count15,
    y=y_country15,
    marker=dict(color='rgba(12, 50, 196, 0.6)'),
    name='2015',
    orientation='h',
)

#layout
layout = dict(
    title='The number of universities in Top 400 by country',
    # autorange='reversed' puts the first (largest) entry at the top.
    yaxis=dict(showticklabels=True, domain=[0, 0.85], autorange='reversed'),
    # The 2015 panel is built from its own country list, whose order can differ
    # from 2014, so it must show its own tick labels; hiding them makes the
    # right-hand bars read as if they followed the left panel's countries.
    yaxis2=dict(showline=True, showticklabels=True,
                linecolor='rgba(102, 102, 102, 0.8)', linewidth=2,
                domain=[0, 0.85], autorange='reversed'),
    xaxis=dict(zeroline=False, showline=False, showticklabels=True, showgrid=True),
    xaxis2=dict(zeroline=False, showline=False, showticklabels=True, showgrid=True),
    margin=dict(l=200, r=20, t=70, b=70),
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
)

# Creating two subplots; the wider horizontal spacing leaves room for the
# right panel's country labels, and the subplot titles name each panel's year.
fig = tools.make_subplots(rows=1, cols=2, horizontal_spacing=0.25,
                          subplot_titles=('2014', '2015'))
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)

fig['layout'].update(layout)
iplot(fig)

In [None]:
# Same comparison as the top-400 chart, restricted to the world's top 100.

#prepare the top 100 Universities in each year
top100_14 = cwur.loc[(cwur['year'] == 2014) & (cwur['world_rank'] < 101)]
top100_15 = cwur.loc[(cwur['year'] == 2015) & (cwur['world_rank'] < 101)]

#count out of top 100 by country in 2014, largest count first
count100_by_country14 = (
    top100_14.groupby('country')['world_rank']
    .count()
    .sort_values(na_position='last', ascending=False)
)

#count out of top 100 by country in 2015, largest count first
count100_by_country15 = (
    top100_15.groupby('country')['world_rank']
    .count()
    .sort_values(na_position='last', ascending=False)
)

# Keep the twelve countries with the most top-100 universities in each year.
count100_14 = count100_by_country14.head(12)
count100_15 = count100_by_country15.head(12)

#to list: country names go on the y axis, counts on the x axis
y100_country14 = count100_14.index.tolist()
y100_country15 = count100_15.index.tolist()
x100_count14 = count100_14.values.tolist()
x100_count15 = count100_15.values.tolist()

#trace0_2014
trace0 = go.Bar(
    x=x100_count14,
    y=y100_country14,
    marker=dict(color='rgba(171, 50, 96, 0.6)'),
    name='2014',
    orientation='h',
)

#trace1_2015
trace1 = go.Bar(
    x=x100_count15,
    y=y100_country15,
    marker=dict(color='rgba(12, 50, 196, 0.6)'),
    name='2015',
    orientation='h',
)

#layout
layout = dict(
    title='The number of universities in Top 100 by country',
    # autorange='reversed' puts the first (largest) entry at the top.
    yaxis=dict(showticklabels=True, domain=[0, 0.85], autorange='reversed'),
    # The 2015 panel is built from its own country list, whose order can differ
    # from 2014, so it must show its own tick labels; hiding them makes the
    # right-hand bars read as if they followed the left panel's countries.
    yaxis2=dict(showline=True, showticklabels=True,
                linecolor='rgba(102, 102, 102, 0.8)', linewidth=2,
                domain=[0, 0.85], autorange='reversed'),
    xaxis=dict(zeroline=False, showline=False, showticklabels=True, showgrid=True),
    xaxis2=dict(zeroline=False, showline=False, showticklabels=True, showgrid=True),
    margin=dict(l=200, r=20, t=70, b=70),
    paper_bgcolor='rgb(248, 248, 255)',
    plot_bgcolor='rgb(248, 248, 255)',
)

# Creating two subplots; the wider horizontal spacing leaves room for the
# right panel's country labels, and the subplot titles name each panel's year.
fig = tools.make_subplots(rows=1, cols=2, horizontal_spacing=0.25,
                          subplot_titles=('2014', '2015'))
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)

fig['layout'].update(layout)
iplot(fig)


 
> ## Comparison with Japan and China

South Korea ranks 9th by number of universities in the world's Top 400.<br/>
Compared with other Asian countries such as China and Japan, South Korea is behind them.<br/>

#### Number of universities in the world's Top 400
* South Korea
    * 12 -> 11
* China
    * 13 -> 16
* Japan
    * 18 -> 15
    
#### Number of universities in the world's Top 100
* South Korea
    * 1 -> 2
* China
    * 2 -> 2
* Japan
    * 8 -> 7

SNU (Seoul National University), PU (Peking University) and UT (University of Tokyo) are generally regarded as the best-known universities in their respective countries.<br/>
Let's compare these universities.

> ## Top university of South Korea, Japan and China

In [None]:
# Line chart of score per year for Peking University, Seoul National University
# and the University of Tokyo; each point also carries that year's world rank.

# One frame per institution, holding every row cwur has for it.
pu = cwur.loc[cwur['institution'] == 'Peking University']
snu = cwur.loc[cwur['institution'] == 'Seoul National University']
ut = cwur.loc[cwur['institution'] == 'University of Tokyo']

# Peking University
trace1 = go.Scatter(
    x=pu['year'],
    y=pu['score'],
    mode="lines+markers",
    name="Peking University(CHN)",
    marker={'color': 'rgb(171, 50, 96)'},
    text=pu['world_rank'],
)

# Seoul National University
trace2 = go.Scatter(
    x=snu['year'],
    y=snu['score'],
    mode="lines+markers",
    name="Seoul National University(KOR)",
    marker={'color': 'rgb(50, 96, 171)'},
    text=snu['world_rank'],
)

# University of Tokyo
trace3 = go.Scatter(
    x=ut['year'],
    y=ut['score'],
    mode="lines+markers",
    name="University of Tokyo(JPN)",
    marker={'color': 'rgb(50, 171, 96)'},
    text=ut['world_rank'],
)

data = [trace1, trace2, trace3]
layout = {
    'title': 'Top univerisity of S.Korea, Japan and China',
    # dtick=1 places one x tick per year
    'xaxis': {'title': 'Year', 'zeroline': False, 'dtick': 1},
    'yaxis': {'title': 'Score', 'zeroline': False},
}

fig = {'data': data, 'layout': layout}
iplot(fig)


Even though the University of Tokyo ranks far higher than SNU and PU, SNU has been climbing over the past 4 years.

In [None]:
# Preview the first rows of the Peking University frame.
pu.head()

In [None]:
# Preview the first rows of the Seoul National University frame.
snu.head()

In [None]:
# Preview the first rows of the University of Tokyo frame.
ut.head()

* ### Conclusion

In terms of world ranking, South Korea is falling behind Japan and China within the Top 100.<br/>
It is worth digging into which indicators drive this result.<br/>

In the case of Korean universities, there is not much difference between 2014 and 2015.
