In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
%matplotlib inline

## Loading the data

In [6]:
wru_ranking = pd.read_csv("Word Rank University.csv")

### Viewing the dataframde

In [7]:
wru_ranking.head(10)

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1Top 0.1%,Harvard University\r\n CWUR Rating System: ...,USA,1,1,1,1,1,100.0
1,2Top 0.1%,Massachusetts Institute of Technology\r\n Edu...,USA,2,4,12,3,9,96.7
2,3Top 0.1%,Stanford University\r\n Education: A+; Employ...,USA,3,11,4,2,2,95.2
3,4Top 0.1%,University of Cambridge\r\n Education: A+; Em...,United Kingdom,1,3,25,5,11,94.1
4,5Top 0.1%,University of Oxford\r\n Education: A+; Emplo...,United Kingdom,2,7,27,9,4,93.3
5,6Top 0.1%,Princeton University\r\n Education: A+; Emplo...,USA,4,5,16,6,82,92.6
6,7Top 0.1%,University of Chicago\r\n Education: A+; Empl...,USA,5,8,15,22,28,92.1
7,8Top 0.1%,Columbia University\r\n Education: A+; Employ...,USA,6,12,14,10,13,91.6
8,9Top 0.1%,University of Pennsylvania\r\n Education: A; ...,USA,7,14,8,45,12,91.2
9,10Top 0.1%,Yale University\r\n Education: A+; Employabil...,USA,8,6,35,13,21,90.8


## 1. Data Profiling:

In [8]:
wru_ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   World Rank          2000 non-null   object 
 1   University Names    2000 non-null   object 
 2   Location            2000 non-null   object 
 3   National Rank       2000 non-null   int64  
 4   Educational Rank    2000 non-null   object 
 5   Employability Rank  2000 non-null   object 
 6   Faculty Rank        2000 non-null   object 
 7   Research Rank       2000 non-null   object 
 8   Score               2000 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 140.8+ KB


In [9]:
wru_ranking.shape

(2000, 9)

In [10]:
wru_ranking.dtypes

World Rank             object
University Names       object
Location               object
National Rank           int64
Educational Rank       object
Employability Rank     object
Faculty Rank           object
Research Rank          object
Score                 float64
dtype: object

In [11]:
wru_ranking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   World Rank          2000 non-null   object 
 1   University Names    2000 non-null   object 
 2   Location            2000 non-null   object 
 3   National Rank       2000 non-null   int64  
 4   Educational Rank    2000 non-null   object 
 5   Employability Rank  2000 non-null   object 
 6   Faculty Rank        2000 non-null   object 
 7   Research Rank       2000 non-null   object 
 8   Score               2000 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 140.8+ KB


## Data Quality Checks

### 1. Reliability of the data

Data is reliable from the source as it is consistent with their website.

### 2. Timeliness of the data

Data is updated as of 2024

### 3. Consistency of the data

Data is consistent with data found on wru website

### 4. Relevance of the data

In [12]:
wru_ranking.columns

Index(['World Rank', 'University Names', 'Location ', 'National Rank',
       'Educational Rank', 'Employability Rank', 'Faculty Rank',
       'Research Rank', 'Score'],
      dtype='object')

The data is relevant to the case study we intend to study  and its columns are relevant to the questions we intend to answer.

### 5. Uniqueness of the data

In [13]:
wru_ranking.duplicated().sum()

0

No duplicated rows found in the dataset

### 6. Completeness of the data

In [14]:
wru_ranking.isnull().sum()

World Rank            0
University Names      0
Location              0
National Rank         0
Educational Rank      0
Employability Rank    0
Faculty Rank          0
Research Rank         0
Score                 0
dtype: int64

We can see that there are no null values. However nulls are represented by - in the dataset. We will replace these with NaNs.

In [15]:
(wru_ranking['National Rank']=='-').sum()

0

In [16]:
(wru_ranking['Educational Rank']=='-').sum()

1562

In [17]:
(wru_ranking['Employability Rank']=='-').sum()

967

In [18]:
(wru_ranking['Faculty Rank']=='-').sum()

1727

In [19]:
(wru_ranking['Research Rank']=='-').sum()

66

In [20]:
(wru_ranking['Score']=='-').sum()

0

In [21]:
wru_ranking['National Rank'] = wru_ranking['National Rank'].replace('-', np.nan)
wru_ranking['National Rank'] = wru_ranking['National Rank'].astype(float)

wru_ranking['Educational Rank'] = wru_ranking['Educational Rank'].replace('-', np.nan)
wru_ranking['Educational Rank'] = wru_ranking['Educational Rank'].astype(float)

wru_ranking['Employability Rank'] = wru_ranking['Employability Rank'].replace('-', np.nan)
wru_ranking['Employability Rank'] = wru_ranking['Employability Rank'].astype(float)

wru_ranking['Faculty Rank'] = wru_ranking['Faculty Rank'].replace('-', np.nan)
wru_ranking['Faculty Rank'] = wru_ranking['Faculty Rank'].astype(float)

wru_ranking['Research Rank'] = wru_ranking['Research Rank'].replace('-', np.nan)
wru_ranking['Research Rank'] = wru_ranking['Research Rank'].astype(float)

In [22]:
wru_ranking.isnull().sum()

World Rank               0
University Names         0
Location                 0
National Rank            0
Educational Rank      1562
Employability Rank     967
Faculty Rank          1727
Research Rank           66
Score                    0
dtype: int64

Now We can handle the missing values better in the data cleaning section.

In [23]:
wru_ranking.head()

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1Top 0.1%,Harvard University\r\n CWUR Rating System: ...,USA,1.0,1.0,1.0,1.0,1.0,100.0
1,2Top 0.1%,Massachusetts Institute of Technology\r\n Edu...,USA,2.0,4.0,12.0,3.0,9.0,96.7
2,3Top 0.1%,Stanford University\r\n Education: A+; Employ...,USA,3.0,11.0,4.0,2.0,2.0,95.2
3,4Top 0.1%,University of Cambridge\r\n Education: A+; Em...,United Kingdom,1.0,3.0,25.0,5.0,11.0,94.1
4,5Top 0.1%,University of Oxford\r\n Education: A+; Emplo...,United Kingdom,2.0,7.0,27.0,9.0,4.0,93.3


### 7. Accuracy of the data

In [24]:
wru_ranking.dtypes

World Rank             object
University Names       object
Location               object
National Rank         float64
Educational Rank      float64
Employability Rank    float64
Faculty Rank          float64
Research Rank         float64
Score                 float64
dtype: object

The types we have is accurate. Except the world rank we will change it to integer.
We will change the data types of the ranks to integers in the updated nonull dataframe.

We will now check outliers in the data using boxplots.

In [59]:
fig = px.box(data_frame=wru_ranking, y="Score")
fig.show()

There are many upper whiskers in the boxplot. However, it is logical to have these outliers as the data is based on the performance of the universities. We will not remove these outliers.

In [60]:
fig = px.box(data_frame=wru_ranking, y="Research Rank")
fig.show()

As this columns represents data, it is logical to have no outliers in the data.

## Data Cleaning

We will change the data types of the world ranks column to integer.

In [27]:
wru_ranking['World Rank'] = wru_ranking['World Rank'].apply(lambda x: x.split('T')[0])
wru_ranking['World Rank'] = wru_ranking['World Rank'].astype(int)
wru_ranking.head()

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\r\n CWUR Rating System: ...,USA,1.0,1.0,1.0,1.0,1.0,100.0
1,2,Massachusetts Institute of Technology\r\n Edu...,USA,2.0,4.0,12.0,3.0,9.0,96.7
2,3,Stanford University\r\n Education: A+; Employ...,USA,3.0,11.0,4.0,2.0,2.0,95.2
3,4,University of Cambridge\r\n Education: A+; Em...,United Kingdom,1.0,3.0,25.0,5.0,11.0,94.1
4,5,University of Oxford\r\n Education: A+; Emplo...,United Kingdom,2.0,7.0,27.0,9.0,4.0,93.3


In [28]:
wru_ranking.dtypes

World Rank              int32
University Names       object
Location               object
National Rank         float64
Educational Rank      float64
Employability Rank    float64
Faculty Rank          float64
Research Rank         float64
Score                 float64
dtype: object

Then We will clean the university names column

In [29]:
wru_ranking['University Names'] = wru_ranking['University Names'].apply(lambda x: x.split("\n")[0])
wru_ranking.head()

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\r,USA,1.0,1.0,1.0,1.0,1.0,100.0
1,2,Massachusetts Institute of Technology\r,USA,2.0,4.0,12.0,3.0,9.0,96.7
2,3,Stanford University\r,USA,3.0,11.0,4.0,2.0,2.0,95.2
3,4,University of Cambridge\r,United Kingdom,1.0,3.0,25.0,5.0,11.0,94.1
4,5,University of Oxford\r,United Kingdom,2.0,7.0,27.0,9.0,4.0,93.3


We have to handle the null data. So we will drops the NaN values in the dataset and create a dataset with no null values. So we can study the ranks that have missing values better. and use the original dataset to study the general ranks.

In [30]:
wru_nonull = wru_ranking.dropna(axis=0, how='any')
wru_nonull.isnull().sum()

World Rank            0
University Names      0
Location              0
National Rank         0
Educational Rank      0
Employability Rank    0
Faculty Rank          0
Research Rank         0
Score                 0
dtype: int64

And we will change the data types of the ranks to integers.

In [31]:
wru_nonull['National Rank'] = wru_nonull['National Rank'].astype(int)
wru_nonull['Educational Rank'] = wru_nonull['Educational Rank'].astype(int)
wru_nonull['Employability Rank'] = wru_nonull['Employability Rank'].astype(int)
wru_nonull['Faculty Rank'] = wru_nonull['Faculty Rank'].astype(int)
wru_nonull['Research Rank'] = wru_nonull['Research Rank'].astype(int)

wru_nonull.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wru_nonull['National Rank'] = wru_nonull['National Rank'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wru_nonull['Educational Rank'] = wru_nonull['Educational Rank'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wru_nonull['Employability Rank'] = wru_nonull['Employabilit

World Rank              int32
University Names       object
Location               object
National Rank           int32
Educational Rank        int32
Employability Rank      int32
Faculty Rank            int32
Research Rank           int32
Score                 float64
dtype: object

In [32]:
wru_ranking.sort_values(by='Employability Rank', ascending=True).head(10)

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\r,USA,1.0,1.0,1.0,1.0,1.0,100.0
119,120,INSEAD,France,6.0,,2.0,,1710.0,81.1
210,211,Institut national du service public,France,11.0,,3.0,,,78.5
2,3,Stanford University\r,USA,3.0,11.0,4.0,2.0,2.0,95.2
273,274,China Europe International Business School,China,32.0,,5.0,,,77.3
12,13,University of Tokyo,Japan,1.0,37.0,6.0,113.0,30.0,89.8
279,280,HEC Paris,France,15.0,,7.0,,,77.2
8,9,University of Pennsylvania\r,USA,7.0,14.0,8.0,45.0,12.0,91.2
44,45,Institut Polytechnique de Paris,France,5.0,102.0,9.0,188.0,370.0,85.1
405,406,International Institute for Management Develop...,Switzerland,8.0,,10.0,,,75.3


In [33]:
wru_ranking.head()

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\r,USA,1.0,1.0,1.0,1.0,1.0,100.0
1,2,Massachusetts Institute of Technology\r,USA,2.0,4.0,12.0,3.0,9.0,96.7
2,3,Stanford University\r,USA,3.0,11.0,4.0,2.0,2.0,95.2
3,4,University of Cambridge\r,United Kingdom,1.0,3.0,25.0,5.0,11.0,94.1
4,5,University of Oxford\r,United Kingdom,2.0,7.0,27.0,9.0,4.0,93.3


In [34]:
wru_ranking1 = wru_ranking[wru_ranking['World Rank'] == 1]
wru_ranking1

Unnamed: 0,World Rank,University Names,Location,National Rank,Educational Rank,Employability Rank,Faculty Rank,Research Rank,Score
0,1,Harvard University\r,USA,1.0,1.0,1.0,1.0,1.0,100.0


## Data Visualization

In [96]:
fig = px.histogram(wru_ranking, x="Score", title="Distribution of Scores")

fig.show()

We can see that the histogram above indicates a right skewness in the data. This is logical as the higher scores are less common.

In [108]:
top_10_universities = wru_ranking.sort_values(by='World Rank', ascending=False).tail(10)
fig = px.bar(top_10_universities, x='World Rank', y='University Names', orientation='h',
             title='Top 10 Universities', labels={'World Rank': 'World Rank'}, color='University Names')
fig.show()

This bar plot answers the first question.

From the bar plot above we can see the top 10 universities in the world. With Harvard University being the best university in the world.

In [91]:
top_10_universities = wru_ranking.sort_values(by='Employability Rank', ascending=True).head(10)
fig = px.bar(top_10_universities, x='Employability Rank', y='University Names', orientation='h',
             title='Top 10 Universities by Employability Rank', labels={'Employability Rank': 'Employability Rank'})
fig.update_layout(xaxis={'categoryorder':'total ascending'})
fig.show()

This bar plot answers the second question.

From the bar plot above we can see the top 10 universities in the world, in term of Employment after graduation. With Harvard University being the best university in the world. Also we can notice differences between the top 10 here and the top 10 overall.

In [102]:
corr_matrix = wru_ranking.corr()
fig = px.imshow(corr_matrix.values,
                labels=dict(color="Correlation"),
                x=corr_matrix.index,
                y=corr_matrix.columns,
                color_continuous_scale='Purp')

fig.update_layout(title='Correlation Matrix',
                  width=800,
                  height=600)

annotations = []
for i, row in enumerate(corr_matrix.index):
    for j, col in enumerate(corr_matrix.columns):
        annotations.append(dict(x=col, y=row, text=str(corr_matrix.iloc[i, j].round(2)),
                                showarrow=False, font=dict(color='black')))
fig.update_layout(annotations=annotations)

fig.show()





This heatmap answers the third question.

As we can see from the heatmap, the highest correlation is with the Research rank. Second is Faculty rank. And the lowest correlation is with the Educational rank. However, we must consider that there are many null values in the ranks other than the national rank (0), and the research rank (66). Thus affecting the correlation.

In [95]:
fig = px.pie(wru_ranking.head(100), names='Location ', title='Location of Top 100 Universities')
fig.show()

To answer the third question, we graphed this pie chart to show the distribution of the universities in the dataset by country. We can see that the United States has the most universities in the dataset.

In [80]:
fig = px.scatter(wru_ranking.head(100), x="National Rank", y="World Rank", 
                 color="Location ", size="Score", hover_name="University Names",
                 title="Correlation between National and Global University Rankings")


fig.show()

Then from the scatter plot above, we can see how there is a positive correlation between the world rank and the research rank. And we can see the strongest relation between the national rank and the world rank is in the USA, followed by the UK.

In [88]:
top_10_universities = wru_ranking.sort_values(by='World Rank', ascending=True).tail(10)
fig = px.bar(top_10_universities, x='World Rank', y='University Names', orientation='h',
             title='Top 10 Universities', labels={'World Rank': 'World Rank'})
fig.show()

This bar plot answers an extra question on what 10 universities ranks the least in the top 2000 universities.


In [112]:
wru_saudi = wru_ranking[wru_ranking['Location '] == 'Saudi Arabia']
top_10_saudi = wru_saudi.sort_values(by='World Rank', ascending=False).tail(10)
fig = px.bar(top_10_saudi, x='World Rank', y='University Names', orientation='h',
             title='Top 10 Universities in Saudi Arabia', labels={'World Rank': 'World Rank'})
fig.show()

This barplot answers an extra question on the top 10 ranked universities in saudi arabia.