In [80]:
# Set up and import dependencies
import os
import re
import warnings
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

warnings.filterwarnings("ignore")

In [81]:
# Load the raw QS World University Rankings 2019 export and preview the
# first rows
uni_2019 = pd.read_csv(
    "Resources/QS World University Rankings 2019.csv",
)
uni_2019.head()

Unnamed: 0,year,rank_display,university,score,link,country,city,region,logo
0,2019,1,Massachusetts Institute of Technology (MIT),100.0,https://www.topuniversities.com/universities/m...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...
1,2019,2,Stanford University,98.6,https://www.topuniversities.com/universities/s...,United States,Stanford,North America,https://www.topuniversities.com/sites/default/...
2,2019,3,Harvard University,98.5,https://www.topuniversities.com/universities/h...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...
3,2019,4,California Institute of Technology (Caltech),97.2,https://www.topuniversities.com/universities/c...,United States,Pasadena,North America,https://www.topuniversities.com/sites/default/...
4,2019,5,University of Oxford,96.8,https://www.topuniversities.com/universities/u...,United Kingdom,Oxford,Europe,https://www.topuniversities.com/sites/default/...


In [82]:
# Keep only the columns used downstream. The selection is copied so that
# columns added to uni_rank_2019 later are not written into uni_2019.
uni_rank_2019 = uni_2019[["year", "rank_display","university","score","country","city","region","link"]].copy()

In [83]:
# Analyse dataframe: renumber the rows from 0 and display the frame
uni_rank_2019 = uni_rank_2019.reset_index(drop=True)
uni_rank_2019

Unnamed: 0,year,rank_display,university,score,country,city,region,link
0,2019,1,Massachusetts Institute of Technology (MIT),100.0,United States,Cambridge,North America,https://www.topuniversities.com/universities/m...
1,2019,2,Stanford University,98.6,United States,Stanford,North America,https://www.topuniversities.com/universities/s...
2,2019,3,Harvard University,98.5,United States,Cambridge,North America,https://www.topuniversities.com/universities/h...
3,2019,4,California Institute of Technology (Caltech),97.2,United States,Pasadena,North America,https://www.topuniversities.com/universities/c...
4,2019,5,University of Oxford,96.8,United Kingdom,Oxford,Europe,https://www.topuniversities.com/universities/u...
...,...,...,...,...,...,...,...,...
1013,2019,1000+,Université de Technologie de Compiègne (UTC),,France,Compiègne,Europe,https://www.topuniversities.com/universities/u...
1014,2019,1000+,"University of California, San Francisco",,United States,San Francisco,North America,https://www.topuniversities.com/universities/u...
1015,2019,1000+,Prague University of Economics and Business,,Czech Republic,Prague,Europe,https://www.topuniversities.com/universities/p...
1016,2019,1000+,Weizmann Institute of Science,,Israel,Rehovot,Asia,https://www.topuniversities.com/universities/w...


In [84]:
# Count the rows that exactly repeat an earlier row
uni_rank_2019.duplicated(keep="first").sum()

0

In [85]:
# True when at least one cell anywhere in the dataframe is missing
uni_rank_2019.isna().to_numpy().any()

True

In [86]:
# Number of missing values in each column
uni_rank_2019.isna().sum()

year              0
rank_display      0
university        0
score           515
country           0
city             25
region            0
link              0
dtype: int64

In [87]:
# Summarise the dataframe: index range, non-null count and dtype of every
# column, and memory usage
uni_rank_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018 entries, 0 to 1017
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          1018 non-null   int64  
 1   rank_display  1018 non-null   object 
 2   university    1018 non-null   object 
 3   score         503 non-null    float64
 4   country       1018 non-null   object 
 5   city          993 non-null    object 
 6   region        1018 non-null   object 
 7   link          1018 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 63.8+ KB


In [88]:
# Transform data (cleanse) - Column: City

# Measure the fixed prefix shared by the university links; the text after
# this prefix is the part of the URL that may contain the city name
len("https://www.topuniversities.com/universities/")

45

In [89]:
# Keep everything after the 45-character URL prefix (the university slug,
# which may contain the city name) in a new column, then display the frame
uni_rank_2019["city_extract"] = uni_rank_2019['link'].str.slice(start=45)
uni_rank_2019

Unnamed: 0,year,rank_display,university,score,country,city,region,link,city_extract
0,2019,1,Massachusetts Institute of Technology (MIT),100.0,United States,Cambridge,North America,https://www.topuniversities.com/universities/m...,massachusetts-institute-technology-mit
1,2019,2,Stanford University,98.6,United States,Stanford,North America,https://www.topuniversities.com/universities/s...,stanford-university
2,2019,3,Harvard University,98.5,United States,Cambridge,North America,https://www.topuniversities.com/universities/h...,harvard-university
3,2019,4,California Institute of Technology (Caltech),97.2,United States,Pasadena,North America,https://www.topuniversities.com/universities/c...,california-institute-technology-caltech
4,2019,5,University of Oxford,96.8,United Kingdom,Oxford,Europe,https://www.topuniversities.com/universities/u...,university-oxford
...,...,...,...,...,...,...,...,...,...
1013,2019,1000+,Université de Technologie de Compiègne (UTC),,France,Compiègne,Europe,https://www.topuniversities.com/universities/u...,universite-de-technologie-de-compiegne-utc
1014,2019,1000+,"University of California, San Francisco",,United States,San Francisco,North America,https://www.topuniversities.com/universities/u...,university-california-san-francisco
1015,2019,1000+,Prague University of Economics and Business,,Czech Republic,Prague,Europe,https://www.topuniversities.com/universities/p...,prague-university-economics-business
1016,2019,1000+,Weizmann Institute of Science,,Israel,Rehovot,Asia,https://www.topuniversities.com/universities/w...,weizmann-institute-science


In [90]:
# The link column is no longer needed once city_extract has been derived
uni_rank_2019 = uni_rank_2019.drop(columns=["link"])

# Show the rows whose city is missing
uni_rank_2019.loc[uni_rank_2019["city"].isna()]

Unnamed: 0,year,rank_display,university,score,country,city,region,city_extract
49,2019,50,Université PSL,75.1,France,,Europe,universite-psl
74,2019,75,Sorbonne University,64.8,France,,Europe,sorbonne-university
140,2019,141,Aarhus University,50.7,Denmark,,Europe,aarhus-university
179,2019,180,Queen's University Belfast,45.9,United Kingdom,,Europe,queens-university-belfast
240,2019,239,Université Paris-Saclay,38.9,France,,Europe,universite-paris-saclay
264,2019,264,Kyung Hee University,36.7,South Korea,,Asia,kyung-hee-university
331,2019,332,Brunel University London,32.0,United Kingdom,,Europe,brunel-university-london
363,2019,363,Oxford Brookes University,30.1,United Kingdom,,Europe,oxford-brookes-university
372,2019,373,National Research Tomsk Polytechnic University,29.4,Russia,,Europe,national-research-tomsk-polytechnic-university
434,2019,435,Bangor University,26.4,United Kingdom,,Europe,bangor-university


In [91]:
# Number of rows with a missing city
uni_rank_2019["city"].isna().sum()

25

In [92]:
# Work on an independent copy so the cleansing steps below leave
# uni_rank_2019 unchanged
df = uni_rank_2019.copy()

In [93]:
# Break the slug on '-' and expose its first six pieces as the columns
# city_e1 .. city_e6 (NaN where the slug has fewer pieces), then list the
# rows that still have no city so the pieces can be inspected
df = df.assign(**{
    f"city_e{position + 1}": df["city_extract"].str.split('-').str[position]
    for position in range(6)
})
df[df["city"].isna()]

Unnamed: 0,year,rank_display,university,score,country,city,region,city_extract,city_e1,city_e2,city_e3,city_e4,city_e5,city_e6
49,2019,50,Université PSL,75.1,France,,Europe,universite-psl,universite,psl,,,,
74,2019,75,Sorbonne University,64.8,France,,Europe,sorbonne-university,sorbonne,university,,,,
140,2019,141,Aarhus University,50.7,Denmark,,Europe,aarhus-university,aarhus,university,,,,
179,2019,180,Queen's University Belfast,45.9,United Kingdom,,Europe,queens-university-belfast,queens,university,belfast,,,
240,2019,239,Université Paris-Saclay,38.9,France,,Europe,universite-paris-saclay,universite,paris,saclay,,,
264,2019,264,Kyung Hee University,36.7,South Korea,,Asia,kyung-hee-university,kyung,hee,university,,,
331,2019,332,Brunel University London,32.0,United Kingdom,,Europe,brunel-university-london,brunel,university,london,,,
363,2019,363,Oxford Brookes University,30.1,United Kingdom,,Europe,oxford-brookes-university,oxford,brookes,university,,,
372,2019,373,National Research Tomsk Polytechnic University,29.4,Russia,,Europe,national-research-tomsk-polytechnic-university,national,research,tomsk,polytechnic,university,
434,2019,435,Bangor University,26.4,United Kingdom,,Europe,bangor-university,bangor,university,,,,


In [94]:
# Update the city column for the rows where it is missing.
# Keys are row labels of df (0..n-1 after the earlier reset_index); values are
# the city names chosen by hand from the university name / URL slug listed in
# the previous cell.
missing_city_by_row = {
    49: "Paris",
    74: "Paris",
    140: "Aarhus",
    179: "Belfast",
    240: "Paris",
    264: "Seoul",  # Kyung Hee University; "Kyung Hee" is not a city name
    331: "London",
    363: "Oxford",
    372: "Tomsk",
    434: "Bangor",
    444: "Macau",
    481: "Kuala Lumpur",
    499: "Singapore",
    539: "Paris",
    678: "Olomouc",
    686: "Lille",
    741: "Greenwich",
    761: "Madaba",
    762: "New Delhi",
    772: "Lviv",
    794: "Brighton",
    831: "Changsha",
    875: "Poznan",
    934: "Vigo",
    1011: "Stockholm",
}

# fillna matches the dict keys against the index and only writes where the
# city is still NaN, so an existing city is never overwritten (the previous
# positional iloc[[row],[5]] writes replaced whatever was in that cell and
# relied on "city" being the sixth column).
df["city"] = df["city"].fillna(missing_city_by_row)

In [95]:
# List any rows that still have a missing city
df[df["city"].isna()]

Unnamed: 0,year,rank_display,university,score,country,city,region,city_extract,city_e1,city_e2,city_e3,city_e4,city_e5,city_e6


In [96]:
# Remove the slug column and its split pieces now that city is filled in
df = df.drop(
    columns=["city_e1","city_e2","city_e3","city_e4","city_e5","city_e6","city_extract"],
)
df.head()

Unnamed: 0,year,rank_display,university,score,country,city,region
0,2019,1,Massachusetts Institute of Technology (MIT),100.0,United States,Cambridge,North America
1,2019,2,Stanford University,98.6,United States,Stanford,North America
2,2019,3,Harvard University,98.5,United States,Cambridge,North America
3,2019,4,California Institute of Technology (Caltech),97.2,United States,Pasadena,North America
4,2019,5,University of Oxford,96.8,United Kingdom,Oxford,Europe


In [97]:
# Transform data (cleanse) - Column: Rank_Display & Score

# Build the "rank" column from rank_display: keep the text before the first
# '-' (presumably the lower bound of a range — confirm against the data) and
# before the first '+' (e.g. "1000+" -> "1000").
# The two splits are chained so the second one works on the result of the
# first. Assigning each split of rank_display to df["rank"] separately would
# make the later assignment discard the earlier one.
df["rank"] = (
    df["rank_display"]
    .str.split('-').str[0]
    .str.split('+').str[0]
)
df.tail(10)

Unnamed: 0,year,rank_display,university,score,country,city,region,rank
1008,2019,1000+,Karolinska Institutet,,Sweden,Stockholm,Europe,1000
1009,2019,1000+,King Abdullah University of Science & Technolo...,,Saudi Arabia,Thuwal,Asia,1000
1010,2019,1000+,London Business School,,United Kingdom,London,Europe,1000
1011,2019,1000+,Stockholm School of Economics,,Sweden,Stockholm,Europe,1000
1012,2019,1000+,Bocconi University,,Italy,Milan,Europe,1000
1013,2019,1000+,Université de Technologie de Compiègne (UTC),,France,Compiègne,Europe,1000
1014,2019,1000+,"University of California, San Francisco",,United States,San Francisco,North America,1000
1015,2019,1000+,Prague University of Economics and Business,,Czech Republic,Prague,Europe,1000
1016,2019,1000+,Weizmann Institute of Science,,Israel,Rehovot,Asia,1000
1017,2019,1000+,WHU - Otto Beisheim School of Management,,Germany,Vallendar,Europe,1000


In [98]:
# List the rows whose rank starts with "="
df.loc[df["rank"].str.match('^=.*').eq(True)]

Unnamed: 0,year,rank_display,university,score,country,city,region,rank


In [99]:
# Remove all the unwanted characters.
# The result is assigned back to the column: calling
# replace(..., inplace=True) on df["rank"] modifies an intermediate Series,
# which leaves df unchanged when pandas Copy-on-Write is active.
df["rank"] = df["rank"].replace("=", '', regex=True)
df["rank"] = df['rank'].str.split('-').str[0]

# Check whether all the unwanted characters had been removed
df[df["rank"].str.match('^=.*')==True]

Unnamed: 0,year,rank_display,university,score,country,city,region,rank


In [100]:
# Replace missing values in score, rank_display and rank with 0
df = df.assign(
    score=df["score"].fillna(0),
    rank_display=df["rank_display"].fillna(0),
    rank=df["rank"].fillna(0),
)

# Remaining missing values per column
df.isna().sum()

year            0
rank_display    0
university      0
score           0
country         0
city            0
region          0
rank            0
dtype: int64

In [101]:
# Parse the rank strings into a numeric column
df = df.assign(rank=pd.to_numeric(df["rank"]))

In [102]:
# Renumber the rows from 0 and copy that numbering into an explicit "id"
# column, which will serve as the table "id"
df = df.reset_index(drop=True)
df = df.assign(id=df.index)
# Display only: the result of set_index is not assigned, so df keeps "id"
# as a regular column
df.set_index("id")

Unnamed: 0_level_0,year,rank_display,university,score,country,city,region,rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2019,1,Massachusetts Institute of Technology (MIT),100.0,United States,Cambridge,North America,1
1,2019,2,Stanford University,98.6,United States,Stanford,North America,2
2,2019,3,Harvard University,98.5,United States,Cambridge,North America,3
3,2019,4,California Institute of Technology (Caltech),97.2,United States,Pasadena,North America,4
4,2019,5,University of Oxford,96.8,United Kingdom,Oxford,Europe,5
...,...,...,...,...,...,...,...,...
1013,2019,1000+,Université de Technologie de Compiègne (UTC),0.0,France,Compiègne,Europe,1000
1014,2019,1000+,"University of California, San Francisco",0.0,United States,San Francisco,North America,1000
1015,2019,1000+,Prague University of Economics and Business,0.0,Czech Republic,Prague,Europe,1000
1016,2019,1000+,Weizmann Institute of Science,0.0,Israel,Rehovot,Asia,1000


In [103]:
# Summarise the cleansed dataframe: non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1018 entries, 0 to 1017
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          1018 non-null   int64  
 1   rank_display  1018 non-null   object 
 2   university    1018 non-null   object 
 3   score         1018 non-null   float64
 4   country       1018 non-null   object 
 5   city          1018 non-null   object 
 6   region        1018 non-null   object 
 7   rank          1018 non-null   int64  
 8   id            1018 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 71.7+ KB


In [104]:
# Transform data - Finalise DataFrames

# Ranking table: year, rank and score, indexed by the university "id".
# set_index returns a new frame, so no list placeholder, explicit copy or
# inplace call is needed.
uni_rank_transformed = df[["id","year","rank","score"]].set_index("id")
uni_rank_transformed.head()

Unnamed: 0_level_0,year,rank,score
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2019,1,100.0
1,2019,2,98.6
2,2019,3,98.5
3,2019,4,97.2
4,2019,5,96.8


In [106]:
# Details table: university name and location, indexed by the university "id".
# set_index returns a new frame, so no list placeholder, explicit copy or
# inplace call is needed.
uni_details_transformed = df[["id","university","city","country","region"]].set_index("id")
uni_details_transformed.head()

Unnamed: 0_level_0,university,city,country,region
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Massachusetts Institute of Technology (MIT),Cambridge,United States,North America
1,Stanford University,Stanford,United States,North America
2,Harvard University,Cambridge,United States,North America
3,California Institute of Technology (Caltech),Pasadena,United States,North America
4,University of Oxford,Oxford,United Kingdom,Europe


In [109]:
# Create postgresql database connection

# Read the connection settings from environment variables so that no password
# is stored in the notebook. The defaults reproduce the user, host, port and
# database name this notebook previously hard-coded.
db_user = os.environ.get("PGUSER", "postgres")
# quote_plus escapes characters such as "@", ":", "/" or spaces that would
# otherwise be misread as URL delimiters
db_password = quote_plus(os.environ.get("PGPASSWORD", ""))
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "QS_Uni_Ranking_2019")

connection_string = f"{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{connection_string}')

In [110]:
# Confirm tables creation in pgAdmin4
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list the tables of a database.
inspect(engine).get_table_names()

['uni_rank', 'uni_details']

In [111]:
# Load pandas DataFrames into database

# Append the ranking rows to the 'uni_rank' table; index=True writes the
# "id" index as a column.
# NOTE(review): if_exists='append' inserts the rows again on every run —
# confirm the table is empty (or protected by a key) before re-running.
uni_rank_transformed.to_sql(name='uni_rank', con=engine, if_exists='append', index=True)

In [112]:
# Append the university details to the 'uni_details' table; index=True writes
# the "id" index as a column.
# NOTE(review): if_exists='append' inserts the rows again on every run —
# confirm the table is empty (or protected by a key) before re-running.
uni_details_transformed.to_sql(name='uni_details', con=engine, if_exists='append', index=True)