In [1]:
import pandas as pd
import re
import math
import numpy as np

## Preprocessing

In [2]:
# Raw GitHub URLs of the four reference CSV files read by this notebook:
# the Shanghai, Times and CWUR ranking tables and the school/country table.
shanghaiDataset = "https://raw.githubusercontent.com/DNQT/Foundation-of-Computer-Science-Project/main/Reference%20Databases/shanghaiData.csv"
timesDataset = "https://raw.githubusercontent.com/DNQT/Foundation-of-Computer-Science-Project/main/Reference%20Databases/timesData.csv"
worldDataset="https://raw.githubusercontent.com/DNQT/Foundation-of-Computer-Science-Project/main/Reference%20Databases/cwurData.csv"
school_and_country_Dataset="https://raw.githubusercontent.com/DNQT/Foundation-of-Computer-Science-Project/main/Reference%20Databases/school_and_country_table.csv"

In [3]:
def readDataSet(url , sep =',' , encoding='utf8',header='infer'):
    '''
    Load a delimited text file into a dataframe.

    @url : path, URL or file-like object handed to pandas.read_csv
    @sep : field delimiter
    @encoding : text encoding of the file
    @header : header row specification, forwarded unchanged to pandas.read_csv

    Returns the dataframe produced by pandas.read_csv.
    '''
    read_options = {'sep': sep, 'header': header, 'encoding': encoding}
    return pd.read_csv(url, **read_options)

In [4]:
# Load each reference table from its remote CSV into its own dataframe.
shanghai_df = readDataSet(url=shanghaiDataset)
times_df = readDataSet(url=timesDataset)
world_df = readDataSet(url=worldDataset)
school_and_country_df = readDataSet(url=school_and_country_Dataset)

In [5]:
# Drop the rows of the Shanghai dataframe that have no university name,
# then renumber the remaining rows from 0.
index = shanghai_df.loc[shanghai_df['university_name'].isna()]
shanghai_df.drop(index=index.index, inplace=True)
shanghai_df.reset_index(inplace=True, drop=True)

In [6]:
# Rename the 'institution' column of world_df to 'university_name'.

world_df.rename(columns={'institution':'university_name'},inplace=True)

In [7]:
shanghai_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4896 entries, 0 to 4895
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4896 non-null   object 
 1   university_name  4896 non-null   object 
 2   national_rank    4896 non-null   object 
 3   total_score      1101 non-null   float64
 4   alumni           4896 non-null   float64
 5   award            4895 non-null   float64
 6   hici             4895 non-null   float64
 7   ns               4875 non-null   float64
 8   pub              4895 non-null   float64
 9   pcp              4895 non-null   float64
 10  year             4896 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 420.9+ KB


In [8]:
times_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [9]:
world_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   university_name       2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


## 1. For each university, extract from the times dataset the most recent and the least recent data, obtaining two separate dataframes

In [10]:
def extract_dataset_min_max(university_name, year, index, dataset=None):
    '''
    Extract, for every university, the row holding its most recent or its
    least recent data.

    @university_name : name of the column that identifies the university
    @year : name of the column that holds the year
    @index : 'max' selects the most recent row of each university;
             any other value selects the least recent one
    @dataset : dataframe to search; when omitted the module-level
               times_df is used

    Returns a dataframe with one row per university that keeps the index
    labels of the selected rows.
    '''
    if dataset is None:
        dataset = times_df

    years_by_university = dataset.groupby(university_name)[year]
    if index == 'max':
        row_labels = years_by_university.idxmax()
    else:
        row_labels = years_by_university.idxmin()

    # idxmax/idxmin return index *labels*, so the lookup has to be
    # label-based (.loc). Positional .iloc only gives the same rows while
    # the dataframe still has its default 0..n-1 index.
    return dataset.loc[row_labels]

In [11]:
times_most_recent_data = extract_dataset_min_max('university_name','year','max')

In [12]:
times_least_recent_data = extract_dataset_min_max('university_name','year','min')

In [13]:
times_most_recent_data.to_csv("times_most_recent_data")
times_least_recent_data.to_csv("times_least_recent_data")

In [14]:
times_most_recent_data[['world_rank','university_name','year']].head(10)

Unnamed: 0,world_rank,university_name,year
2405,601-800,AGH University of Science and Technology,2016
2003,201-250,Aalborg University,2016
2056,251-300,Aalto University,2016
1908,=106,Aarhus University,2016
2105,301-350,Aberystwyth University,2016
2404,601-800,Adam Mickiewicz University,2016
2057,251-300,Aix-Marseille University,2016
2406,601-800,Ajou University,2016
2408,601-800,Alexandria University,2016
2409,601-800,Alexandru Ioan Cuza University,2016


In [15]:
times_least_recent_data[['world_rank','university_name','year']].head(10)

Unnamed: 0,world_rank,university_name,year
2405,601-800,AGH University of Science and Technology,2016
501,301-350,Aalborg University,2012
502,301-350,Aalto University,2012
166,167,Aarhus University,2011
476,276-300,Aberystwyth University,2012
2404,601-800,Adam Mickiewicz University,2016
2057,251-300,Aix-Marseille University,2016
2406,601-800,Ajou University,2016
146,147,Alexandria University,2011
2409,601-800,Alexandru Ioan Cuza University,2016



## 2. For each university, compute the improvement in income between the least recent and the most recent data points

Nota: la colonna "income" contiene valori "-", per cui procedo prima alla pulizia e poi la converto in float.

In [16]:
def clean_column_income(dataset):
    '''
    Keep only the rows with a usable income value and convert it to float.

    @dataset : dataframe with at least the columns 'university_name',
               'income' and 'year'; it is not modified

    Returns a new dataframe restricted to those three columns, with
    'income' stored as float. Rows whose income is the '-' placeholder or
    is missing are dropped.
    Raises ValueError when an income is neither numeric, missing nor '-'.
    '''
    raw_income = dataset['income']
    # Blank out only the exact '-' placeholder: a substring test would also
    # throw away legitimate values that merely contain a dash (negatives).
    income = pd.to_numeric(raw_income.where(raw_income != '-'))
    has_income = income.notna()

    cleaned = dataset.loc[has_income, ['university_name', 'income', 'year']].copy()
    cleaned['income'] = income[has_income].to_numpy(dtype=float)
    return cleaned
            

In [17]:
times_least_recent_data_clean = clean_column_income(times_least_recent_data)

In [18]:
times_most_recent_data_clean= clean_column_income(times_most_recent_data)

Unisco i due dataset tramite la colonna "university_name".

In [19]:
union_most_and_least_data =pd.merge(times_least_recent_data_clean,times_most_recent_data_clean,on='university_name')

Rinomino le colonne.

In [20]:
union_most_and_least_data.rename(columns={'income_x':'income_level_least_recent','year_x':'year_least_recent','income_y':'income_level_most_recent','year_y':'year_most_recent'},inplace=True)
union_most_and_least_data

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent
0,Aalborg University,36.4,2012,43.7,2016
1,Aalto University,61.9,2012,61.6,2016
2,Aarhus University,61.5,2011,68.3,2016
3,Aberystwyth University,35.5,2012,31.3,2016
4,Adam Mickiewicz University,28.7,2016,28.7,2016
...,...,...,...,...,...
699,Zhejiang University,70.3,2011,96.2,2016
700,École Normale Supérieure,30.7,2011,37.1,2016
701,École Normale Supérieure de Lyon,26.1,2011,31.7,2016
702,École Polytechnique Fédérale de Lausanne,38.0,2011,65.4,2016


##### Nota: alcune università hanno "year_least_recent" e "year_most_recent" uguali, per cui non ha senso tenerle

In [21]:
union_most_and_least_data[union_most_and_least_data['year_least_recent']==union_most_and_least_data['year_most_recent']].head(100)

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent
4,Adam Mickiewicz University,28.7,2016,28.7,2016
5,Aix-Marseille University,33.1,2016,33.1,2016
6,Ajou University,45.7,2016,45.7,2016
8,Alexandru Ioan Cuza University,28.2,2016,28.2,2016
9,Aligarh Muslim University,29.6,2016,29.6,2016
...,...,...,...,...,...
179,Khon Kaen University,35.7,2016,35.7,2016
181,King Fahd University of Petroleum and Minerals,81.2,2016,81.2,2016
184,Kingston University,28.5,2016,28.5,2016
186,Kinki University,33.2,2016,33.2,2016


Elimino queste righe.

In [22]:
union_most_and_least_data.drop(union_most_and_least_data[union_most_and_least_data['year_least_recent']==union_most_and_least_data['year_most_recent']].index,inplace=True)

Calcolo la differenza.

In [23]:
union_most_and_least_data["diference_income"] = union_most_and_least_data.income_level_most_recent - union_most_and_least_data.income_level_least_recent

Creo una colonna "improvement" che indica UP se la differenza di income è positiva, Down altrimenti.

In [24]:
union_most_and_least_data["improvement"] = np.where(union_most_and_least_data["diference_income"] > 0, "UP", "Down")

In [25]:
union_most_and_least_data.head(5)

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent,diference_income,improvement
0,Aalborg University,36.4,2012,43.7,2016,7.3,UP
1,Aalto University,61.9,2012,61.6,2016,-0.3,Down
2,Aarhus University,61.5,2011,68.3,2016,6.8,UP
3,Aberystwyth University,35.5,2012,31.3,2016,-4.2,Down
7,Alexandria University,36.0,2011,29.7,2016,-6.3,Down


## 3. Find the university with the largest increase computed in the previous point

In [26]:
 union_most_and_least_data[union_most_and_least_data.diference_income ==union_most_and_least_data.diference_income.max()]

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent,diference_income,improvement
373,TU Dresden,31.9,2012,99.7,2016,67.8,UP


## 4. For each ranking, consider only the most recent data point. For each university, compute the maximum difference between the rankings (e.g. for Aarhus University the value is 122-73=49). Notice that some rankings are expressed as a range

#### preprocessing 

Nota: i dataset shanghai_df e times_df hanno alcuni valori espressi come intervallo (range).

In [27]:
shanghai_df[shanghai_df['world_rank'].str.contains('-')==True]['world_rank'].head(3)

100    101-152
101    101-152
102    101-152
Name: world_rank, dtype: object

Nota: times_df contiene alcuni valori con il segno '=' davanti, che indica un pari merito; per cui, se lo tolgo, il valore non perde significato.

In [28]:
times_df[times_df['world_rank'].str.contains('=')==True]['world_rank'].head(3)

1841    =39
1842    =39
1846    =44
Name: world_rank, dtype: object

In [29]:
times_df['world_rank']=times_df['world_rank'].str.replace('=','')

Nota: world_df sembra a posto: non ha valori espressi come intervallo né valori anomali.

In [30]:
world_df['world_rank'].dtype

dtype('int64')

### Dati più recenti per le tabelle 'shanghai_ranking', 'times_ranking' e 'world_ranking'

Shanghai

In [31]:
# Most recent row of every university. idxmax returns index labels, so the
# rows are fetched by label (.loc) rather than by position (.iloc).
shanghai_most_recent_data = shanghai_df.loc[shanghai_df.groupby('university_name')['year'].idxmax()].copy()

In [32]:
shanghai_most_recent_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658 entries, 4696 to 4545
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       658 non-null    object 
 1   university_name  658 non-null    object 
 2   national_rank    658 non-null    object 
 3   total_score      112 non-null    float64
 4   alumni           658 non-null    float64
 5   award            658 non-null    float64
 6   hici             658 non-null    float64
 7   ns               655 non-null    float64
 8   pub              658 non-null    float64
 9   pcp              658 non-null    float64
 10  year             658 non-null    int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 61.7+ KB


Times

In [33]:
# Most recent row of every university. idxmax returns index labels, so the
# rows are fetched by label (.loc) rather than by position (.iloc).
times_most_recent_data = times_df.loc[times_df.groupby('university_name')['year'].idxmax()].copy()

In [34]:
times_most_recent_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 818 entries, 2405 to 2134
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              818 non-null    object 
 1   university_name         818 non-null    object 
 2   country                 818 non-null    object 
 3   teaching                818 non-null    float64
 4   international           818 non-null    object 
 5   research                818 non-null    float64
 6   citations               818 non-null    float64
 7   income                  818 non-null    object 
 8   total_score             818 non-null    object 
 9   num_students            796 non-null    object 
 10  student_staff_ratio     796 non-null    float64
 11  international_students  793 non-null    object 
 12  female_male_ratio       742 non-null    object 
 13  year                    818 non-null    int64  
dtypes: float64(4), int64(1), object(9)
mem

World  

In [35]:
# Most recent row of every university. idxmax returns index labels, so the
# rows are fetched by label (.loc) rather than by position (.iloc).
world_most_recent_data = world_df.loc[world_df.groupby('university_name')['year'].idxmax()].copy()

In [36]:
world_most_recent_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1024 entries, 1981 to 1838
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            1024 non-null   int64  
 1   university_name       1024 non-null   object 
 2   country               1024 non-null   object 
 3   national_rank         1024 non-null   int64  
 4   quality_of_education  1024 non-null   int64  
 5   alumni_employment     1024 non-null   int64  
 6   quality_of_faculty    1024 non-null   int64  
 7   publications          1024 non-null   int64  
 8   influence             1024 non-null   int64  
 9   citations             1024 non-null   int64  
 10  broad_impact          1023 non-null   float64
 11  patents               1024 non-null   int64  
 12  score                 1024 non-null   float64
 13  year                  1024 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 120.0+ KB


### Nota: bisogna verificare se le università hanno lo stesso nome in queste tabelle; per questo faccio un controllo

In [37]:
union_world_times =pd.merge(world_most_recent_data[['world_rank','university_name']],times_most_recent_data[['world_rank','university_name']] ,on='university_name')
union_world_times

Unnamed: 0,world_rank_x,university_name,world_rank_y
0,782,AGH University of Science and Technology,601-800
1,565,Aalborg University,201-250
2,421,Aalto University,251-300
3,122,Aarhus University,106
4,814,Aberystwyth University,301-350
...,...,...,...
584,337,York University,301-350
585,979,Yuan Ze University,601-800
586,191,Zhejiang University,251-300
587,36,École Polytechnique,101


In [38]:
# A rank range such as '101-152': two integers separated by a dash.
_RANK_RANGE = re.compile(r'^(\d+)\s*-\s*(\d+)$')


def findDiff(value1,value2):
    '''
        Return the largest possible difference between two rankings.

        input1 : single rank ('122') or rank range ('101-152'); string,
                 int or None
        input2 : single rank or rank range; string, int or None

        A single rank is treated as a range of width zero, so every
        combination (single/single, single/range, range/range) reduces to
        max(highest1 - lowest2, highest2 - lowest1). This also covers a
        single rank that falls inside the other range and a range nested
        in the other one.

        return  the max difference between input1 and input2;
                0 when both inputs are None;
                None when exactly one input is None, because a single
                ranking has nothing to be compared with
        raises  ValueError when an input is neither a number nor a range
    '''
    if value1 is None and value2 is None:
        return 0
    if value1 is None or value2 is None:
        return None

    lowest1, highest1 = _rank_bounds(value1)
    lowest2, highest2 = _rank_bounds(value2)
    return max(highest1 - lowest2, highest2 - lowest1)


def _rank_bounds(rank):
    '''Parse a rank such as 122, '=39' or '101-152' into (lowest, highest).'''
    # A leading '=' only marks a tie and does not change the rank itself.
    text = str(rank).strip().lstrip('=')
    matched = _RANK_RANGE.match(text)
    if matched:
        first = int(matched.group(1))
        second = int(matched.group(2))
        return min(first, second), max(first, second)
    single = int(text)
    return single, single
    
    

In [39]:
def updatedict(inputdict,key,value):
    '''
    Record value under key, keeping the larger value when the key is
    already present.

    @inputdict : dictionary updated in place
    @key : entry to create or update
    @value : candidate value; it replaces the stored one only when the
             stored one is smaller
    '''
    if key not in inputdict or inputdict[key] < value:
        inputdict[key] = value

In [40]:
mydict = {}

In [41]:
def searchUniversityAndFindDiff(dataset1, dataset2, dataset3):
    '''
    Compare the ranking of every university in dataset1 with the ranking
    of the same university in dataset2 and in dataset3, and record the
    largest difference found in the module-level dictionary mydict.

    @dataset1 : dataframe with 'university_name' and 'world_rank' columns;
                its universities drive the comparison
    @dataset2 : dataframe with the same two columns
    @dataset3 : dataframe with the same two columns

    Returns nothing; results are stored through updatedict(mydict, ...).

    NOTE(review): dataset2 is never compared with dataset3, and
    universities missing from dataset1 are never visited - confirm that
    this is the intended meaning of "maximum difference".
    '''
    for university, world_rank in zip(dataset1['university_name'], dataset1['world_rank']):
        if pd.isna(world_rank):
            continue
        for other in (dataset2, dataset3):
            other_ranks = other.loc[other['university_name'] == university, 'world_rank']
            if other_ranks.empty:
                continue
            # Read the rank itself: .any() is a boolean reduction and is not
            # a reliable way to get the stored value back.
            other_rank = other_ranks.iloc[0]
            if pd.isna(other_rank):
                continue
            result = findDiff(str(world_rank), str(other_rank))
            # A missing result must not prevent the comparison with the
            # remaining dataset.
            if result is not None:
                updatedict(mydict, university, result)
       
    

In [42]:
searchUniversityAndFindDiff(world_most_recent_data, times_most_recent_data, shanghai_most_recent_data)

In [43]:
mydict

{'Aalborg University': 364,
 'Aalto University': 170,
 'Aarhus University': 49,
 'Aberystwyth University': 513,
 'Aix-Marseille University': 104,
 'Ajou University': 367,
 'Alexandria University': 396,
 'American University of Beirut': 181,
 'Amirkabir University of Technology': 457,
 'Aristotle University of Thessaloniki': 341,
 'Arizona State University': 92,
 'Aston University': 319,
 'Australian National University': 127,
 'Autonomous University of Barcelona': 95,
 'Bangor University': 267,
 'Bar-Ilan University': 120,
 'Beihang University': 404,
 'Beijing Normal University': 195,
 'Ben-Gurion University of the Negev': 251,
 'Bielefeld University': 213,
 'Bilkent University': 491,
 'Birkbeck, University of London': 497,
 'Blaise Pascal University': 268,
 'Boston College': 174,
 'Boston University': 7,
 'Boğaziçi University': 336,
 'Brandeis University': 89,
 'Brigham Young University': 102,
 'Brown University': 32,
 'Budapest University of Technology and Economics': 218,
 'Californ