In [1]:
import pandas as pd
import re
import math
import numpy as np

## Preprocessing

In [2]:
shanghaiDataset = "https://raw.githubusercontent.com/riiduan/ProgettoFondamentiDiInformatica/main/data/shanghai.csv"
timesDataset = "https://raw.githubusercontent.com/riiduan/ProgettoFondamentiDiInformatica/main/data/times.csv"
worldDataset="https://raw.githubusercontent.com/riiduan/ProgettoFondamentiDiInformatica/main/data/world.csv"
educational_attainment_supplementary_data ="https://raw.githubusercontent.com/riiduan/ProgettoFondamentiDiInformatica/main/data/educational_attainment_supplementary_data.csv"

In [3]:
def readDataSet(url , sep =',' , encoding='utf8',header='infer'):
    return pd.read_csv(url,sep=sep ,header=header,encoding=encoding)

In [118]:
# read datasets
shanghai_df=readDataSet(shanghaiDataset)
times_df=readDataSet(timesDataset)
world_df=readDataSet(worldDataset)


In [5]:
#drop the none row from shanghai datframe

index =shanghai_df[shanghai_df['university_name'].isnull()==True]
shanghai_df.drop(index.index,inplace = True)
shanghai_df.reset_index(drop=True, inplace=True)

In [6]:
# rename institution column in university_name

world_df.rename(columns={'institution':'university_name'},inplace=True)

In [7]:
shanghai_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4896 entries, 0 to 4895
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       4896 non-null   int64  
 1   world_rank       4896 non-null   object 
 2   university_name  4896 non-null   object 
 3   national_rank    4896 non-null   object 
 4   total_score      1101 non-null   float64
 5   alumni           4896 non-null   float64
 6   award            4895 non-null   float64
 7   hici             4895 non-null   float64
 8   ns               4875 non-null   float64
 9   pub              4895 non-null   float64
 10  pcp              4895 non-null   float64
 11  year             4896 non-null   int64  
dtypes: float64(7), int64(2), object(3)
memory usage: 459.1+ KB


In [8]:
times_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              2603 non-null   int64  
 1   world_rank              2603 non-null   object 
 2   university_name         2603 non-null   object 
 3   country                 2603 non-null   object 
 4   teaching                2603 non-null   float64
 5   international           2603 non-null   object 
 6   research                2603 non-null   float64
 7   citations               2603 non-null   float64
 8   income                  2603 non-null   object 
 9   total_score             2603 non-null   object 
 10  num_students            2544 non-null   object 
 11  student_staff_ratio     2544 non-null   float64
 12  international_students  2536 non-null   object 
 13  female_male_ratio       2370 non-null   object 
 14  year                    2603 non-null   

In [9]:
world_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            2200 non-null   int64  
 1   world_rank            2200 non-null   int64  
 2   university_name       2200 non-null   object 
 3   country               2200 non-null   object 
 4   national_rank         2200 non-null   int64  
 5   quality_of_education  2200 non-null   int64  
 6   alumni_employment     2200 non-null   int64  
 7   quality_of_faculty    2200 non-null   int64  
 8   publications          2200 non-null   int64  
 9   influence             2200 non-null   int64  
 10  citations             2200 non-null   int64  
 11  broad_impact          2000 non-null   float64
 12  patents               2200 non-null   int64  
 13  score                 2200 non-null   float64
 14  year                  2200 non-null   int64  
dtypes: float64(2), int64(

## 1. For each university, extract from the times dataset the most recent and the least recent data, obtaining two separate dataframes

In [10]:
def extract_dataset_min_max(university_name, year, index ):
    '''
    find most recent or least recent of data
    
    @university_name  : column university name 
    @year : column year
    @index : max or min
    
    '''
    dataset = pd.DataFrame()
    if(index == 'max'):
          dataset = times_df.iloc[times_df.groupby(university_name)[year].idxmax()]
            
    else: dataset = times_df.iloc[times_df.groupby(university_name)[year].idxmin()]
    return dataset

In [11]:
times_most_recent_data = extract_dataset_min_max('university_name','year','max')

In [12]:
times_least_recent_data = extract_dataset_min_max('university_name','year','min')

In [13]:
times_most_recent_data.to_csv("times_most_recent_data")
times_least_recent_data.to_csv("times_least_recent_data")

In [14]:
times_most_recent_data[['world_rank','university_name','year']].head(10)

Unnamed: 0,world_rank,university_name,year
2405,601-800,AGH University of Science and Technology,2016
2003,201-250,Aalborg University,2016
2056,251-300,Aalto University,2016
1908,=106,Aarhus University,2016
2105,301-350,Aberystwyth University,2016
2404,601-800,Adam Mickiewicz University,2016
2057,251-300,Aix-Marseille University,2016
2406,601-800,Ajou University,2016
2408,601-800,Alexandria University,2016
2409,601-800,Alexandru Ioan Cuza University,2016


In [15]:
times_least_recent_data[['world_rank','university_name','year']].head(10)

Unnamed: 0,world_rank,university_name,year
2405,601-800,AGH University of Science and Technology,2016
501,301-350,Aalborg University,2012
502,301-350,Aalto University,2012
166,167,Aarhus University,2011
476,276-300,Aberystwyth University,2012
2404,601-800,Adam Mickiewicz University,2016
2057,251-300,Aix-Marseille University,2016
2406,601-800,Ajou University,2016
146,147,Alexandria University,2011
2409,601-800,Alexandru Ioan Cuza University,2016



## 2. For each university, compute the improvement in income between the least recent and the most recent data points

nota : la colona "income" contiene valori "-" , percui prosego prima alla pulizia e poi lo converto in float

In [16]:
def clean_column_income(dataset):
    dataset=dataset[dataset.income.str.contains('-') ==False][['university_name','income','year']].copy()
    dataset.income=dataset.income.astype(float)
    return dataset
            

In [17]:
times_least_recent_data_clean = clean_column_income(times_least_recent_data)

In [18]:
times_most_recent_data_clean= clean_column_income(times_most_recent_data)

Unisco i due dataset tramite la colona "university_name"

In [19]:
union_most_and_least_data =pd.merge(times_least_recent_data_clean,times_most_recent_data_clean,on='university_name')

rinomino le colone 

In [20]:
union_most_and_least_data.rename(columns={'income_x':'income_level_least_recent','year_x':'year_least_recent','income_y':'income_level_most_recent','year_y':'year_most_recent'},inplace=True)
union_most_and_least_data

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent
0,Aalborg University,36.4,2012,43.7,2016
1,Aalto University,61.9,2012,61.6,2016
2,Aarhus University,61.5,2011,68.3,2016
3,Aberystwyth University,35.5,2012,31.3,2016
4,Adam Mickiewicz University,28.7,2016,28.7,2016
...,...,...,...,...,...
699,Zhejiang University,70.3,2011,96.2,2016
700,École Normale Supérieure,30.7,2011,37.1,2016
701,École Normale Supérieure de Lyon,26.1,2011,31.7,2016
702,École Polytechnique Fédérale de Lausanne,38.0,2011,65.4,2016


##### nota : alcuni universita hanno come "year_most_recent" e "year_most_recent" uguale percui non ha senso tenerli

In [21]:
union_most_and_least_data[union_most_and_least_data['year_least_recent']==union_most_and_least_data['year_most_recent']].head(100)

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent
4,Adam Mickiewicz University,28.7,2016,28.7,2016
5,Aix-Marseille University,33.1,2016,33.1,2016
6,Ajou University,45.7,2016,45.7,2016
8,Alexandru Ioan Cuza University,28.2,2016,28.2,2016
9,Aligarh Muslim University,29.6,2016,29.6,2016
...,...,...,...,...,...
179,Khon Kaen University,35.7,2016,35.7,2016
181,King Fahd University of Petroleum and Minerals,81.2,2016,81.2,2016
184,Kingston University,28.5,2016,28.5,2016
186,Kinki University,33.2,2016,33.2,2016


elimino queste colone

In [22]:
union_most_and_least_data.drop(union_most_and_least_data[union_most_and_least_data['year_least_recent']==union_most_and_least_data['year_most_recent']].index,inplace=True)

calcolo la diferenza

In [23]:
union_most_and_least_data["diference_income"] = union_most_and_least_data.income_level_most_recent - union_most_and_least_data.income_level_least_recent

creo una colona "improvement" che mi indica UP o Down in base se la income è positiva o negativa

In [24]:
union_most_and_least_data["improvement"] = np.where(union_most_and_least_data["diference_income"] >= 0, "UP", "Down")

In [25]:
union_most_and_least_data.head(5)

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent,diference_income,improvement
0,Aalborg University,36.4,2012,43.7,2016,7.3,UP
1,Aalto University,61.9,2012,61.6,2016,-0.3,Down
2,Aarhus University,61.5,2011,68.3,2016,6.8,UP
3,Aberystwyth University,35.5,2012,31.3,2016,-4.2,Down
7,Alexandria University,36.0,2011,29.7,2016,-6.3,Down


## 3. Find the university with the largest increase computed in the previous point

In [26]:
 union_most_and_least_data[union_most_and_least_data.diference_income ==union_most_and_least_data.diference_income.max()]

Unnamed: 0,university_name,income_level_least_recent,year_least_recent,income_level_most_recent,year_most_recent,diference_income,improvement
373,TU Dresden,31.9,2012,99.7,2016,67.8,UP


## 4. For each ranking, consider only the most recent data point. For each university, compute the maximum difference between the rankings (e.g. for Aarhus University the value is 122-73=49). Notice that some rankings are expressed as a range

#### preprocessing 

nota: dataset shanghai_df , times_df ha alcuni valori espressi in range

In [27]:
shanghai_df[shanghai_df['world_rank'].str.contains('-')==True]['world_rank'].head(3)

100    101-152
101    101-152
102    101-152
Name: world_rank, dtype: object

nota : time_df porta con se alcuni valri con segno '=' davanti, che significa valore uguale, percui se lo tolgo non perde significato

In [28]:
times_df[times_df['world_rank'].str.contains('=')==True]['world_rank'].head(3)

1841    =39
1842    =39
1846    =44
Name: world_rank, dtype: object

In [29]:
times_df['world_rank']=times_df['world_rank'].str.replace('=','')

nota: world_df sembra ok , non ha valori in range e valori strani

In [30]:
world_df['world_rank'].dtype

dtype('int64')

 ### Dati più recenti per le tabelle 'shanghai_ranking', 'times_ranking' e 'world_ranking

Shanghai

In [31]:
shanghai_most_recent_data = shanghai_df.iloc[shanghai_df.groupby('university_name')['year'].idxmax()].copy()

In [32]:
shanghai_most_recent_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 658 entries, 4696 to 4545
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       658 non-null    int64  
 1   world_rank       658 non-null    object 
 2   university_name  658 non-null    object 
 3   national_rank    658 non-null    object 
 4   total_score      112 non-null    float64
 5   alumni           658 non-null    float64
 6   award            658 non-null    float64
 7   hici             658 non-null    float64
 8   ns               655 non-null    float64
 9   pub              658 non-null    float64
 10  pcp              658 non-null    float64
 11  year             658 non-null    int64  
dtypes: float64(7), int64(2), object(3)
memory usage: 66.8+ KB


Times

In [33]:
times_most_recent_data = times_df.iloc[times_df.groupby('university_name')['year'].idxmax()].copy()

In [34]:
times_most_recent_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 818 entries, 2405 to 2134
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              818 non-null    int64  
 1   world_rank              818 non-null    object 
 2   university_name         818 non-null    object 
 3   country                 818 non-null    object 
 4   teaching                818 non-null    float64
 5   international           818 non-null    object 
 6   research                818 non-null    float64
 7   citations               818 non-null    float64
 8   income                  818 non-null    object 
 9   total_score             818 non-null    object 
 10  num_students            796 non-null    object 
 11  student_staff_ratio     796 non-null    float64
 12  international_students  793 non-null    object 
 13  female_male_ratio       742 non-null    object 
 14  year                    818 non-null  

World  

In [35]:
world_most_recent_data = world_df.iloc[world_df.groupby('university_name')['year'].idxmax()].copy()

In [36]:
world_most_recent_data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1024 entries, 1981 to 1838
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1024 non-null   int64  
 1   world_rank            1024 non-null   int64  
 2   university_name       1024 non-null   object 
 3   country               1024 non-null   object 
 4   national_rank         1024 non-null   int64  
 5   quality_of_education  1024 non-null   int64  
 6   alumni_employment     1024 non-null   int64  
 7   quality_of_faculty    1024 non-null   int64  
 8   publications          1024 non-null   int64  
 9   influence             1024 non-null   int64  
 10  citations             1024 non-null   int64  
 11  broad_impact          1023 non-null   float64
 12  patents               1024 non-null   int64  
 13  score                 1024 non-null   float64
 14  year                  1024 non-null   int64  
dtypes: float64(2), int

### Creo nuova colona con nomi di universita puliti (spazi,virgole,tutti minuscoli ect) cosi combaciano

In [37]:
def cleanNames(name):
    cleanname =name.lower().replace("'", "").replace(" ", "").replace(",", "").replace("-", "")
    return cleanname

In [38]:
shanghai_most_recent_data['CleanUniNames'] = shanghai_most_recent_data.university_name.apply(cleanNames)

In [39]:
times_most_recent_data['CleanUniNames'] = times_most_recent_data.university_name.apply(cleanNames)


In [40]:
world_most_recent_data['CleanUniNames'] = world_most_recent_data.university_name.apply(cleanNames)

### Prendo solo le colone che mi interessano

In [41]:
shanghai_sh=shanghai_most_recent_data[['CleanUniNames','university_name','world_rank']].rename(columns={"world_rank" : "world_rank_shanghai"})

In [42]:
print(shanghai_sh)

               CleanUniNames           university_name world_rank_shanghai
4696       aalborguniversity        Aalborg University             301-400
4796         aaltouniversity          Aalto University             401-500
4468        aarhusuniversity         Aarhus University                  73
4496  aixmarseilleuniversity  Aix Marseille University             101-150
3115  aixmarseilleuniversity  Aix-Marseille University             102-150
...                      ...                       ...                 ...
3512     yamaguchiuniversity      Yamaguchi University             401-500
4694       yeshivauniversity        Yeshiva University             201-300
4695        yonseiuniversity         Yonsei University             201-300
4395          yorkuniversity           York University             401-500
4545      zhejianguniversity       Zhejiang University             101-150

[658 rows x 3 columns]


In [43]:
times_sh=times_most_recent_data[['CleanUniNames','world_rank']].rename(columns={"world_rank" : "world_rank_times"})

In [44]:
world_sh=world_most_recent_data[['CleanUniNames','world_rank']].rename(columns={"world_rank" : "world_rank_world"})

### Inner join, prendo solo i nomi in comune

In [45]:
JoinTableFirst = shanghai_sh.join(times_sh.set_index('CleanUniNames'), on='CleanUniNames', how='inner')

In [46]:
JoinTable = JoinTableFirst.join(world_sh.set_index('CleanUniNames'), on='CleanUniNames', how='inner')

In [47]:
JoinTable

Unnamed: 0,CleanUniNames,university_name,world_rank_shanghai,world_rank_times,world_rank_world
4696,aalborguniversity,Aalborg University,301-400,201-250,565
4796,aaltouniversity,Aalto University,401-500,251-300,421
4468,aarhusuniversity,Aarhus University,73,106,122
4496,aixmarseilleuniversity,Aix Marseille University,101-150,251-300,206
3115,aixmarseilleuniversity,Aix-Marseille University,102-150,251-300,206
...,...,...,...,...,...
4406,yaleuniversity,Yale University,11,12,11
4694,yeshivauniversity,Yeshiva University,201-300,164,171
4695,yonseiuniversity,Yonsei University,201-300,301-350,98
4395,yorkuniversity,York University,401-500,301-350,337


### divido i range in due colone

In [48]:
def splitRange(columnValue):
      result=columnValue.split('-')
      if len(result) >1 :
                return [int(result[0]),int(result[1])]
      else:
                return [int(result[0]),int(result[0])]

In [49]:
Mylist =[]
def FindDiff(col1,col2,col3):
    
    Mylist.extend(splitRange(col1))
    Mylist.extend(splitRange(col2))
    Mylist.extend(splitRange(str(col3)))
    Mylist.sort()
    min1 = int(Mylist[0])
    max1 =int(Mylist[-1])
    Mylist.clear()
    return max1-min1

In [50]:
JoinTable["MaxDiff"]=JoinTable.apply(lambda x: FindDiff(x.world_rank_shanghai, x.world_rank_times,x.world_rank_world), axis=1 )

In [51]:
FinalTable = JoinTable[['university_name','world_rank_shanghai','world_rank_times', 'world_rank_world', 'MaxDiff']]

In [52]:
FinalTable.head(5)

Unnamed: 0,university_name,world_rank_shanghai,world_rank_times,world_rank_world,MaxDiff
4696,Aalborg University,301-400,201-250,565,364
4796,Aalto University,401-500,251-300,421,249
4468,Aarhus University,73,106,122,49
4496,Aix Marseille University,101-150,251-300,206,199
3115,Aix-Marseille University,102-150,251-300,206,198



## 5. Consider only the most recent data point of the times dataset. Compute the number of male and female students for each country.

In [53]:
times_most_recent_data.num_students

2405    35,569
2003    17,422
2056    16,099
1908    23,895
2105     9,252
         ...  
1856     2,400
2013     2,218
1904     2,429
1833     9,666
2134     8,788
Name: num_students, Length: 818, dtype: object

In [54]:
times_most_recent_data.num_students.hasnans

True

#### Nota: vedo che la colona num_students usa la virgoa come seperatore di mille , percui gli tolgo

In [55]:
times_most_recent_data['num_students']=times_most_recent_data.num_students.str.replace(',','').astype(float)

#### Controllo la collona female_male_ratio

In [56]:
times_most_recent_data.female_male_ratio.hasnans

True

In [57]:
times_most_recent_data.female_male_ratio.values

array(['-', '48 : 52', '32 : 68', '54 : 46', '48 : 52', '71 : 29',
       '61 : 39', '33 : 67', '46 : 54', '63 : 37', '17 : 83', '60 : 40',
       '51 : 49', '48 : 52', '34 : 66', nan, '78 : 22', '36 : 64',
       '52 : 48', '50 : 50', '53 : 47', nan, '47 : 53', '51 : 49',
       '58 : 42', '47 : 53', '52 : 48', '59 : 41', '56 : 44', '67 : 33',
       '55 : 45', '59 : 41', '23 : 77', '46 : 54', nan, '69 : 31',
       '49 : 51', '57 : 43', nan, '48 : 52', '55 : 45', '22 : 78',
       '55 : 45', '54 : 46', '58 : 42', '53 : 47', '50 : 50', '55 : 45',
       '27 : 73', nan, '46 : 54', '32 : 68', '47 : 53', '33 : 67',
       '58 : 42', '70 : 30', '57 : 43', '47 : 53', '43 : 57', '39 : 61',
       '48 : 52', nan, '66 : 34', '59 : 41', '30 : 70', '42 : 58',
       '64 : 36', '67 : 33', '62 : 38', '19 : 81', '37 : 63', '57 : 43',
       '53 : 47', nan, nan, '53 : 47', '45 : 55', nan, '41 : 59',
       '59 : 41', '41 : 59', '45 : 55', nan, '55 : 45', '51 : 49',
       '50 : 50', '46 : 54', '26 

#### Vedo che ha valori  "-" e valori "nan" , percui gli devo ignorare

In [58]:
times_most_recent_data.female_male_ratio

2405          -
2003    48 : 52
2056    32 : 68
1908    54 : 46
2105    48 : 52
         ...   
1856    46 : 54
2013    49 : 51
1904    18 : 82
1833    27 : 73
2134    61 : 39
Name: female_male_ratio, Length: 818, dtype: object

In [59]:
times_most_recent_data['first_condition'] = times_most_recent_data['female_male_ratio'].str.contains('-')==True
times_most_recent_data['second_condition'] = times_most_recent_data['female_male_ratio'].isnull()
times_most_recent_data['third_condition'] = times_most_recent_data['num_students'].isnull()

### Costruisco due colone uno per numeri di maschi e uno per numero di femmine

In [60]:
def CalculateFemale(row):
    if( row["first_condition"] | row["second_condition"] | row["third_condition"]):
        return None
    
    femaleratio = int(row["female_male_ratio"].split(':')[0])
    totalnrStudents = row["num_students"]
    return round((femaleratio/100) *totalnrStudents,0)
    

In [61]:
times_most_recent_data["nr_female"]= times_most_recent_data.apply(CalculateFemale , axis=1)

In [62]:
# Il numero di maschi è la differenza tra numero totale e numero di femmine
times_most_recent_data["nr_male"]=times_most_recent_data.num_students - times_most_recent_data.nr_female

In [63]:
times_most_recent_data[["country","nr_female","nr_male","num_students","female_male_ratio"]]

Unnamed: 0,country,nr_female,nr_male,num_students,female_male_ratio
2405,Poland,,,35569.0,-
2003,Denmark,8363.0,9059.0,17422.0,48 : 52
2056,Finland,5152.0,10947.0,16099.0,32 : 68
1908,Denmark,12903.0,10992.0,23895.0,54 : 46
2105,United Kingdom,4441.0,4811.0,9252.0,48 : 52
...,...,...,...,...,...
1856,France,1104.0,1296.0,2400.0,46 : 54
2013,France,1087.0,1131.0,2218.0,49 : 51
1904,France,437.0,1992.0,2429.0,18 : 82
1833,Switzerland,2610.0,7056.0,9666.0,27 : 73


In [64]:
lista =["nr_female","nr_male","num_students"]

In [65]:
# raggruppo per country e faccio la somma

female_male_df = times_most_recent_data.groupby('country', as_index=False)[lista].sum()

In [66]:
female_male_df.tail(16)

Unnamed: 0,country,nr_female,nr_male,num_students
56,Slovenia,29293.0,19528.0,63352.0
57,South Africa,202378.0,127820.0,330198.0
58,South Korea,132803.0,160880.0,442372.0
59,Spain,447480.0,382851.0,830331.0
60,Sweden,72958.0,59664.0,187293.0
61,Switzerland,58477.0,60499.0,130940.0
62,Taiwan,126448.0,170503.0,308940.0
63,Thailand,111675.0,98447.0,210122.0
64,Turkey,410518.0,228824.0,639342.0
65,Uganda,18670.0,18670.0,37340.0


### Note : Vedo che 2 nomi di universita sono scritti male percui gli elimino

In [67]:
female_male_df.drop([67,71], inplace=True)

## 6. Find the universities where the ratio between female and male is below the average ratio (computed over all universities)

In [68]:
def Calcolate_ratio(row):
    if math.isnan(row["nr_male"]) or math.isnan(row["nr_female"]) or row["nr_male"] == 0:
        return None
    else:
        return round((row["nr_female"]/row["nr_male"])*100)

In [69]:
times_most_recent_data["Ratio"] = times_most_recent_data.apply(Calcolate_ratio,axis=1)

In [70]:
times_most_recent_data.Ratio

2405      NaN
2003     92.0
2056     47.0
1908    117.0
2105     92.0
        ...  
1856     85.0
2013     96.0
1904     22.0
1833     37.0
2134    156.0
Name: Ratio, Length: 818, dtype: float64

In [71]:
avarage_ratio = round(times_most_recent_data.Ratio.mean(),2)

In [72]:
avarage_ratio

108.05

In [73]:
female_male_below_ratio = times_most_recent_data[times_most_recent_data.Ratio < avarage_ratio]

In [74]:
female_male_below_ratio[["university_name","Ratio"]]

Unnamed: 0,university_name,Ratio
2003,Aalborg University,92.0
2056,Aalto University,47.0
2105,Aberystwyth University,92.0
2406,Ajou University,49.0
2408,Alexandria University,85.0
...,...,...
2104,Zhejiang University,69.0
1856,École Normale Supérieure,85.0
2013,École Normale Supérieure de Lyon,96.0
1904,École Polytechnique,22.0


## 7. For each country, compute the fraction of the students in the country that are in one of the universities computed in the previous point (that is, the denominator of the ratio is the total number of students over all universities in the country).

In [75]:
# Number of total student for country i have it from "female_male_df"
female_male_df.head()

Unnamed: 0,country,nr_female,nr_male,num_students
0,Argentina,67191.0,41182.0,108373.0
1,Australia,391736.0,321640.0,743627.0
2,Austria,68364.0,66113.0,134477.0
3,Bangladesh,21323.0,41393.0,62716.0
4,Belarus,20219.0,9084.0,29303.0


In [76]:
list_of_columns=["num_students","country"]
students_below_ratio=female_male_below_ratio.groupby('country', as_index=False)[list_of_columns].sum()

In [77]:
ratio_below_and_total = pd.merge(female_male_df, students_below_ratio, how="inner", on=["country"])
ratio_below_and_total["Ratio"] = round((ratio_below_and_total.num_students_y/ratio_below_and_total.num_students_x)*100,2)

In [78]:
#rename "num_students_x" to "num_students_total"  and "num_students_y" to "num_students_below"
ratio_below_and_total = ratio_below_and_total.rename(columns={'num_students_x': 'num_students_total', 'num_students_y': 'num_students_below'})

In [79]:
ratio_below_and_total.head(10)

Unnamed: 0,country,nr_female,nr_male,num_students_total,num_students_below,Ratio
0,Australia,391736.0,321640.0,743627.0,160839.0,21.63
1,Austria,68364.0,66113.0,134477.0,61033.0,45.39
2,Bangladesh,21323.0,41393.0,62716.0,62716.0,100.0
3,Brazil,247271.0,246980.0,534688.0,375248.0,70.18
4,Canada,344440.0,279189.0,717054.0,86779.0,12.1
5,Chile,37962.0,52989.0,116026.0,90951.0,78.39
6,China,427842.0,587308.0,1285619.0,918214.0,71.42
7,Colombia,26376.0,28269.0,54645.0,54645.0,100.0
8,Czech Republic,97971.0,77900.0,203321.0,54324.0,26.72
9,Denmark,62748.0,56607.0,119355.0,67915.0,56.9


## 8. Read the file educational_attainment_supplementary_data.csv, discarding any row with missing country_name or series_name

In [80]:
educational_attainment_supplementary =readDataSet(educational_attainment_supplementary_data)

In [81]:
index_to_drop=educational_attainment_supplementary[(educational_attainment_supplementary.country_name.isnull() ==True) | (educational_attainment_supplementary.series_name.isnull() ==True) ].index.values

In [82]:
educational_attainment_supplementary.drop(index_to_drop, inplace=True)


## 9. From attainment build a dataframe with the same data, but with 4 columns: country_name, series_name, year, value

In [83]:
value_vars= ['1985', '1986', '1987', '1990', '1991',
       '1992', '1993', '1995', '1996', '1997', '1998', '1999', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013', '2015']
id_vars=['country_name', 'series_name']

In [84]:
unpivot_attainment=pd.melt(educational_attainment_supplementary,id_vars=id_vars , value_vars=value_vars, var_name='year', value_name='value')

In [85]:
unpivot_attainment.head(5)

Unnamed: 0,country_name,series_name,year,value
0,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1985,0.33
1,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1985,1.03
2,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1985,0.83
3,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1985,2.34
4,Afghanistan,"Barro-Lee: Average years of primary schooling,...",1985,0.54


## 10. For each university, find the number of rankings in which they appear (it suffices to appear in one year for each ranking).

### nota : normalizo i nomi di universita utilizando la funzione cleanNames

In [86]:
times_df["uni_name_normalazed"] = times_df.university_name.apply(cleanNames)

In [87]:
shanghai_df["uni_name_normalazed"]= shanghai_df.university_name.apply(cleanNames)

In [88]:
world_df["uni_name_normalazed"] = world_df.university_name.apply(cleanNames)

In [89]:
world_df.uni_name_normalazed

0                        harvarduniversity
1       massachusettsinstituteoftechnology
2                       stanforduniversity
3                    universityofcambridge
4          californiainstituteoftechnology
                       ...                
2195                universityofthealgarve
2196                  alexandriauniversity
2197              federaluniversityofceará
2198                   universityofacoruña
2199         chinapharmaceuticaluniversity
Name: uni_name_normalazed, Length: 2200, dtype: object

In [90]:
### procedo a prendere i nomi di universta per ciascun dataframe

In [91]:
times_df

Unnamed: 0.1,Unnamed: 0,world_rank,university_name,country,teaching,international,research,citations,income,total_score,num_students,student_staff_ratio,international_students,female_male_ratio,year,uni_name_normalazed
0,0,1,Harvard University,United States of America,99.7,72.4,98.7,98.8,34.5,96.1,20152,8.9,25%,,2011,harvarduniversity
1,1,2,California Institute of Technology,United States of America,97.7,54.6,98.0,99.9,83.7,96.0,2243,6.9,27%,33 : 67,2011,californiainstituteoftechnology
2,2,3,Massachusetts Institute of Technology,United States of America,97.8,82.3,91.4,99.9,87.5,95.6,11074,9.0,33%,37 : 63,2011,massachusettsinstituteoftechnology
3,3,4,Stanford University,United States of America,98.3,29.5,98.1,99.2,64.3,94.3,15596,7.8,22%,42 : 58,2011,stanforduniversity
4,4,5,Princeton University,United States of America,90.9,70.3,95.4,99.9,-,94.2,7929,8.4,27%,45 : 55,2011,princetonuniversity
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2598,2598,601-800,Yeungnam University,South Korea,18.6,24.3,10.9,26.5,35.4,-,21958,15.3,3%,48 : 52,2016,yeungnamuniversity
2599,2599,601-800,Yıldız Technical University,Turkey,14.5,14.9,7.6,19.3,44.0,-,31268,28.7,2%,36 : 64,2016,yıldıztechnicaluniversity
2600,2600,601-800,Yokohama City University,Japan,24.0,16.1,10.2,36.4,37.9,-,4122,3.7,3%,,2016,yokohamacityuniversity
2601,2601,601-800,Yokohama National University,Japan,20.1,23.3,16.0,13.5,40.4,-,10117,12.1,8%,28 : 72,2016,yokohamanationaluniversity


In [92]:
times_uni = times_df[["university_name","uni_name_normalazed"]].groupby("uni_name_normalazed").max()

In [93]:
times_uni

Unnamed: 0_level_0,university_name
uni_name_normalazed,Unnamed: 1_level_1
aalborguniversity,Aalborg University
aaltouniversity,Aalto University
aarhusuniversity,Aarhus University
aberystwythuniversity,Aberystwyth University
adammickiewiczuniversity,Adam Mickiewicz University
...,...
écolenormalesupérieure,École Normale Supérieure
écolenormalesupérieuredelyon,École Normale Supérieure de Lyon
écolepolytechnique,École Polytechnique
écolepolytechniquefédéraledelausanne,École Polytechnique Fédérale de Lausanne


In [94]:
shanghai_uni = shanghai_df[["university_name","uni_name_normalazed"]].groupby("uni_name_normalazed").max()

In [95]:
world_uni = world_df[["university_name","uni_name_normalazed"]].groupby("uni_name_normalazed").max()

In [96]:
all_uni = times_uni.append(shanghai_uni).append(world_uni)

In [97]:
all_uni_group=all_uni.groupby("uni_name_normalazed").count()

In [98]:
#i want to take back normal university name so i make a join
all_uni_count_merge = pd.merge(all_uni,all_uni_group, on="uni_name_normalazed")

In [99]:
# i have duplicates so i remove them, take off index, and rename all
all_uni_count=all_uni_count_merge.drop_duplicates().reset_index()\
                .drop("uni_name_normalazed" ,axis=1)\
                .rename(columns={'university_name_x': 'university_name','university_name_y':'count'}) 

In [100]:
all_uni_count.head(5)

Unnamed: 0,university_name,count
0,Aalborg University,3
1,Aalto University,3
2,Aarhus University,3
3,Aberystwyth University,2
4,Adam Mickiewicz University,1


## 11. In the times ranking, compute the number of times each university appears

In [101]:
times_count_uni=times_df.copy()

In [102]:
times_count_uni_pc =times_count_uni[["university_name","uni_name_normalazed"]]\
                    .groupby("university_name" , as_index=False).count() \
                     .rename(columns={'uni_name_normalazed': 'count'}) 

In [103]:
times_count_uni_pc.head(10)

Unnamed: 0,university_name,count
0,AGH University of Science and Technology,1
1,Aalborg University,5
2,Aalto University,5
3,Aarhus University,6
4,Aberystwyth University,5
5,Adam Mickiewicz University,1
6,Aix-Marseille University,1
7,Ajou University,1
8,Alexandria University,3
9,Alexandru Ioan Cuza University,1


## 12. Find the universities that appear at most twice in the times ranking.

In [104]:
uni_most_twice = times_count_uni_pc[times_count_uni_pc['count'] >= 2]

In [105]:
uni_most_twice.head(5)

Unnamed: 0,university_name,count
1,Aalborg University,5
2,Aalto University,5
3,Aarhus University,6
4,Aberystwyth University,5
8,Alexandria University,3


## 13. The universities that, in any year, have the same position in all three rankings (they must have the same position in a year).

In [106]:
shanghai_no_range = shanghai_df[shanghai_df['world_rank'].str.contains('-')==False]

In [107]:
times_no_range = times_df[times_df['world_rank'].str.contains('-')==False]

In [108]:
shanghai_no_range =shanghai_no_range[["uni_name_normalazed","university_name","world_rank","year"]]

In [109]:
shanghai_no_range.world_rank = shanghai_no_range.world_rank.astype('int')

In [110]:
times_no_range=times_no_range[["uni_name_normalazed","university_name","world_rank","year"]]

In [111]:
times_no_range.world_rank=times_no_range.world_rank.astype('int')

In [112]:
worlddf=world_df[world_df["world_rank"].isnull() == False][["uni_name_normalazed","university_name","world_rank","year"]]

In [113]:
first_union_df =pd.merge(shanghai_no_range,times_no_range,on=["uni_name_normalazed","year"])\
                            .drop("university_name_y",axis=1)\
                             .rename(columns={'university_name_x': 'university','world_rank_x':'shanghai_rank','world_rank_y':'times_rank'}) 

In [114]:
first_union_df

Unnamed: 0,uni_name_normalazed,university,shanghai_rank,year,times_rank
0,harvarduniversity,Harvard University,1,2011,1
1,stanforduniversity,Stanford University,2,2011,4
2,universityofcaliforniaberkeley,"University of California, Berkeley",4,2011,8
3,universityofcambridge,University of Cambridge,5,2011,6
4,californiainstituteoftechnology,California Institute of Technology,6,2011,2
...,...,...,...,...,...
316,mcmasteruniversity,McMaster University,96,2015,94
317,universityofbonn,University of Bonn,97,2015,195
318,vuuniversityamsterdam,VU University Amsterdam,98,2015,136
319,michiganstateuniversity,Michigan State University,99,2015,82


In [115]:
same_position = pd.merge(first_union_df,worlddf, on=["uni_name_normalazed","year"] )\
                        .drop(['university_name','uni_name_normalazed'],axis=1)

In [116]:
result = same_position[(same_position['shanghai_rank'] == same_position['times_rank'])\
                              & (same_position['times_rank'] == same_position['world_rank']) ]

In [117]:
result

Unnamed: 0,university,shanghai_rank,year,times_rank,world_rank
51,Stanford University,2,2013,2,2
