In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Investigating the problem

After spending some time researching `how universities are ranked`,  

I found that university rankings are judged on several different factors and viewpoints. **From Wikipedia's [College and university rankings](https://en.wikipedia.org/wiki/College_and_university_rankings)**:  
- `Global research reputation`, `publications`, and the `number of highly cited papers`.  
- `Human Resources & Labor Review`: measure the performance of top 300 universities' graduates, `HRLR` remains to be the `leader in university ranking` with innovative and comprehensive approaches  
- `Nature Index`: tracks the affiliations of `high-quality scientific articles` published in 68 science journals independently chosen by the scientific community as the journals scientists would most like to publish their best research in.   
- `G-factor`: in social network theory terminology, G-factor measures the `centrality of each university's website` in the network of university websites  
- `Regional and national rankings`: carried out in Africa, Asia, Europe, North America, South America and Oceania.  
- [Forbes College rankings](https://en.wikipedia.org/wiki/College_and_university_rankings#:~:text=Forbes%20College%20rankings): `Student satisfaction` constitutes 25% of the score, `Post-graduate success` 32.5% of the score, `Student debt loads` 25% of the score, `Graduation rate` 7.5% of the score, `Academic success` 10% of the score.  

Also, after spending some time looking at the data files, I decided to leave out the following data file:  
- **educational_attainment_supplementary_data.csv**, because it has no useful data and contains **many NaN values** according to the dataset description




In [None]:
# modules we'll use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# read in all our data
# All CSV files live in the same Kaggle dataset folder. Building every path from
# one constant removes the mix of "../input/..." and "/kaggle/input/..." spellings.
DATA_DIR = "/kaggle/input/world-university-rankings"

# Each frame also gets a `.name` attribute; the missing-values report prints it.
# engine='python' is kept from the original call for this one file.
education_expenditure_supplementary_data = pd.read_csv(f"{DATA_DIR}/education_expenditure_supplementary_data.csv", engine='python')
education_expenditure_supplementary_data.name = 'education_expenditure_supplementary_data'
shanghaiData = pd.read_csv(f"{DATA_DIR}/shanghaiData.csv")
shanghaiData.name = 'shanghaiData'
timesData = pd.read_csv(f"{DATA_DIR}/timesData.csv")
timesData.name = 'timesData'
cwurData = pd.read_csv(f"{DATA_DIR}/cwurData.csv")
cwurData.name = 'cwurData'
school_and_country_table = pd.read_csv(f"{DATA_DIR}/school_and_country_table.csv")
school_and_country_table.name = 'school_and_country_table'

# Investigating the Data
**Detailed Missing Values counts**

In [None]:
# Frames covered by the missing-values report.
Data_List=[education_expenditure_supplementary_data,shanghaiData,timesData,cwurData,school_and_country_table]
for x in Data_List:
    # Total number of cells (rows * columns). DataFrame.size replaces
    # np.product(x.shape): np.product was removed in NumPy 2.0.
    total_cells = x.size
    # Number of missing values in each column.
    missing_values_count = x.isnull().sum()
    # ANSI escape codes print the frame name in bold.
    print("Data Frame :",'\033[1m' + x.name + '\033[0m')
    print(missing_values_count.sort_values(ascending=False))
    # Total number of missing values in the frame.
    total_missing = missing_values_count.sum()
    # Share of cells that are missing; an empty frame reports 0 instead of dividing by zero.
    percent_missing = (total_missing/total_cells) * 100 if total_cells else 0.0
    print('Data shape :',x.shape[0],'by', x.shape[1])
    print('% of data that is missing :',"{:.2f}".format(percent_missing),'\n')

# Education Expenditure Supplementary Data
**Detailed review**

In [None]:
# Preview the first five rows of the expenditure table.
education_expenditure_supplementary_data.head(5)

In [None]:
# Summary statistics; include='all' asks for every column, not only the numeric ones.
education_expenditure_supplementary_data.describe(include='all')

For this data frame, I would like to see the ratio of institute types per country, and the relationship between these two and the direct expenditure.  
As for the rest of the columns, they have a lot of missing data (69%+ missing values). In this case, dropping them is more useful than processing them, since there is no evidence that they have an impact on the data frame itself.

**Cleaning the Data**

In [None]:
# One boolean per column: True when the column holds at least one null.
has_null = education_expenditure_supplementary_data.isnull().any()
cols_with_missing = has_null[has_null].index.tolist()
print('dropped Columns :',cols_with_missing)

# Keep only the columns that are fully populated.
education_expenditure_supplementary_data = education_expenditure_supplementary_data.drop(columns=cols_with_missing)

In [None]:
# Preview the first rows of the frame after the column drop.
education_expenditure_supplementary_data.head()

In [None]:
# Trim stray whitespace around the country names before listing them.
education_expenditure_supplementary_data['country'] = education_expenditure_supplementary_data['country'].str.strip()
# Distinct names, sorted alphabetically for a closer look.
countries = np.sort(education_expenditure_supplementary_data['country'].unique())
countries

I couldn't find out how to plot the count of institute types per country; I tried different kinds of plots but they all went wrong.
If you have any suggestions, please leave a comment.

# shanghai Data

In [None]:
# Preview the first ten rows of shanghaiData.
shanghaiData.head(10)

In [None]:
# (rows, columns) of shanghaiData.
shanghaiData.shape

Since shanghaiData doesn't have many NaN values in most of its columns, `replacing these NaNs` with `0` won't mess up the quality of the data (except for the `total_score` column, which will be dropped because it has too much missing data: `almost 70% of the column is missing`).

In [None]:
# Drop the total_score column, which has the most missing values (70%+).
shanghaiData = shanghaiData.drop(columns=['total_score'])
# Replace each remaining NaN with the value that comes directly after it,
# then set any NaN that is still left (no later value to copy) to 0.
# DataFrame.bfill() replaces fillna(method='bfill'), which pandas has deprecated.
shanghaiData = shanghaiData.bfill().fillna(0)

**The reason why I replaced the NaNs with the next value** is that shanghaiData is already ranked, so neighbouring score values in a column are close to each other (I guess, but I'm not completely sure, I only took a look at them xD; for example, 99.8 is pretty close to 100, right?).

In [None]:
# Summary statistics of shanghaiData after the NaN replacement.
shanghaiData.describe()

In [None]:
# Per-column null counts; only the columns that still contain nulls are printed.
missing_val_count_by_column = shanghaiData.isnull().sum()
still_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[still_missing])

Creating a new `Score` column that holds the mean value of the other scoring columns (scaled by 0.1), which will give a better score.

In [None]:
# List the column labels of shanghaiData.
shanghaiData.columns

In [None]:
# Indicator columns that are averaged into the combined score.
shanghaiData_features = ['alumni', 'award', 'hici', 'ns', 'pub', 'pcp']
# Row-wise mean of the indicators, scaled by 0.1.
indicator_total = shanghaiData[shanghaiData_features].sum(axis=1)
shanghaiData["Score"] = (indicator_total / len(shanghaiData_features)) * 0.1
# Round (10 - Score) up to the next whole number.
shanghaiData["Score_rank"] = np.ceil(10 - shanghaiData["Score"])
# NOTE: `award` is rescaled in place after Score has been computed, so
# re-running this cell shrinks it again.
shanghaiData["award"] = shanghaiData["award"] * 0.1
shanghaiData.head(10)

**Plotting Shanghai Data**

In [None]:
def linelivechart(data,tr_1,tr_2):
    """Draw two line+marker traces for the first 10 rows of ``data``.

    Parameters
    ----------
    data : DataFrame
        Must contain ``university_name`` and ``national_rank`` plus the
        columns named by ``tr_1`` and ``tr_2``.
    tr_1 : str
        Column used as the x axis of both traces.
    tr_2 : str
        Column plotted as the first trace (legend entry "Score rank").

    The second trace always plots ``national_rank``. The figure is rendered
    with plotly's offline ``iplot``; nothing is returned.
    """
    # Imported locally so the function does not rely on a later cell's imports.
    import plotly.graph_objs as go
    from plotly.offline import iplot

    # Only the first 10 rows are charted.
    df = data.iloc[:10,:]

    score_trace = go.Scatter(
                        x = df[tr_1],
                        y = df[tr_2],
                        mode = "lines+markers",
                        name = "Score rank",
                        marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                        text= df.university_name)
    # Uses the same x column as the first trace; this used to be hard-coded to
    # world_rank, so the two lines could end up on different x values.
    national_trace = go.Scatter(
                        x = df[tr_1],
                        y = df.national_rank,
                        mode = "lines+markers",
                        name = "National Rank",
                        marker = dict(color = 'rgba(80, 26, 80, 0.8)'),
                        text= df.university_name)
    # Separate name so the `data` parameter is not overwritten by the trace list.
    traces = [score_trace, national_trace]
    layout = dict(title = 'New Score Rank and national rank vs World Rank of Top 10 Universities',
                  xaxis= dict(title= 'World Rank',ticklen= 5,zeroline= True)
                 )
    fig = dict(data = traces, layout = layout)
    iplot(fig)

In [None]:
# Chart Score_rank (and the built-in national_rank trace) against world_rank.
linelivechart(shanghaiData,'world_rank', 'Score_rank')

# Times Data

In [None]:
# Preview the first ten rows of timesData.
timesData.head(10)

In [None]:
# (rows, columns) of timesData.
timesData.shape

**Cleaning the Data**  
Just like the previous one (shanghaiData), this data frame is already ranked, so the `female_male_ratio` column, which has the most `missing` values (`233 out of 2603 by 14`), is already sorted.  
What I mean is: if you take a close look at these missing values, you will find that neighbouring values are close to each other, e.g. the universities ranked 1, 2 and 3 (NaN, 33:67, 33:67), so it is best (in my opinion) to replace the `NaNs` with the next value.
Similarly for `international_students`, `num_students` and `student_staff_ratio`, for the same reason mentioned before.  
**Note**: I know it is the same approach used for shanghaiData, but I couldn't find a better one, maybe because I lack experience, which is true, but I'm doing this to gain experience. **If you have a better approach, please drop a comment**.  
Thank You!

In [None]:
# Replace every NaN in timesData with the value that comes directly after it.
# DataFrame.bfill() replaces fillna(method='bfill'), which pandas has deprecated.
timesData = timesData.bfill()
# Number of missing values left in each column (only non-zero counts are printed).
missing_val_count_by_column = (timesData.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

**Great!** The Data is fully cleaned and ready to be used!  
But.. it requires more processing

In [None]:
# Preview the first ten rows after the back-fill.
timesData.head(10)

**Problem :**  
When trying to split the `female_male_ratio` column, I found that some rows contain `-`; these are just `FAKE NaNs`, so the best thing to do here is to replace them with real NaN values and then get rid of those NaNs by replacing them with the value that comes directly after, just like I did before.

In [None]:
# Turn the '-' placeholders into real NaN values.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
timesData = timesData.replace('-', np.nan)
# Replace each NaN with the value that comes directly after it.
# DataFrame.bfill() replaces fillna(method='bfill'), which pandas has deprecated.
timesData = timesData.bfill()

# Split "female : male" at the first ':' into two text columns
# (0 = female share, 1 = male share).
ratio_parts = timesData["female_male_ratio"].str.strip().str.split(":", n = 1, expand = True)

# Strip the whitespace around each part and store it as an integer so it can be plotted.
timesData["Female_ratio"] = ratio_parts[0].str.strip().astype(int)
timesData["male_ratio"] = ratio_parts[1].str.strip().astype(int)

# The combined text column is no longer needed; drop it without mutating in place.
timesData = timesData.drop(columns=["female_male_ratio"])
timesData.head()

**Another small problem:**  
We must get rid of the `%` in the `international_students` column and convert it to an int dtype.

In [None]:
# Keep the text before the first '%', trim it, and store the column as integers.
percent_text = timesData["international_students"].str.strip()
percent_text = percent_text.str.split("%", n = 1, expand = True)[0].str.strip()
timesData["international_students"] = percent_text.astype(int)

In [None]:
# Mark the three percentage columns in their labels (old label -> new label).
percent_labels = {
    'international_students': 'international_students % ',
    'Female_ratio': 'Female_ratio % ',
    'male_ratio': 'male_ratio % ',
}
timesData = timesData.rename(columns=percent_labels)
timesData.head()

In [None]:
# Print the dtype of every timesData column to verify the conversions so far.
dataTypeSeries = timesData.dtypes
print('Data type of each column of timesData Dataframe :', dataTypeSeries, sep='\n')

`international`, `income`, `total_score` and `num_students` must be int or float too.


In [None]:
# Score columns that were read as text and need to be numeric.
float_columns = ["international", "income", "total_score"]
timesData = timesData.astype(dict.fromkeys(float_columns, float))

The `num_students` column is a special case: its values contain `','` characters, so it needs its own conversion step.

In [None]:
# num_students is a head count, so the comma is a thousands separator
# ("20,152" means 20152 students). Remove it instead of turning it into a
# decimal point, which used to shrink the value to 20.152.
timesData["num_students"] = pd.to_numeric(
    timesData["num_students"].str.replace(',', '', regex=False)
)

In [None]:
# Print the dtypes once more, now that the remaining columns have been converted.
dataTypeSeries = timesData.dtypes
print('Data type of each column of timesData Dataframe :', dataTypeSeries, sep='\n')

In [None]:
# Preview the first rows of the converted frame.
timesData.head()

**DONE!**  
Time for plotting

**Plotting TimesData**

In [None]:
def linelivecharttimesData(data,tr_1,tr_2,tr_3):
    """Draw three line+marker traces for the first 20 rows of ``data``.

    Parameters
    ----------
    data : DataFrame
        Must contain ``university_name`` and ``research`` plus the columns
        named by ``tr_1``, ``tr_2`` and ``tr_3``.
    tr_1 : str
        Column used as the x axis of all three traces.
    tr_2 : str
        Column plotted as the first trace (legend entry "teaching").
    tr_3 : str
        Column plotted as the third trace (legend entry "income").

    The middle trace always plots the ``research`` column. The figure is
    rendered with plotly's offline ``iplot``; nothing is returned.
    """
    # Imported locally so the function does not rely on a later cell's imports.
    import plotly.graph_objs as go
    from plotly.offline import iplot

    # Only the first 20 rows are charted.
    df = data.iloc[:20,:]

    teaching_trace = go.Scatter(
                        x = df[tr_1],
                        y = df[tr_2],
                        mode = "lines+markers",
                        name = "teaching",
                        marker = dict(color = 'rgba(16, 112, 2, 0.8)'),
                        text= df.university_name)
    # Uses the same x column as the other traces; this used to be hard-coded
    # to world_rank.
    research_trace = go.Scatter(
                        x = df[tr_1],
                        y = df.research,
                        mode = "lines+markers",
                        name = "research",
                        marker = dict(color = 'rgba(80, 26, 80, 0.8)'),
                        text= df.university_name)
    income_trace = go.Scatter(
                        x = df[tr_1],
                        y = df[tr_3],
                        mode = "lines+markers",
                        name = "income",
                        marker = dict(color = 'rgba(36,120,153,.4)'),
                        text= df.university_name)
    # Separate name so the `data` parameter is not overwritten by the trace list.
    traces = [teaching_trace, research_trace, income_trace]
    layout = dict(title = 'teaching, research and income vs world_rank of Top 20 Universities',
                  xaxis= dict(title= 'World Rank',ticklen= 5,zeroline= True)
                 )
    fig = dict(data = traces, layout = layout)
    iplot(fig)

In [None]:
# Three-column call: x = world_rank, plotted columns = teaching and income.
linelivecharttimesData(timesData,'world_rank', 'teaching','income')

In [None]:
def linelivecharttimesData(data,tr_1,tr_2,tr_3,tr_4=None):
    """Draw three line+marker traces for the first 20 rows of ``data``.

    This name is defined twice in the notebook and the later definition wins,
    so a three-column call would raise ``TypeError`` if its cell were run again
    after this one. ``tr_4`` is therefore optional:

    * ``tr_4`` given   -> traces "teaching" (``tr_2``), "citations"
      (``tr_3``) and "research" (``tr_4``).
    * ``tr_4`` omitted -> traces "teaching" (``tr_2``), "research" (the
      ``research`` column) and "income" (``tr_3``), i.e. the chart drawn by
      the three-column definition.

    Parameters
    ----------
    data : DataFrame
        Must contain ``university_name`` and every plotted column.
    tr_1 : str
        Column used as the x axis of all traces.
    tr_2, tr_3 : str
        Columns to plot, as described above.
    tr_4 : str, optional
        Third plotted column; selects the four-column chart when given.

    The figure is rendered with plotly's offline ``iplot``; nothing is returned.
    """
    # Imported locally so the function does not rely on a later cell's imports.
    import plotly.graph_objs as go
    from plotly.offline import iplot

    # Only the first 20 rows are charted.
    df = data.iloc[:20,:]

    # One line colour per trace position.
    colors = ['rgba(16, 112, 2, 0.8)', 'rgba(80, 26, 80, 0.8)', 'rgba(36,120,153,.4)']

    if tr_4 is None:
        # (legend label, column) pairs of the three-column chart.
        series = [("teaching", tr_2), ("research", "research"), ("income", tr_3)]
        title = 'teaching, research and income vs world_rank of Top 20 Universities'
    else:
        series = [("teaching", tr_2), ("citations", tr_3), ("research", tr_4)]
        title = 'teaching ,citations and research vs world_rank of Top 20 Universities'

    # Every trace shares the x column; the middle trace used to hard-code world_rank.
    traces = [
        go.Scatter(
            x = df[tr_1],
            y = df[column],
            mode = "lines+markers",
            name = label,
            marker = dict(color = color),
            text= df.university_name)
        for (label, column), color in zip(series, colors)
    ]
    layout = dict(title = title,
                  xaxis= dict(title= 'World Rank',ticklen= 5,zeroline= True)
                 )
    fig = dict(data = traces, layout = layout)
    iplot(fig)

In [None]:
# Four-column call: x = world_rank, plotted columns = teaching, citations and research.
linelivecharttimesData(timesData,'world_rank', 'teaching','citations','research')

**Count of universities per country**  
Displaying the top 10 countries with the most ranked universities  
(for example: the rank 1 university, Harvard University, is in the USA ===> display the number of universities in the USA)  
**Special thanks to** Mert Altunay [Notebooks Expert](https://www.kaggle.com/nihalbey)

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot

# Ten most frequent values of the country column, computed once and reused for both axes.
# NOTE(review): this counts rows, not distinct universities; timesData spans
# several years, so presumably a university is counted once per year. Confirm.
top_countries = timesData["country"].value_counts().head(10)
index = top_countries.index
value = top_countries.values
trace1 = go.Bar(x=index, y=value, marker={"color":"rgba(131,26,93,0.4)"})
data4 = [trace1]

iplot(data4)

**Universities' rank evolution per year**

In [None]:
# Distinct values of the year column.
timesData.year.unique()

In [None]:
# First 100 rows of each ranking year, 2011 through 2016.
data2011, data2012, data2013, data2014, data2015, data2016 = (
    timesData[timesData.year == ranking_year].iloc[:100, :]
    for ranking_year in range(2011, 2017)
)

In [None]:
def _citation_trace(frame, label, color):
    """Marker-only scatter of citations against world_rank for one year's rows."""
    return go.Scatter(x = frame.world_rank, y = frame.citations, mode = "markers",
                      name = label, marker = dict(color = color),
                      text= frame.university_name)

# One trace per ranking year; the legend label is the year itself.
trace_2011 = _citation_trace(data2011, "2011", 'rgba(255, 128, 255, 0.8)')
trace_2012 = _citation_trace(data2012, "2012", 'rgba(16, 112, 2, 0.8)')
trace_2013 = _citation_trace(data2013, "2013", 'rgba(80, 26, 80, 0.8)')
# The 2014 legend label used to be the typo "201".
trace_2014 = _citation_trace(data2014, "2014", 'rgba(36,120,153,.4)')
trace_2015 = _citation_trace(data2015, "2015", 'rgba(255, 128, 2, 0.8)')
trace_2016 = _citation_trace(data2016, "2016", 'rgba(0, 255, 200, 0.8)')

In [None]:
# Traces in chronological order.
data = [trace_2011, trace_2012, trace_2013, trace_2014, trace_2015, trace_2016]

# Both axes share the same tick length and hide the zero line.
axis_style = {"ticklen": 5, "zeroline": False}
layout = {
    "title": 'Citation vs world rank of top 100 universities from 2011 to 2016',
    "xaxis": {"title": 'World Rank', **axis_style},
    "yaxis": {"title": 'Citation', **axis_style},
}
figure2 = {"data": data, "layout": layout}
iplot(figure2)

# cwurData

In [None]:
# Preview the first ten rows of cwurData.
cwurData.head(10)

In [None]:
# (rows, columns) of cwurData.
cwurData.shape

In [None]:
# Print the dtype of every cwurData column.
# The comment and the printed label used to say "timesData" (copy-paste slip).
dataTypeSeries = cwurData.dtypes
print('Data type of each column of cwurData Dataframe :')
print(dataTypeSeries)

**dropping the broad_impact column** with the most missing values


In [None]:
# Remove the broad_impact column from the frame.
cwurData = cwurData.drop(columns=['broad_impact'])

**Creating a new feature** based on the Forbes College rankings (**check the description at the beginning of the notebook**), but with some modifications (my own perspective, just a point of view as a student) to adapt it to the data frame.  
The new feature **(Estimated_Score) formula**: `score - (rank for quality of education * 35% + rank for alumni employment * 45% + rank for influence * 20%)`

In [None]:
# Weighted sum of three rank columns (35% / 45% / 20%), subtracted from the score.
weighted_ranks = (
    cwurData["quality_of_education"] * 0.35
    + cwurData["alumni_employment"] * 0.45
    + cwurData["influence"] * 0.20
)
cwurData["Estimated_Score"] = cwurData["score"] - weighted_ranks

In [None]:
# Preview the first rows, including the new Estimated_Score column.
cwurData.head()

In [None]:
# Distinct values of the year column.
cwurData.year.unique()

In [None]:
# Three side-by-side point plots for the first three rows of cwurData:
# world_rank, score and Estimated_Score per institution.
fig, axs = plt.subplots(ncols=3 , figsize=(30,12))
top3 = cwurData.head(3)
# `fontsize` is not a documented pointplot parameter (depending on the seaborn
# version it is ignored or rejected), so it is no longer passed.
for ax, column in zip(axs, ['world_rank', 'score', 'Estimated_Score']):
    sns.pointplot(data=top3, x='institution', y=column, ax=ax)

**Wordcloud**

In [None]:
from wordcloud import WordCloud

# Render the country column as one block of text and feed it to the word cloud.
text = cwurData['country'].to_string()
dataframe = text
wordcloud = WordCloud().generate(text)

# Show the generated image on a wide figure without axes.
f, ax = plt.subplots(1, 1, figsize=(25, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

# Prediction: predicting the rank range of a country's universities, on **cwurData** 

Inspired by [Trupti Mamlekar](https://www.kaggle.com/truptimamlekar); link to the original [notebook](https://www.kaggle.com/truptimamlekar/worlduniversityranking)

In [None]:
# Add a placeholder `chances` column (all 0) as the last column.
# The position is taken from the frame instead of the hard-coded 14, and the
# column is only added when absent: the old call passed allow_duplicates=True,
# so re-running the cell created a second `chances` column.
if "chances" not in cwurData.columns:
    cwurData.insert(len(cwurData.columns), "chances", 0)
cwurData.head(1)

In [None]:
# Label each row: '1' when score is above 20, otherwise '0'.
# A single vectorised assignment also covers score == 20, which the two strict
# comparisons left at the integer placeholder 0 (mixing int and str labels),
# and it no longer writes strings into an integer column.
cwurData['chances'] = np.where(cwurData['score'] > 20, '1', '0')

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the distinct country values, then replace each one by its integer code.
le = LabelEncoder()
le.fit(cwurData['country'])
cwurData['country'] = le.transform(cwurData['country'])

In [None]:
from sklearn.model_selection import train_test_split

# Predictor columns and target label.
cwurData_features = ['alumni_employment', 'publications', 'citations', 'national_rank',
                     'quality_of_education']
X = cwurData[cwurData_features]
y = cwurData["chances"]

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.2, random_state=7)

In [None]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report, confusion_matrix

# Fixed seed (same value as the train/test split) so a re-run reproduces the
# same forest and the same reported metrics.
ran_class=RandomForestClassifier(random_state=7)
ran_class.fit(train_X,train_y)
ran_predict=ran_class.predict(val_X)
print(classification_report(val_y,ran_predict))
# Accuracy on the validation split, printed as a percentage.
accuracy=ran_class.score(val_X,val_y)
print(accuracy*100,'%')
cm = confusion_matrix(val_y, ran_predict)
# fmt='d' prints the cell counts as plain integers instead of scientific notation.
sns.heatmap(cm, annot= True, fmt='d')

In [None]:
# Accuracy on the training and validation splits.
train_score = ran_class.score(train_X,train_y)
test_score = ran_class.score(val_X,val_y)
print(f'Training Accuracy of our model is: {train_score}')
print(f'Test Accuracy of our model is: {test_score}')
print()
# Spot check: predict the `chances` label for the training row at position 15.
# iloc[[15]] keeps a one-row DataFrame, so the model receives the same feature
# names it was fitted with; the old .values.reshape(1,-1) passed a bare array,
# which makes scikit-learn warn about missing feature names.
sample_row = train_X.iloc[[15]]
prediction = ran_class.predict(sample_row)
actual_value = train_y.iloc[15]
print(f'Predicted Value \t: {prediction[0]}')
print(f'Actual Value\t\t: {actual_value}')