In [None]:
#Wrangling
import pandas as pd
import numpy as np

#Visualization
import matplotlib.pyplot as plt
import seaborn as sns

#Classification
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# List of Questions we would like to answer
Question 1: Identify universities whose ranking has never worsened from one year to the next (a "non-decreasing" standing). <br>
Question 2: Which factors matter most in determining the rankings? <br>
Question 3: Predict university rankings based on the different factors. <br>

### But first, let's see some basic stats. 

In [None]:
# Load the rankings table from cwurData.csv; the cells below all read from `rank`.
rank = pd.read_csv('../input/cwurData.csv')

In [None]:
# Column names, dtypes and non-null counts of the loaded table.
rank.info()

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the numeric columns.
rank.describe()

In [None]:
# Column names of the loaded table as a plain Python list.
rank.columns.tolist()

In [None]:
#Distinct Years
# Each year present in the table, in order of first appearance.
rank['year'].unique()

In [None]:
#Count of Institution by country in 2015
# Rows per country among the 2015 entries, largest group first.
ins_count = (
    rank.loc[rank['year'] == 2015]
    .groupby('country')
    .size()
    .sort_values(ascending=False)
)

# One bar per country, ordered by the counts computed above.
plt.figure(figsize=(15, 10))
ax = sns.barplot(x=ins_count.index, y=ins_count.values)
ax.set(xlabel='Country', ylabel='Number of Institution')
plt.xticks(rotation=70)  # tilt the country names so they do not overlap
plt.show()

In [None]:
#Number of top 100 institution by country in 2015
# Sort by world_rank before taking the first 100 rows so the selection does not
# depend on the row order of the CSV file.
# NOTE(review): assumes world_rank is numeric with smaller = better — confirm.
top_count = (
    rank[rank['year'] == 2015]
    .sort_values('world_rank')
    .head(100)
    .groupby('country')
    .size()
    .sort_values(ascending=False)
)
plt.figure(figsize=(15, 10))
ax = sns.barplot(x=top_count.index, y=top_count.values)
ax.set(xlabel='Country', ylabel='Number of top 100 Institution')
plt.xticks(rotation=70)  # tilt the country names so they do not overlap
plt.show()

In [None]:
#Percentage of top 100 institution by country in 2015
# Ratio of each country's top-100 count to its total count (a fraction in 0..1).
# The division aligns on country; countries missing from top_count become NaN
# and are dropped. Chained calls replace the in-place mutations so re-running
# the cell always starts from the same inputs.
per_count = (top_count / ins_count).dropna().sort_values(ascending=False)
plt.figure(figsize=(15, 10))
ax = sns.barplot(x=per_count.index, y=per_count.values)
# The plotted value is a ratio, not a count, so the axis label says so.
ax.set(xlabel='Country', ylabel='Fraction of Institutions in top 100')
plt.xticks(rotation=70)  # tilt the country names so they do not overlap
plt.show()

## Question 1: Identify universities whose ranking has never worsened (a "non-decreasing" standing).

In [None]:
#Define a new dataframe
# One row per distinct institution name; this frame is filtered further below.
institution = rank['institution'].unique().tolist()
non_decreasing = pd.DataFrame({'institution': institution})
non_decreasing.head()

In [None]:
#filtering
def non_decreasing_rank(institution, data=None):
    """Tell whether an institution's world rank never got numerically larger over time.

    The institution's rows are ordered by ``year`` and every pair of
    consecutive ``world_rank`` values is compared. As soon as an earlier
    value is smaller than the following one the answer is ``False``.
    (Assuming a smaller ``world_rank`` number means a better position, this
    reads as "the institution never lost ground".)

    Parameters
    ----------
    institution : str
        Value to look up in the ``institution`` column.
    data : pandas.DataFrame, optional
        Table with ``institution``, ``year`` and ``world_rank`` columns.
        Defaults to the notebook-level ``rank`` frame.

    Returns
    -------
    bool
        ``True`` when no year-over-year increase of ``world_rank`` exists
        (trivially ``True`` for an institution with a single row).

    Raises
    ------
    KeyError
        If ``institution`` has no rows in the table.
    """
    if data is None:
        data = rank  # notebook-level rankings table

    # Order explicitly by year instead of trusting the row order of the file;
    # a stable sort keeps the original order for rows that share a year.
    history = data[data['institution'] == institution].sort_values('year', kind='mergesort')
    if history.empty:
        raise KeyError(institution)

    world_rank = history['world_rank'].tolist()
    # Compare each value with its successor; no per-call groupby over the
    # whole table is needed to know how many rows there are.
    return not any(prev < cur for prev, cur in zip(world_rank, world_rank[1:]))

#Moment of truth
# Keep only the institutions for which non_decreasing_rank returned True.
non_decreasing.loc[non_decreasing['institution'].apply(non_decreasing_rank).eq(True)]

## Question 2: Which factors matter most in determining the rankings?

In [None]:
#We are using 2015 figures
# Filter and drop in a single expression so that rank2015 is a fresh frame.
# Dropping in place on a filtered slice of `rank` triggers pandas'
# SettingWithCopyWarning.
rank2015 = rank[rank.year == 2015].drop(
    ['country', 'national_rank', 'year', 'broad_impact'], axis=1
)
rank2015.head()

In [None]:
# Offset used by the following cells to flip factor columns via `y - x`, so
# that a larger transformed value corresponds to a smaller original value.
# It is derived from quality_of_education only and reused for every factor.
y = rank2015['quality_of_education'].max() + 1

In [None]:
# The columns at positions 2..8 of rank2015 are used as the ranking factors.
# NOTE(review): positional slice — depends on the column order left after the
# drop above; confirm it still selects the intended columns if the schema changes.
factor = rank2015.columns[2:9].tolist()
factor

In [None]:
# One figure per factor: flipped factor value (y - x) against the overall score.
for fig_num, column in enumerate(factor):
    z = y - rank2015[column]
    plt.figure(fig_num)
    sns.regplot(x=z, y='score', data=rank2015)

In [None]:
# Flip all factor columns in one vectorised step (y - value), attach the score,
# and show the pairwise correlation matrix.
cor = y - rank2015[factor]
cor['score'] = rank2015.score
cor.corr()

## Question 3: Predict University Rankings based on different factors

In [None]:
# Features: the factor columns, taken from every year in `rank` (not only 2015).
train = rank.loc[:, factor]

# Target: the score column. LabelEncoder maps each distinct score value to an
# integer class id so that classifiers can be fitted on it; `lab_enc` is kept
# to map predicted class ids back to scores later.
score = rank['score']
lab_enc = preprocessing.LabelEncoder()
score_encoded = lab_enc.fit_transform(score)

In [None]:
# CAUTION: train_test_split returns, in this order,
#   (train features, test features, train labels, test labels).
# With the unpacking below that means:
#   x_train -> train features    y_train -> test features
#   x_test  -> train labels      y_test  -> test labels
# The model cells use the names consistently with this (fit(x_train, x_test),
# predict(y_train)), so the results are correct even though the names mislead.
x_train, y_train, x_test, y_test = train_test_split(
    train,
    score_encoded,
    train_size=0.9,
    random_state=0,
)

In [None]:
#Decision Tree
# Seeded so that a fresh "Restart & Run All" reproduces the same tree
# (same seed value as the train/test split).
tree = DecisionTreeClassifier(random_state=0)
# Because of the unpacking order used for the split, `x_test` holds the
# training labels and `y_train` holds the held-out features.
tree.fit(x_train, x_test)
y_pred = tree.predict(y_train)
# Map encoded class ids back to score values: y1 = true, y2 = predicted.
y1 = lab_enc.inverse_transform(y_test)
y2 = lab_enc.inverse_transform(y_pred)

In [None]:
# True (x) vs. predicted (y) scores with a fitted regression line.
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data vectors
# positionally.
sns.regplot(x=y1, y=y2)
# Kept as the last expression so the notebook actually displays the
# correlation matrix instead of discarding it.
np.corrcoef(y1, y2)

It looks like the predicted values are very similar to true values. 

In [None]:
#Percentage of predicted score lies in between ±0.5 of the true score. 
# Element-wise check y1 - 0.5 <= y2 <= y1 + 0.5 over the whole arrays at once.
within_half_point = ((y1 - 0.5) <= y2) & (y2 <= (y1 + 0.5))
fit = int(np.count_nonzero(within_half_point))

print(fit/len(y1))

A decision tree is well suited to classifying discrete variables or binned continuous variables. On the unbinned continuous scores used here it actually does not do too badly, but there must be better models.

In [None]:
#K-Nearest Neighbors
neigh = KNeighborsClassifier()
# Because of the unpacking order used for the split, `x_test` holds the
# training labels and `y_train` holds the held-out features.
neigh.fit(X=x_train, y=x_test)
y_pred = neigh.predict(X=y_train)
# Map encoded class ids back to score values: y2 = predicted, y1 = true.
y2 = lab_enc.inverse_transform(y_pred)
y1 = lab_enc.inverse_transform(y_test)

In [None]:
# True (x) vs. predicted (y) scores with a fitted regression line.
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data vectors
# positionally.
sns.regplot(x=y1, y=y2)
# Last expression, so the correlation matrix is shown as the cell output.
np.corrcoef(y1, y2)

In [None]:
#Percentage of predicted score lies in between ±0.5 of the true score. 
# Element-wise check y1 - 0.5 <= y2 <= y1 + 0.5 over the whole arrays at once.
within_half_point = ((y1 - 0.5) <= y2) & (y2 <= (y1 + 0.5))
fit = int(np.count_nonzero(within_half_point))

print(fit/len(y1))

In [None]:
#Random Forest
# Seeded so that a fresh "Restart & Run All" reproduces the same forest
# (same seed value as the train/test split).
forest = RandomForestClassifier(random_state=0)
# Because of the unpacking order used for the split, `x_test` holds the
# training labels and `y_train` holds the held-out features.
forest.fit(x_train, x_test)
y_pred = forest.predict(y_train)
# Map encoded class ids back to score values: y1 = true, y2 = predicted.
y1 = lab_enc.inverse_transform(y_test)
y2 = lab_enc.inverse_transform(y_pred)

In [None]:
# True (x) vs. predicted (y) scores with a fitted regression line.
# x/y are passed by keyword: seaborn 0.12+ no longer accepts the data vectors
# positionally.
sns.regplot(x=y1, y=y2)
# Last expression, so the correlation matrix is shown as the cell output.
np.corrcoef(y1, y2)

Random Forest is the strongest classifier here

In [None]:
#Percentage of predicted score lies in between ±0.5 of the true score. 
# Element-wise check y1 - 0.5 <= y2 <= y1 + 0.5 over the whole arrays at once.
within_half_point = ((y1 - 0.5) <= y2) & (y2 <= (y1 + 0.5))
fit = int(np.count_nonzero(within_half_point))

print(fit/len(y1))