In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline
# Use seaborn's 'whitegrid' style for the plots drawn in this notebook.
sns.set_style('whitegrid')


Now let us load the data.

In [None]:
# Path to the rankings CSV, relative to the notebook's working directory.
DATA_PATH = '../input/world-university-rankings/cwurData.csv'
df = pd.read_csv(DATA_PATH)


In [None]:
# Preview the first five rows (5 is also the default for head()).
df.head(5)


Looking at the head of our data, we can see that there are several columns, including the ranking, institution name, country, and various parameters on the basis of which one could predict the university ranking!

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()


Wow! Such a relief that there appear to be almost no null values in the columns. That should save us time on data cleaning; we double-check the per-column null counts below.



In [None]:
# Summary statistics, transposed so that each described column is one row.
df.describe().transpose()


In [None]:
# List the column labels of the frame.
df.columns


In [None]:
# Number of missing values in each column (isna is the same method as isnull).
df.isna().sum()


In [None]:
# Count the distinct values in 'broad_impact' (nunique ignores NaN by default).
df['broad_impact'].nunique()


In [None]:
# Correlation heatmap over the numeric columns only. Text columns (e.g.
# 'country') are excluded explicitly: DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it, and select_dtypes
# works the same way on older pandas versions too.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 6))
sns.heatmap(numeric_corr, annot=True)

Taking a close look at the above heatmap, we can conclude that:

World Rank is highly correlated with Broad Impact, Citations, Influence and Publications.

It is also somewhat correlated to Patents, Quality of Faculty, Alumni Employment, Quality of Education etc.

In [None]:
# Covariance matrix of the numeric columns only. Non-numeric columns are
# filtered out first because DataFrame.cov() raises on them in pandas >= 2.0.
df.select_dtypes(include='number').cov()

In [None]:
# Publications score for the first five rows of the table. Slicing the frame
# once and passing column names keeps x and y the same length, instead of
# pairing a 5-element x with the full-length y and relying on index alignment.
sns.barplot(data=df.head(5), x='world_rank', y='publications')

In [None]:
# Distinct values present in the 'country' column.
df['country'].unique()

In [None]:
# Number of rows per country, most frequent first (value_counts' default order).
df['country'].value_counts()

In [None]:
# Remove 'broad_impact' from the working frame. Reassigning (rather than
# inplace=True) together with errors='ignore' keeps this cell re-runnable: a
# second execution is a no-op instead of a KeyError.
df = df.drop(columns='broad_impact', errors='ignore')


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

In [None]:
# Missing values per column at this point in the notebook.
df.isna().sum()

In [None]:
# Replace each 'country' value with an integer code.
encoder = LabelEncoder()
df['country'] = encoder.fit_transform(df['country'])

# Lookup from integer code back to the original country label.
# NOTE(review): the codes follow the encoder's class order and carry no real
# ordering between countries; a linear model will still treat them as numeric.
country_mappings = dict(enumerate(encoder.classes_))

In [None]:
# Separate the predictors from the target column 'world_rank'.
# NOTE(review): X and y are assigned again further down, after more columns
# have been dropped from df, so this pair is overwritten before it is used.
X = df.drop(columns='world_rank')
y = df['world_rank']

In [None]:
# Drop 'institution' and 'year' from the working frame in a single call.
# Reassigning with errors='ignore' (instead of two inplace drops) makes the
# cell safe to re-run: a second execution is a no-op instead of a KeyError.
df = df.drop(columns=['institution', 'year'], errors='ignore')

In [None]:
# Final modelling inputs: every remaining column except the target goes into
# X, and 'world_rank' is the target y.
X = df.drop(columns='world_rank')
y = df['world_rank']

In [None]:
# Standardise every feature column and rebuild a DataFrame with the original
# column labels.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to its statistics; consider fitting it on the
# training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, train_size=0.8, random_state=101)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit an ordinary least-squares regression on the training split. fit()
# returns the estimator itself, so it can be chained onto the constructor.
model = LinearRegression().fit(X_train, y_train)
# Keep the fitted estimator as the cell's displayed output.
model

In [None]:
# Coefficient of determination on the held-out test split.
test_r2 = model.score(X_test, y_test)
print(f"Model R^2: {test_r2}")