## Objective - Rank universities based on different features

#### Import modules and prepare dataset.

In [13]:
# Import sklearn tools
from sklearn.svm import SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.manifold import TSNE
from sklearn.model_selection import cross_validate, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Import other libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bisect
from random import choice
import os
from IPython.display import display
from sklearn.metrics import mean_squared_error

# Load the ranking table from the CSV file in the working directory.
dataset = pd.read_csv('shanghai_data.csv')

# Report, per column, the share of rows whose value is missing.
# isnull().mean() gives the fraction of missing entries for every column at once.
missing_fraction = dataset.isnull().mean()
for col, pct_missing in missing_fraction.items():
    print(f"% of missing data in '{col}' - {pct_missing*100:.2f}%")

# Fill missing numeric values with their column mean.
# numeric_only=True is required: the frame also holds string columns
# (e.g. 'world_rank', 'university_name'), and DataFrame.mean() raises a
# TypeError on those in pandas >= 2.0. Non-numeric columns are left untouched,
# so any missing values in them remain missing.
dataset = dataset.fillna(dataset.mean(numeric_only=True))

# Check for repeated values in 'world_rank'. NOTE: world_rank will become the target.
has_duplicate_ranks = dataset.duplicated(subset=['world_rank']).any()
print(f'Duplication in dataset: {has_duplicate_ranks}')    # -> True

# NOTE: Repeated world_rank values are expected and are not dropped: the
#       rankings are re-published every year (see the 'year' column), so the
#       same rank value presumably occurs once per year in the file.

# Separate the table into the feature matrix, the target column, and the names.

# Features: every column from 'total_score' through 'pcp'
# (a .loc label slice includes both end labels).
feature_columns = dataset.loc[:, 'total_score':'pcp']
data = feature_columns.to_numpy()

# Target: the raw 'world_rank' column, still unconverted at this point.
target = dataset['world_rank'].to_numpy()

# University names, kept only for reference.
university_names = dataset['university_name'].to_numpy()

print(f'target.dtype: {target.dtype}')    # -> object

# NOTE: The dtype is object rather than a numeric type because the ranks were
#       read as strings (some entries contain a '-', presumably rank ranges),
#       and pandas stores string columns as arrays of Python objects.

# Positions of the targets that can be converted to float, i.e. the entries
# that do not contain a '-'.
indexes = [index for index, rank in enumerate(target) if '-' not in rank]

# Keep only those entries and convert them to float. The rebinding is done as
# its own statement so it is not hidden inside the print call.
target = target[indexes].astype(float)
print('target: {}'.format(target))

# NOTE(review): only `target` is filtered here; `data` and `university_names`
#               still contain every row. They must be indexed with `indexes`
#               before being paired with `target` - confirm this happens
#               further down.

% of missing data in 'world_rank' - 0.00%
% of missing data in 'university_name' - 0.02%
% of missing data in 'national_rank' - 0.02%
% of missing data in 'total_score' - 77.52%
% of missing data in 'alumni' - 0.02%
% of missing data in 'award' - 0.04%
% of missing data in 'hici' - 0.04%
% of missing data in 'ns' - 0.45%
% of missing data in 'pub' - 0.04%
% of missing data in 'pcp' - 0.04%
% of missing data in 'year' - 0.00%
Duplication in dataset: True
target.dtype: object
target: [  1.   2.   3. ...  98.  99. 100.]
