In [97]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [98]:
df = pd.read_csv("data/nces330_20.csv")
df

Unnamed: 0,Year,State,Type,Length,Expense,Value
0,2013,Alabama,Private,4-year,Fees/Tuition,13983
1,2013,Alabama,Private,4-year,Room/Board,8503
2,2013,Alabama,Public In-State,2-year,Fees/Tuition,4048
3,2013,Alabama,Public In-State,4-year,Fees/Tuition,8073
4,2013,Alabama,Public In-State,4-year,Room/Board,8473
...,...,...,...,...,...,...
3543,2021,Wyoming,Public In-State,2-year,Fees/Tuition,3987
3544,2021,Wyoming,Public In-State,4-year,Room/Board,9799
3545,2021,Wyoming,Public Out-of-State,2-year,Fees/Tuition,9820
3546,2021,Wyoming,Public Out-of-State,4-year,Fees/Tuition,14710


In [99]:
df.describe()

Unnamed: 0,Year,Value
count,3548.0,3548.0
mean,2016.92,13027.72
std,2.55,8734.57
min,2013.0,1225.0
25%,2015.0,7756.75
50%,2017.0,10203.5
75%,2019.0,14830.75
max,2021.0,49152.0


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3548 entries, 0 to 3547
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Year     3548 non-null   int64 
 1   State    3548 non-null   object
 2   Type     3548 non-null   object
 3   Length   3548 non-null   object
 4   Expense  3548 non-null   object
 5   Value    3548 non-null   int64 
dtypes: int64(2), object(4)
memory usage: 166.4+ KB


In [101]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
df["State"] = labelencoder.fit_transform(df.State)
df["Type"] = labelencoder.fit_transform(df.Type)
df["Length"] = labelencoder.fit_transform(df.Length)
df["Expense"] = labelencoder.fit_transform(df.Expense)
df

Unnamed: 0,Year,State,Type,Length,Expense,Value
0,2013,0,0,1,0,13983
1,2013,0,0,1,1,8503
2,2013,0,1,0,0,4048
3,2013,0,1,1,0,8073
4,2013,0,1,1,1,8473
...,...,...,...,...,...,...
3543,2021,50,1,0,0,3987
3544,2021,50,1,1,1,9799
3545,2021,50,2,0,0,9820
3546,2021,50,2,1,0,14710


In [102]:
copy_df = df

In [103]:
y = df.Value
df = df.drop(labels="Value", axis=1)
X = df

In [104]:
from sklearn.preprocessing import MinMaxScaler

In [105]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X

array([[0. , 0. , 0. , 1. , 0. ],
       [0. , 0. , 0. , 1. , 1. ],
       [0. , 0. , 0.5, 0. , 0. ],
       ...,
       [1. , 1. , 1. , 0. , 0. ],
       [1. , 1. , 1. , 1. , 0. ],
       [1. , 1. , 1. , 1. , 1. ]])

In [106]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y.values, test_size=0.2, random_state=42)

In [107]:
clf = CatBoostRegressor(loss_function='MAPE')
train_dataset = Pool(data=X_train,
                     label=y_train,
                     )
eval_dataset = Pool(data=X_test,
                    label=y_test,
                    )
clf.fit(train_dataset,
          use_best_model=True,
          verbose = 0,
          eval_set=eval_dataset)

<catboost.core.CatBoostRegressor at 0x230cb6d4b50>

In [108]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_pred-y_true)/y_true))

In [109]:
y_pred = clf.predict(Pool(data=X_test))

print(f"MAPE: {mape(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

MAPE: 0.30927351920087615
MAE: 4422.312399886303
RMSE: 7560.998396331738


In [110]:
n_fold = 8
cv = KFold(n_splits=n_fold, shuffle=True, random_state=42)
mape_, mae, rmse = [], [], []

params = {
            'iterations':1000,
            'learning_rate':0.1,
            'depth':5,
            'eval_metric':'MAPE'
}

y = copy_df.Value
copy_df = copy_df.drop(labels="Value", axis=1)
X = df
X = scaler.fit_transform(X)
X = pd.DataFrame(X)

for fold, (train_index, val_index) in enumerate(cv.split(X)):
    X_train = X.iloc[train_index,:]
    X_val = X.iloc[val_index,:]

    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    clf = CatBoostRegressor(**params)

    train_dataset = Pool(data=X_train,
                     label=y_train,
                     )

    eval_dataset = Pool(data=X_test,
                    label=y_test,
                    )

    clf.fit(train_dataset,
              use_best_model=True,
              verbose = 0,
              eval_set=eval_dataset)

    y_pred = clf.predict(Pool(data=X_test))

    mape_.append(mape(y_test, y_pred))
    mae.append(mean_absolute_error(y_test, y_pred))
    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))

    print(f"fold: {fold}, MAPE: {mape(y_test, y_pred)}")
    print(f"fold: {fold}, MAE: {mean_absolute_error(y_test, y_pred)}")
    print(f"fold: {fold}, RMSE: {np.sqrt(mean_squared_error(y_test, y_pred))}")

print('CV mean MAPE:  {0:.4f}, std: {1:.4f}.'.format(np.mean(mape_), np.std(mape_)))
print('CV mean MAE: {0:.4f}, std: {1:.4f}.'.format(np.mean(mae), np.std(mae)))
print('CV mean RMSE: {0:.4f}, std: {1:.4f}.'.format(np.mean(rmse), np.std(rmse)))

fold: 0, MAPE: 0.07314617559464659
fold: 0, MAE: 755.8044311116089
fold: 0, RMSE: 1173.4791513629434
fold: 1, MAPE: 0.06916747129106728
fold: 1, MAE: 695.944955018769
fold: 1, RMSE: 983.5708160171162
fold: 2, MAPE: 0.06323949931491434
fold: 2, MAE: 641.2114850335753
fold: 2, RMSE: 917.4923732577126
fold: 3, MAPE: 0.0656532572899688
fold: 3, MAE: 668.7772716267349
fold: 3, RMSE: 978.1642304999854
fold: 4, MAPE: 0.06380382268102135
fold: 4, MAE: 647.219385930652
fold: 4, RMSE: 932.2904183646397
fold: 5, MAPE: 0.0629676299626333
fold: 5, MAE: 645.8948969005404
fold: 5, RMSE: 972.200867859984
fold: 6, MAPE: 0.06557494855790183
fold: 6, MAE: 664.1155256494832
fold: 6, RMSE: 952.563253692791
fold: 7, MAPE: 0.06358632918893609
fold: 7, MAE: 651.3089005048217
fold: 7, RMSE: 920.2479470910454
CV mean MAPE:  0.0659, std: 0.0033.
CV mean MAE: 671.2846, std: 35.9455.
CV mean RMSE: 978.7511, std: 77.4638.


The mean absolute percentage error (MAPE), also known as mean absolute percentage deviation (MAPD), is a measure of prediction accuracy of a forecasting method in statistics. It usually expresses the accuracy as a ratio defined by the formula:

![test](images/15cea473-431c-40c8-8e75-73d0f6e15dd1.png))

# Final Results

CV mean MAPE:  0.0659, std: 0.0033.
CV mean MAE: 671.2846, std: 35.9455.
CV mean RMSE: 978.7511, std: 77.4638.

These are very good results, since the MAPE metric was approximately 6.5%.