In [1]:
import numpy as np
from scipy.stats import norm

from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.neural_network import MLPRegressor as MLPR
from sklearn.linear_model import LinearRegression as LR
from sklearn.neighbors import KNeighborsRegressor as KNR

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

In [2]:
import matplotlib.pyplot as plt

# Initially gathered data

In [3]:
# Load the initial dataset: every column but the last is a feature, the last is the label.
data = np.load("data1.npy")
# NOTE(review): the shuffle is unseeded, so row order (and therefore CV folds) differs per run.
np.random.shuffle(data)
X = data[:, :-1]
y = data[:, -1]
Z = np.hstack([X[:, 0:8], X[:, -6:]]) # Data without nominal features
W = X[:, 8:-6] # Just nominal features
# Shape report goes last: only the final expression of a cell is rendered, so the
# original mid-cell `X.shape, y.shape` was evaluated and silently discarded.
X.shape, y.shape

### Linear Regression

In [5]:
# Linear-regression baseline on X, scored with 7-fold cross-validation.
# The scorer is *negated* MAE, so the printed values are <= 0 and closer to 0 is better.
lr = LR()
cv_results = cross_validate(
    estimator=lr,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04843867 -0.05147465 -0.05338436 -0.0525197  -0.05114465 -0.04779964
 -0.0505979 ]
Average error: -0.050765651981586044


In [124]:
# Same linear baseline, but on Z (first 8 + last 6 columns of X) instead of X.
# Scores are negated MAE per fold: closer to 0 is better.
lr = LR()
cv_results = cross_validate(
    estimator=lr,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05914805 -0.05358079 -0.05948209 -0.05569176 -0.05814874 -0.05645561
 -0.05900646]
Average error: -0.057359069231220104


### MLP

In [6]:
# Three-hidden-layer MLP (100-200-100, logistic activations) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04900858 -0.05103336 -0.05539654 -0.05259797 -0.05219809 -0.04973941
 -0.05053577]
Average error: -0.05150138812338042


In [126]:
# Same MLP architecture, evaluated on Z (first 8 + last 6 columns of X).
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.06052447 -0.05438368 -0.06162408 -0.06239378 -0.06702681 -0.05591894
 -0.06419144]
Average error: -0.06086617238912227


### Random Forest

In [61]:
# Random forest (1000 trees, depth <= 30, sqrt(n_features) candidates per split) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04859459 -0.04791038 -0.04787409 -0.04673771 -0.04911457 -0.04906312
 -0.04611455]
Average error: -0.04791557263249736


In [127]:
# Same random-forest configuration, evaluated on Z (first 8 + last 6 columns of X).
# Scores are negated MAE per fold: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05315613 -0.0488518  -0.05177235 -0.05085102 -0.0507854  -0.04921602
 -0.05711803]
Average error: -0.05167867756999857


### KNN

In [72]:
# k-nearest-neighbours regressor (k = 10) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
knr = KNR(n_neighbors=10)
cv_results = cross_validate(
    estimator=knr,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05656856 -0.0543181  -0.055      -0.05607313 -0.05739305 -0.05685531
 -0.05667216]
Average error: -0.05612575802364258


# Further engineered data

In [370]:
# Load the normalized features and the labels, glue the labels on as a last
# column, and shuffle the rows jointly so each feature row keeps its label.
X, y = np.load("X_normalized.npy"), np.load("y.npy")
Xy = np.concatenate((X, y.reshape(-1, 1)), axis=1)
np.random.shuffle(Xy)
# Split the shuffled matrix back into features and label vector.
X, y = Xy[:, :-1], Xy[:, -1]

### Linear Regression

In [322]:
# Linear regression on the currently loaded X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
lr = LR()
cv_results = cross_validate(
    estimator=lr,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04842674 -0.05042793 -0.05013717 -0.04920444 -0.05263147 -0.05222402
 -0.05195048]
Average error: -0.05071460749017986


### MLP

In [373]:
# MLP (hidden layers 100-200-100, logistic activations) on the currently loaded X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05532279 -0.05172455 -0.05496613 -0.05148859 -0.0529052  -0.05057765
 -0.05203554]
Average error: -0.0527172064499352


### Random Forest

In [82]:
# Random forest (1000 trees, depth <= 30, sqrt feature subsampling) on the currently loaded X.
# Scores are negated MAE per fold over 7 folds: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04699509 -0.04865578 -0.0488202  -0.04648107 -0.04759942 -0.0502361
 -0.04704309]
Average error: -0.04797582117451038


# Engineered labels

In [4]:
class IMDB_Labels:
    """Reversible label transform: power transform followed by a normal CDF.

    The raw labels are raised to ``factor`` and the result is pushed through
    the CDF of a normal distribution fitted (mean/std) to those powered
    values, giving targets in [0, 1]. ``revert`` applies the inverse mapping.
    Presumably ``factor`` was chosen so that ``y**factor`` is roughly
    Gaussian -- TODO confirm.

    Attributes:
        factor:  exponent applied to the raw labels.
        y:       the raw labels exactly as passed in.
        normal:  ``y ** factor``.
        mean:    mean of ``normal``.
        std:     population standard deviation of ``normal``.
        uniform: normal CDF of ``normal`` under (``mean``, ``std``).
    """

    def __init__(self, y, factor=1.6):
        """Compute and store the transformed labels.

        Args:
            y: array-like of raw labels; expected non-negative, because a
                negative value raised to a fractional power is NaN.
            factor: exponent of the power transform.
        """
        self.factor = factor
        self.y = y
        # asarray lets plain lists/tuples work; ndarray float input is used as is.
        self.normal = np.asarray(y, dtype=float) ** factor
        self.mean = np.mean(self.normal)
        self.std = np.std(self.normal)
        self.uniform = norm.cdf(self.normal,
                                loc=self.mean,
                                scale=self.std)

    def revert(self, y):
        """Map values from the uniform scale back to the raw label scale.

        Args:
            y: array-like of values in [0, 1] (e.g. model predictions).

        Returns:
            Array of labels on the original scale. Inputs whose inverse-CDF
            value is negative are outside the forward transform's range and
            are mapped to 0 rather than NaN.
        """
        normal = norm.ppf(y, loc=self.mean, scale=self.std)
        # Small y gives ppf values below 0; a negative base with a fractional
        # exponent is NaN, so clamp to the smallest value y**factor can take.
        return np.maximum(normal, 0.0) ** (1 / self.factor)

In [66]:
# Reload the normalized features and labels and shuffle the rows jointly
# (labels ride along as the last column so pairs stay aligned).
X, y = np.load("X_normalized.npy"), np.load("y.npy")
Xy = np.concatenate((X, y.reshape(-1, 1)), axis=1)
np.random.shuffle(Xy)
X, y = Xy[:, :-1], Xy[:, -1]
# Build the transformed targets from the shuffled labels.
labels = IMDB_Labels(y)

In [16]:
# Hold-out split on the CDF-transformed targets (labels.uniform).
# NOTE(review): train_size=0.3 trains on only 30% of the rows -- confirm that
# test_size=0.3 was not what was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels.uniform,
    train_size=0.3,
)

In [60]:
# Fit the forest on the training split and predict the held-out rows.
rf = RFR(n_estimators=1000, max_depth=30, max_features="sqrt")
rf.fit(X_train, y_train)
# Predict with the estimator fitted just above. The original called
# `mlp.predict`, i.e. an unrelated model left over from another cell, so the
# forest trained here was never actually evaluated.
y_pred = rf.predict(X_test)

In [61]:
# Pull predictions that fall on or outside [0, 1] back to the extremes seen in
# the training targets. labels.revert feeds these values to norm.ppf, which
# is infinite at exactly 0 or 1 and NaN outside [0, 1].
# Order matters: the second mask is computed after the first assignment.
y_pred[y_pred >= 1] = y_train.max()
y_pred[y_pred <= 0] = y_train.min()

In [62]:
# Map the predictions from the uniform scale back to the original label scale.
y_pred_ = labels.revert(y_pred)

In [69]:
# Baseline: mean absolute error of always predicting the mean label.
np.abs(y - y.mean()).mean()

0.08186567612064759

In [65]:
# Preview the first 50 held-out targets mapped back to the original label scale.
labels.revert(y_test)[:50]

array([0.5 , 0.58, 0.76, 0.72, 0.59, 0.78, 0.57, 0.67, 0.74, 0.63, 0.55,
       0.48, 0.67, 0.72, 0.7 , 0.52, 0.74, 0.7 , 0.71, 0.78, 0.49, 0.59,
       0.78, 0.47, 0.54, 0.67, 0.79, 0.67, 0.65, 0.6 , 0.8 , 0.59, 0.67,
       0.59, 0.58, 0.58, 0.65, 0.7 , 0.56, 0.5 , 0.81, 0.81, 0.7 , 0.57,
       0.7 , 0.56, 0.64, 0.55, 0.33, 0.53])

# Unengineered

In [58]:
# Load NE_data.npy: every column but the last is a feature, the last is the label.
data = np.load("NE_data.npy")
X, y = data[:, :-1], data[:, -1]

In [71]:
# Linear regression on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
lr = LR()
cv_results = cross_validate(
    estimator=lr,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05357368 -0.05273522 -0.05154145 -0.04912436 -0.05048574 -0.05138884
 -0.04660321]
Average error: -0.05077892795794896


In [72]:
# MLP (hidden layers 100-200-100, logistic activations) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05381675 -0.05526925 -0.05233026 -0.05018833 -0.05046268 -0.0522355
 -0.04850822]
Average error: -0.05183014323338837


In [73]:
# Random forest (1000 trees, depth <= 30, sqrt feature subsampling) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.04986848 -0.04936366 -0.04941054 -0.04529876 -0.04778694 -0.04869685
 -0.04505248]
Average error: -0.04792538670781332


In [74]:
# k-nearest-neighbours regressor (k = 10) on X, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
knr = KNR(n_neighbors=10)
cv_results = cross_validate(
    estimator=knr,
    X=X,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05991773 -0.058883   -0.057117   -0.05492322 -0.05807861 -0.05700916
 -0.05462454]
Average error: -0.05722189441097771


In [90]:
# Reload NE_data.npy (unshuffled): features are all columns but the last, label is the last.
data = np.load("NE_data.npy")
X, y = data[:, :-1], data[:, -1]

In [116]:
# Z keeps the first 8 and the last 6 columns of X; W is the block of columns in between.
Z = np.concatenate((X[:, :8], X[:, -6:]), axis=1)
W = X[:, 8:-6]

In [112]:
# Linear regression on Z (first 8 + last 6 columns of X), 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
lr = LR()
cv_results = cross_validate(
    estimator=lr,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.06595784 -0.05368395 -0.0580389  -0.06521339 -0.06794026 -0.07148303
 -0.08448462]
Average error: -0.06668599932055987


In [113]:
# MLP (hidden layers 100-200-100, logistic activations) on Z, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.07856632 -0.07209986 -0.08150523 -0.08635458 -0.0882197  -0.08499739
 -0.09449917]
Average error: -0.08374889355246573


In [115]:
# Random forest (1000 trees, depth <= 30, sqrt feature subsampling) on Z, 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=Z,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.05366704 -0.04700653 -0.0466757  -0.05309454 -0.05419429 -0.05722049
 -0.06906141]
Average error: -0.05441714267340979


In [121]:
# MLP (hidden layers 100-200-100, logistic activations) on W (columns 8 .. -6 of X), 7-fold CV.
# Scores are negated MAE per fold: closer to 0 is better.
mlp = MLPR(hidden_layer_sizes=[100, 200, 100], activation='logistic')
cv_results = cross_validate(
    estimator=mlp,
    X=W,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.08232975 -0.06931456 -0.07610319 -0.08583467 -0.07693066 -0.08219183
 -0.08451673]
Average error: -0.07960305682342253


In [122]:
# Random forest (1000 trees, depth <= 30, sqrt feature subsampling) on W (columns 8 .. -6 of X).
# Scores are negated MAE per fold over 7 folds: closer to 0 is better.
rf = RFR(
    n_estimators=1000,
    max_depth=30,
    max_features="sqrt",
)
cv_results = cross_validate(
    estimator=rf,
    X=W,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=7,
)
print(cv_results['test_score'])
print("Average error:", cv_results['test_score'].mean())

[-0.08275721 -0.067161   -0.06886627 -0.07430232 -0.07961781 -0.07864753
 -0.08206886]
Average error: -0.07620299929534814
