## Regression and Actor values

We now check if we have debiased actor values given these confounders.

In [None]:
# Regression with confounders
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Import data
# NOTE(review): Y_Scaled is used by the regression cells below, but the only
# definition visible here is the commented-out line that follows. Presumably it
# is created elsewhere in the notebook -- confirm it survives Restart & Run All.
#Y_Scaled = StandardScaler().fit_transform(Y.reshape(-1,1))

In [None]:
def fitModel(X, Y, title, test_size=0.8, cat=False, model=None):
    """Fit a model on a train/test split, plot predictions and report errors.

    Parameters
    ----------
    X : array-like
        Feature matrix, forwarded to ``train_test_split``.
    Y : array-like
        Target values, split alongside ``X``.
    title : str
        Title of the predicted-vs-actual scatter plot.
    test_size : float or int, default 0.8
        Forwarded unchanged to ``train_test_split``. Note that the default
        holds out 80% of the rows for testing.
    cat : bool, default False
        When False the reported score is ``r2_score(Y_test, Y_pred)``;
        when True it is ``model.score(X_test, Y_test)``.
    model : estimator or None, default None
        Object exposing ``fit``, ``predict``, ``coef_`` (and ``score`` when
        ``cat`` is True). ``None`` creates a fresh ``LinearRegression`` for
        this call, so separate calls never share one estimator instance.

    Returns
    -------
    tuple
        ``(model.coef_, Y_pred, Y_test)``.
    """
    # A default of `LinearRegression()` in the signature would be evaluated once
    # and the same instance re-fitted by every call; build it per call instead.
    if model is None:
        model = LinearRegression()

    # Fixed random_state keeps the split identical between runs and between
    # the with/without-confounder regressions.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=42
    )
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    if cat:
        score = model.score(X_test, Y_test)
        score_label = "Model score:"
    else:
        score = r2_score(Y_test, Y_pred)
        score_label = "R^2 score:"

    plt.plot(Y_pred, Y_test, 'o')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(title)
    plt.show()

    print("Root Mean squared Error:", np.sqrt(mean_squared_error(Y_test, Y_pred)))
    # Surface the score as well; it was previously computed and discarded.
    print(score_label, score)
    return model.coef_, Y_pred, Y_test

### Regression on Revenue with only Actors (no confounders).

In [None]:
# Baseline fit on the actor columns of x_train only.
# fitModel returns a (coef_, Y_pred, Y_test) tuple, so this name holds all three.
coef_noCF_Y = fitModel(
    x_train.to_numpy(),
    Y_Scaled,
    title="Regression with Actors",
    cat=False,
)

### Regression on Revenue with Actors and Confounders

In [None]:
# Place the confounder columns (X_CF) to the right of the actor columns,
# so the actor coefficients stay at the front of the fitted coefficient vector.
data_CF = np.concatenate([x_train.to_numpy(), X_CF], axis=1)

In [None]:
# Same regression, now on the actor columns plus the confounder columns.
# As above, the result is the full (coef_, Y_pred, Y_test) tuple.
coef_X_CF_Y = fitModel(
    data_CF,
    Y_Scaled,
    title="Regression with Actors and Confounders",
    cat=False,
)

RMSE has decreased drastically.

Now we see which actors' coefficient values have changed, and in which direction (i.e., whether they were overvalued or undervalued).

In [None]:
# First [0] selects coef_ from the (coef_, Y_pred, Y_test) tuple; second [0]
# takes its first row (presumably coef_ is 2-D because Y_Scaled is a column
# vector -- TODO confirm).
actor_coefs_noCF = [*coef_noCF_Y[0][0]]

In [None]:
# data_CF stacks the actor columns of x_train first and the confounders after,
# so the leading x_train.shape[1] coefficients are the actor coefficients.
# Slicing by the actor count (rather than a hard-coded `[:-30]`) stays correct
# for any number of confounder columns, including zero.
actor_coefs_X_CF = list(coef_X_CF_Y[0][0])[:x_train.shape[1]]

In [None]:
# Column labels of x_train, in column order; used as the actor names that
# line up with the per-actor coefficients.
all_actors = x_train.columns.tolist()

In [None]:
def findMaxChanges(noCF_coefs, XCF_coefs, actors):
    """Rank actors by how much their coefficient moved once confounders were added.

    Parameters
    ----------
    noCF_coefs : sequence of scalar
        Per-actor coefficients from the fit without confounders.
    XCF_coefs : sequence of scalar
        Per-actor coefficients from the fit with confounders, in the same
        order and of the same length as ``noCF_coefs``.
    actors : sequence
        Actor labels, indexed in parallel with the coefficient sequences.

    Returns
    -------
    tuple
        ``(fin_list, df_toret)`` where ``fin_list`` is a list of
        ``(actor, magnitude, direction)`` tuples sorted by magnitude in
        descending order, and ``df_toret`` is the same data as a DataFrame
        with columns ``Actor``, ``Magnitude`` and ``Direction``.

    Raises
    ------
    AssertionError
        If the two coefficient sequences differ in length.
    """
    assert len(noCF_coefs) == len(XCF_coefs), \
        "noCF_coefs and XCF_coefs must have the same length"

    fin_list = []
    for i, (coef_without, coef_with) in enumerate(zip(noCF_coefs, XCF_coefs)):
        actor = actors[i]
        # Difference between the coefficient with confounders and without.
        diff = coef_with - coef_without
        # Size of the change; for a scalar this is simply the absolute value.
        magnitude = abs(diff)

        # A negative difference means the coefficient shrank once confounders
        # were included, i.e. the actor was being overvalued. An exactly-zero
        # difference is neither direction, so it gets its own label instead of
        # falling through to 'Undervalued'.
        if diff < 0:
            direction = 'Overvalued'
        elif diff == 0:
            direction = 'Unchanged'
        else:
            direction = 'Undervalued'

        fin_list.append((actor, magnitude, direction))

    # Largest changes first.
    fin_list.sort(key=lambda entry: entry[1], reverse=True)
    df_toret = pd.DataFrame(fin_list, columns=['Actor', 'Magnitude', 'Direction'])
    return fin_list, df_toret

In [None]:
# Compare the two coefficient sets actor by actor. The result is a
# (sorted list, DataFrame) pair; the lookups below use the DataFrame at index 1.
tests = findMaxChanges(
    noCF_coefs=actor_coefs_noCF,
    XCF_coefs=actor_coefs_X_CF,
    actors=all_actors,
)

## David Blei's study.

We compare whether the actors our model flags as overvalued or undervalued match those reported in our reference paper. <br>

Overvalued: Brad Pitt, Robert De Niro <br>
Undervalued: Nicolas Cage, Cate Blanchett, Antonio Banderas <br>
Most Improved: Ben Affleck <br>

Overvalued actors: Robert De Niro, Brad Pitt

In [None]:
# Row for Robert De Niro (listed as overvalued in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Robert De Niro']

In [None]:
# Row for Brad Pitt (listed as overvalued in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Brad Pitt']

In [None]:
# Row for Nicolas Cage (listed as undervalued in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Nicolas Cage']

In [None]:
# Row for Cate Blanchett (listed as undervalued in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Cate Blanchett']

Undervalued: Nicolas Cage, Cate Blanchett, Antonio Banderas

In [None]:
# Row for Antonio Banderas (listed as undervalued in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Antonio Banderas']

In [None]:
# Row for Ben Affleck (listed as most improved in the reference study).
tests[1].loc[tests[1]['Actor'] == 'Ben Affleck']

**Is Miss Moneypenny overvalued?** <br>
Looking at who played M in Skyfall: Judi Dench, and who played Bond: Daniel Craig

In [None]:
# Row for Judi Dench, who played M in Skyfall.
tests[1].loc[tests[1]['Actor'] == 'Judi Dench']

In [None]:
# Row for Daniel Craig, who played Bond in Skyfall.
tests[1].loc[tests[1]['Actor'] == 'Daniel Craig']

## HMC
HMC is very slow, but it works without a guide function. This attempt uses only the first 500 columns rather than all 13852 columns.

In [None]:
# Run Hamiltonian Monte Carlo on ppca_model over a column subset of the data.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# top-level imports cell.
from pyro.infer.mcmc import HMC, MCMC

# HMC kernel over ppca_model with a fixed step size and 4 steps per proposal.
# NOTE(review): 0.0855 is an unexplained constant -- confirm how it was chosen.
hmc_kernel = HMC(ppca_model, step_size=0.0855, num_steps=4)

# Keep only the first 500 columns of x_train_tensors to limit the run time.
x_new = x_train_tensors[:, :500]

# Direct call to the model with the result discarded -- presumably a smoke
# test left over from development; confirm whether it is still needed.
ppca_model(x_new)

# 500 retained samples after 100 warm-up steps.
# NOTE(review): this relies on MCMC(...).run(...) returning the posterior
# object. In newer Pyro releases run() appears to return None and samples are
# read with get_samples() -- confirm against the installed Pyro version.
mcmc_run = MCMC(hmc_kernel, num_samples=500, warmup_steps=100).run(x_new)

In [None]:
# Build the empirical marginal for the 'z' site from the MCMC run above.
# NOTE(review): EmpiricalMarginal wrapping an MCMC result looks like an older
# Pyro API -- confirm it exists in the installed version. Also consider moving
# this import to the top-level imports cell.
from pyro.infer import EmpiricalMarginal
posterior_z = EmpiricalMarginal(mcmc_run, 'z')

In [None]:
# Display the mean of the empirical marginal over 'z'.
posterior_z.mean