In [40]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [41]:
df = pd.read_csv('movieReplicationSet-2.csv')
labels_columns = df.columns[:400].to_list()
labels= df[labels_columns]
factors = df.columns[474:477].to_list() # gender, sibship, social preference
factors = df[factors]
factors = factors[~np.isnan(factors).any(axis=1)]

In [42]:
RMSE = []
for i in range(400): 
    dataset=pd.DataFrame.merge(factors,labels[labels_columns[i]], left_index =True , right_index = True,how = 'inner')
    dataset.dropna(inplace= True)
    x = dataset.iloc[:,:3]
    y = dataset.iloc[:,3:]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 100)
    
    from sklearn.model_selection import RandomizedSearchCV
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)] # Num of trees in random forest
    max_features = ['auto', 'sqrt'] # Num of features to consider at every split
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)] # Max number of levels in tree
    max_depth.append(None) # Min number of samples required to split a node
    min_samples_split = [2, 5, 10] # Min number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4] # Method of selecting samples for training each tree
    bootstrap = [True, False]
    
    # Create random grid
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    # cross-validation
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(x_train, y_train.values.ravel()) # fit the random search model
    best_random = rf_random.best_estimator_
    testing= best_random.predict(x_test)
    rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, testing))
    print('Root mean squared error is '+str(round(rootMeanSqErr,2)))
    dataset['predicted_values'] = 0  # score on all data
    y_pred_mlr= best_random.predict(x)
    dataset['predicted_values'] = y_pred_mlr
    print(labels_columns[i])
    globals()['dataset_%s' % labels_columns[i][:-7].replace(' ','')] = dataset
    print('this is the DF dataset_%s' % labels_columns[i][:-7].replace(' ',''))
    print('-------')
    RMSE.append(rootMeanSqErr)
    

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 1.15
The Life of David Gale (2003)
this is the DF dataset_TheLifeofDavidGale
-------
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 1.1
Wing Commander (1999)
this is the DF dataset_WingCommander
-------
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 0.89
Django Unchained (2012)
this is the DF dataset_DjangoUnchained
-------
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 1.23
Alien (1979)
this is the DF dataset_Alien
-------
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 0.88
Indiana Jones and the Last Crusade (1989)
this is the DF dataset_IndianaJonesandtheLastCrusade
-------
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Root mean squared error is 1.11
Snatch (2000)
this is the DF dataset_Snatch
-------
Fitting 3 folds for 

KeyboardInterrupt: 

In [43]:
sum(RMSE)/len(RMSE)

1.0589102670616064

In [36]:
RMSE2 = []
#linear regression

for i in range(400): 
    dataset=pd.DataFrame.merge(factors,labels[labels_columns[i]], left_index =True , right_index = True,how = 'inner')
    dataset.dropna(inplace= True)
    x = dataset.iloc[:,:3]
    y = dataset.iloc[:,3:]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 100)
    from sklearn.model_selection import RandomizedSearchCV
 
    # cross-validation scheme
    mlr = LinearRegression()  
 
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    parameters = {'normalize':[True,False]}
    mlr_random = RandomizedSearchCV(estimator = mlr, param_distributions = parameters, n_iter = 10, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    
    # Fit the random search model
    best_mlr = mlr_random.fit(x_train, y_train)
    testing= best_mlr.predict(x_test)
    rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, testing))
    print('Root mean squared error is '+str(round(rootMeanSqErr,2)))
    dataset['predicted_values'] = 0 # score on all data
    y_pred_mlr= best_mlr.predict(x)
    dataset['predicted_values'] = y_pred_mlr
    print(labels_columns[i])
    globals()['dataset_%s' % labels_columns[i][:-7].replace(' ','')] = dataset
    print('this is the DF dataset_%s' % labels_columns[i][:-7].replace(' ',''))
    print('-------')
    RMSE2.append(rootMeanSqErr)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.23
The Life of David Gale (2003)
this is the DF dataset_TheLifeofDavidGale
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.17
Wing Commander (1999)
this is the DF dataset_WingCommander
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.9
Django Unchained (2012)
this is the DF dataset_DjangoUnchained
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.2
Alien (1979)
this is the DF dataset_Alien
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.89
Indiana Jones and the Last Crusade (1989)
this is the DF dataset_IndianaJonesandtheLastCrusade
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.09
Snatch (2000)
this is the DF dataset_Snatch
-------
Fitting 3 folds for each of 2 can

Root mean squared error is 1.11
Lost in Translation (2003)
this is the DF dataset_LostinTranslation
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.1
Star Trek: The Motion Picture (1979)
this is the DF dataset_StarTrek:TheMotionPicture
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.86
Inglorious Bastards (2009)
this is the DF dataset_IngloriousBastards
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.88
Clueless (1995)
this is the DF dataset_Clueless
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.15
The Omen (1976)
this is the DF dataset_TheOmen
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.01
Shrek 2 (2004)
this is the DF dataset_Shrek2
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.85
Good Will Hu

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.04
Beetle Juice (1988)
this is the DF dataset_BeetleJuice
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.3
Andaz Apna Apna (1994)
this is the DF dataset_AndazApnaApna
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.99
The Proposal (2009)
this is the DF dataset_TheProposal
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.92
The Shining (1980)
this is the DF dataset_TheShining
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.96
The Land That Time Forgot (1974)
this is the DF dataset_TheLandThatTimeForgot
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.96
The Perfect Storm (2000)
this is the DF dataset_ThePerfectStorm
-------
Fitting 3 folds for each of 2 candidates, 

this is the DF dataset_Room
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.19
Scream (1996)
this is the DF dataset_Scream
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.15
The Evil Dead (1981)
this is the DF dataset_TheEvilDead
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.07
Gangs of New York (2002)
this is the DF dataset_GangsofNewYork
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.92
Stand By Me (1986)
this is the DF dataset_StandByMe
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.03
The Vow (2012)
this is the DF dataset_TheVow
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.79
Toy Story 3 (2010)
this is the DF dataset_ToyStory3
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits

Root mean squared error is 1.21
Anger Management (2002)
this is the DF dataset_AngerManagement
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Angels in the Outfield (1994)
this is the DF dataset_AngelsintheOutfield
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.95
Wild Wild West (1999)
this is the DF dataset_WildWildWest
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.13
Split (2016)
this is the DF dataset_Split
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.0
Bad Boys (1995)
this is the DF dataset_BadBoys
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.81
The Prestige (2006)
this is the DF dataset_ThePrestige
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.45
American Graffiti (1973)
this is the 

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.11
Close Encounters of the Third Kind (1977)
this is the DF dataset_CloseEncountersoftheThirdKind
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Hollow Man (2000)
this is the DF dataset_HollowMan
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.01
Point Break (1991)
this is the DF dataset_PointBreak
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.87
I Robot (2004)
this is the DF dataset_IRobot
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.88
The Dark Knight (2008)
this is the DF dataset_TheDarkKnight
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.86
Ghost (1990)
this is the DF dataset_Ghost
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
R

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi


Root mean squared error is 1.01
L.A. Confidential (1997)
this is the DF dataset_L.A.Confidential
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.93
Chicago (2002)
this is the DF dataset_Chicago
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.09
Star Wars: Episode 1 - The Phantom Menace (1999)
this is the DF dataset_StarWars:Episode1-ThePhantomMenace
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.9
Rain Man (1988)
this is the DF dataset_RainMan
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.11
What Lies Beneath (2000)
this is the DF dataset_WhatLiesBeneath
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.88
Toy Story (1995)
this is the DF dataset_ToyStory
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Double Jeopardy (1999)
this is the DF dataset_DoubleJeopardy
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.07
The Big Lebowski (1998)
this is the DF dataset_TheBigLebowski
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.87
The Matrix (1999)
this is the DF dataset_TheMatrix
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
The Lord of the Rings: The Return of the King (2003)
this is the DF dataset_TheLordoftheRings:TheReturnoftheKing
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Reservoir Dogs (1992)
this is the DF dataset_ReservoirDogs
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Heavy Traffic (1973)
this is the DF dataset_HeavyTraffic
-------
Fit

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Root mean squared error is 1.06
Flowers in the Attic (1987)
this is the DF dataset_FlowersintheAttic
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.91
28 Days Later (2002)
this is the DF dataset_28DaysLater
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.02
The Princess Bride (1987)
this is the DF dataset_ThePrincessBride
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.94
The Green Mile (1999)
this is the DF dataset_TheGreenMile
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.11
Predator (1987)
this is the DF dataset_Predator
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.27
A Night at the Roxbury (1998)
this is the DF dataset_ANightattheRoxbury
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.93
Ed Wo

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Root mean squared error is 1.01
Star Wars: Episode VII - The Force Awakens (2015)
this is the DF dataset_StarWars:EpisodeVII-TheForceAwakens
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.96
The Transporter (2002)
this is the DF dataset_TheTransporter
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.99
Cast Away (2000)
this is the DF dataset_CastAway
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.92
Bad Boys 2 (2003)
this is the DF dataset_BadBoys2
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.04
The Babadook (2014)
this is the DF dataset_TheBabadook
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.27
Saw (2004)
this is the DF dataset_Saw
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.08
Star Wars: E

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.96
Eternal Sunshine of the Spotless Mind (2004)
this is the DF dataset_EternalSunshineoftheSpotlessMind
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.97
Ocean's Eleven (2001)
this is the DF dataset_Ocean'sEleven
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.9
Men in Black II (2002)
this is the DF dataset_MeninBlackII
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.19
Cocktail (1988)
this is the DF dataset_Cocktail
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.91
The Wolf of Wall Street (2013)
this is the DF dataset_TheWolfofWallStreet
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.1
The Game (1997)
this is the DF dataset_TheGame
-------
Fitting 3 folds for 

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

Root mean squared error is 0.96
Runaway Bride (1999)
this is the DF dataset_RunawayBride
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.86
Harry Potter and the Goblet of Fire (2005)
this is the DF dataset_HarryPotterandtheGobletofFire
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.23
Gods and Generals (2003)
this is the DF dataset_GodsandGenerals
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.99
My Father and My Son (2005)
this is the DF dataset_MyFatherandMySon
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 0.93
X-Men 2 (2003)
this is the DF dataset_X-Men2
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean squared error is 1.14
The Usual Suspects (1995)
this is the DF dataset_TheUsualSuspects
-------
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Root mean 

In [44]:
sum(RMSE2)/len(RMSE2) # accuracy - avg RMSE

1.061925609361932

In [48]:
dataset_TheFastandtheFurious

Unnamed: 0,Gender identity (1 = female; 2 = male; 3 = self-described),Are you an only child? (1: Yes; 0: No; -1: Did not respond),Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond),The Fast and the Furious (2001),predicted_values
0,1.0,0,1,4.0,2.662277
2,1.0,1,0,2.5,2.674905
4,1.0,1,1,2.5,2.589016
5,1.0,1,0,4.0,2.674905
7,1.0,0,1,1.5,2.662277
...,...,...,...,...,...
1086,1.0,0,1,2.5,2.662277
1087,1.0,0,0,3.5,2.748166
1088,2.0,1,0,1.5,2.488202
1093,1.0,0,0,4.0,2.748166
