## Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from joblib import dump, load

# import warnings
# warnings.filterwarnings("ignore")

# pd.options.display.max_columns = 999
# pd.options.display.max_rows = 999
# pd.options.display.max_colwidth = 100

%config InlineBackend.figure_format = "retina"
%matplotlib inline

## Load TED talks data

In [2]:
ted_talks = pd.read_csv("../data/ted_talks_model.csv")

In [3]:
# observe top 5 rows of df
display(ted_talks.head())

# view shape of df
ted_talks.shape

Unnamed: 0,num_comments,is_featured,num_tags,num_resources,num_actions,num_recommend,has_citations,num_languages,duration_min,num_related,...,action_organize,action_participate,action_petition,action_share,action_sign_up,action_subscribe,action_visit,action_volunteer,p_title,p_transcript
0,6,1,6,0,0,0,1,8,14.67,6,...,0,0,0,0,0,0,0,0,food ancestor love,last year wa living indigenous family india on...
1,11,1,7,0,2,4,0,7,11.23,6,...,0,1,0,0,0,0,0,0,stop bystander life,able navigate extraordinary gift nothing like ...
2,16,1,7,0,1,0,0,10,14.18,6,...,0,0,0,0,0,0,0,0,woman lead u freedom justice peace,wa first woman president african nation believ...
3,16,1,5,0,0,2,1,10,14.0,6,...,0,0,0,0,0,0,0,0,beauty complexity finding common ground,story started several year ago wife got compla...
4,26,1,10,0,2,4,1,4,13.18,6,...,0,0,0,0,0,0,0,0,dolly parton led epiphany,want tell search purpose journalist dolly part...


(3963, 492)

## 6.1 Model Preparation

In [4]:
# define predictors and target variable
features = [col for col in ted_talks.columns if col != "avg_m_views"]
X = ted_talks[features]
y = ted_talks["avg_m_views"]

In [5]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y)

## 6.2 Baseline Model

In [6]:
baseline_preds = y.mean()
print(f"Baseline prediction: {round(baseline_preds,2)} average monthly views")

ted_talks["baseline_preds"] = ted_talks["avg_m_views"].mean()

# rmse             
baseline_rmse = np.sqrt(mean_squared_error(ted_talks["avg_m_views"], ted_talks["baseline_preds"]))
print(f"Baseline score (RMSE): {round(baseline_rmse,4)} average monthly views")

# r2
baseline_r2 = r2_score(ted_talks["avg_m_views"], ted_talks["baseline_preds"])
print(f"Baseline score (R2): {round(baseline_r2,4)}")

ted_talks.drop("baseline_preds", axis=1, inplace=True)

Baseline prediction: 76736.59 average monthly views
Baseline score (RMSE): 122723.0166 average monthly views
Baseline score (R2): -0.0


## 6.3 Modelling

In [7]:
# list of stopwords to remove
title_stopwords = ["could", "like"]
transcript_stopwords = ["thank", "much", "applause", "laughter", "like", "wa", "da", 
                        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]

In [8]:
# function to optimize model selection process

def run_model(vectorizer, algorithm, param_grid, 
              X_train=X_train, X_test=X_test, 
              y_train=y_train, y_test=y_test):

    if vectorizer == "cvec":
        vec_title = CountVectorizer()
        vec_transcript = CountVectorizer()
    
    if vectorizer == "tvec":
        vec_title = TfidfVectorizer()
        vec_transcript = TfidfVectorizer()

    ss = StandardScaler()

    # establish column transformer
    col_transformer = ColumnTransformer(transformers=
                       [("vec_title", vec_title, "p_title"), 
                        ("vec_transcript", vec_transcript, "p_transcript"), 
                       ("ss", ss, 
                        [c for c in X_train.columns 
                        if c != "p_title" and c != "p_transcript"])])

    # establish pipeline
    pred_pipeline = Pipeline(steps=
                    [("ct", col_transformer), 
                     algorithm])

    # establish search params
    param_grid = param_grid

    # contruct randomised search
    algo_vec_randomsearch = RandomizedSearchCV(estimator = pred_pipeline, 
                                              param_distributions = param_grid, 
                                              cv = 5, 
                                              verbose = 50, 
                                              n_jobs = -1)
    
    # fit model with train set
    algo_vec_randomsearch.fit(X_train, y_train)
    
    # obtain best score
    algo_vec_best_score = algo_vec_randomsearch.best_score_

    # # make predictions on train set
    algo_vec_train_pred = algo_vec_randomsearch.predict(X_train)
    
    # score on train set
    algo_vec_train_r2 = algo_vec_randomsearch.score(X_train, y_train)
    algo_vec_train_rmse = np.sqrt(mean_squared_error(y_train, algo_vec_train_pred))

    # make predictions on test set
    algo_vec_test_pred = algo_vec_randomsearch.predict(X_test)

    # score on test set
    algo_vec_test_r2 = algo_vec_randomsearch.score(X_test, y_test)
    algo_vec_test_rmse = np.sqrt(mean_squared_error(y_test, algo_vec_test_pred))

    # model best params
    algo_vec_best_estimator = algo_vec_randomsearch.best_estimator_
    
    # put results in a df and display scores
    algo_vec_scores = pd.DataFrame({"best_score": round(algo_vec_best_score,4),
                                    "train_score_r2": round(algo_vec_train_r2,4),
                                    "train_score_rmse": round(algo_vec_train_rmse,4),
                                    "test_score_r2": round(algo_vec_test_r2,4),
                                    "test_score_rmse": round(algo_vec_test_rmse,4)}, index=[0])

    return algo_vec_best_estimator, algo_vec_scores

### Multi-Linear Regression with CountVectorizer

In [9]:
mlr_cvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9]
}

mlr_cvec_est, mlr_cvec_scores = run_model(vectorizer = "cvec",
                                          algorithm = ("lr", LinearRegression()),
                                          param_grid = mlr_cvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   47.0s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   50.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   53.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.

In [33]:
print("Multi Linear Regression with CountVectorizer:")
mlr_cvec_scores.rename(index={0: "mlr_cvec"}, inplace=True)
display(mlr_cvec_scores)

Multi Linear Regression with CountVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
mlr_cvec,-1.5958,0.6885,67620.3163,-0.4772,154583.1807


### Multi-Linear Regression with TfidfVectorizer

In [11]:
mlr_tvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9]
}

mlr_tvec_est, mlr_tvec_scores = run_model(vectorizer = "tvec",
                                          algorithm = ("lr", LinearRegression()),
                                          param_grid = mlr_tvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   30.8s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   54.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.

In [34]:
print("Multi Linear Regression with TfidfVectorizer:")
mlr_tvec_scores.rename(index={0: "mlr_tvec"}, inplace=True)
display(mlr_tvec_scores)

Multi Linear Regression with TfidfVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
mlr_tvec,-1.3865,0.7324,62672.1254,-0.4823,154848.85


### Ridge Regression with CountVectorizer

In [13]:
ridge_cvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "ridge__alpha": np.logspace(0, 5, 200)
}

ridge_cvec_est, ridge_cvec_scores = run_model(vectorizer = "cvec",
                                              algorithm = ("ridge", Ridge()),
                                              param_grid = ridge_cvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   21.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   21.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   43.7s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   45.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   46.2s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   46.3s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   48.6s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.

In [35]:
print("Ridge Regression with CountVectorizer:")
ridge_cvec_scores.rename(index={0: "ridge_cvec"}, inplace=True)
display(ridge_cvec_scores)

Ridge Regression with CountVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
ridge_cvec,0.2501,0.41,93064.9964,0.3324,103919.4555


### Ridge Regression with TfidfVectorizer

In [15]:
ridge_tvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "ridge__alpha": np.logspace(0, 5, 200)
}

ridge_tvec_est, ridge_tvec_scores = run_model(vectorizer = "tvec",
                                              algorithm = ("ridge", Ridge()),
                                              param_grid = ridge_tvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   19.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   20.8s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   23.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   23.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   24.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   40.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   44.0s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:   44.6s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   4

In [36]:
print("Ridge Regression with TfidfVectorizer:")
ridge_tvec_scores.rename(index={0: "ridge_tvec"}, inplace=True)
display(ridge_tvec_scores)

Ridge Regression with TfidfVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
ridge_tvec,0.2561,0.3919,94475.5027,0.3245,104535.3493


### Lasso Regression with CountVectorizer

In [25]:
lasso_cvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "lasso__alpha": np.linspace(0.01, 1, 50),
    "lasso__max_iter": [100000]
}

lasso_cvec_est, lasso_cvec_scores = run_model(vectorizer = "cvec",
                                              algorithm = ("lasso", Lasso()),
                                              param_grid = lasso_cvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 15.0min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 16.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 17.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 18.2min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 19.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 24.

In [37]:
print("Lasso Regression with CountVectorizer:")
lasso_cvec_scores.rename(index={0: "lasso_cvec"}, inplace=True)
display(lasso_cvec_scores)

Lasso Regression with CountVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
lasso_cvec,-0.0692,0.5574,80602.0086,0.2006,113713.2447


### Lasso Regression with TfidfVectorizer

In [27]:
lasso_tvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "lasso__alpha": np.linspace(0.01, 1, 50),
    "lasso__max_iter": [100000]
}

lasso_tvec_est, lasso_tvec_scores = run_model(vectorizer = "tvec",
                                              algorithm = ("lasso", Lasso()),
                                              param_grid = lasso_tvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 11.4min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 15.5min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 18.

In [38]:
print("Lasso Regression with TfidfVectorizer:")
lasso_tvec_scores.rename(index={0: "lasso_tvec"}, inplace=True)
display(lasso_tvec_scores)

Lasso Regression with TfidfVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
lasso_tvec,-0.2029,0.5968,76931.0618,0.1396,117975.598


### Random Forest with CountVectorizer

In [21]:
rf_cvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "rf__max_depth": [None, 2, 5, 10, 20],
    "rf__n_estimators": [100, 250, 500]
}

rf_cvec_est, rf_cvec_scores = run_model(vectorizer = "cvec",
                                        algorithm = ("rf", RandomForestRegressor()),
                                        param_grid = rf_cvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   37.9s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  3.

In [39]:
print("Random Forest with CountVectorizer:")
rf_cvec_scores.rename(index={0: "rf_cvec"}, inplace=True)
display(rf_cvec_scores)

Random Forest with CountVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
rf_cvec,0.422,0.9152,35272.1267,0.5042,89554.3432


### Random Forest with TfidfVectorizer

In [23]:
rf_tvec_param_grid = {
    "ct__vec_title__stop_words": [title_stopwords],
    "ct__vec_title__max_features": [300, 500, 750],
    "ct__vec_title__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__stop_words": [transcript_stopwords],
    "ct__vec_transcript__max_features": [300, 500, 750],
    "ct__vec_transcript__ngram_range": [(1,2),(2,2),(2,3)],
    "ct__vec_transcript__max_df": [0.9],
    "rf__max_depth": [None, 2, 5, 10, 20],
    "rf__n_estimators": [100, 250, 500]
}

rf_tvec_est, rf_tvec_scores = run_model(vectorizer = "tvec",
                                        algorithm = ("rf", RandomForestRegressor()),
                                        param_grid = rf_tvec_param_grid)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  7.

In [40]:
print("Random Forest with TfidfVectorizer:")
rf_tvec_scores.rename(index={0: "rf_tvec"}, inplace=True)
display(rf_tvec_scores)

Random Forest with TfidfVectorizer:


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
rf_tvec,0.4122,0.9187,34535.5658,0.4792,91788.2187


### Results:

In [48]:
scores_df = pd.concat([mlr_cvec_scores,
                      mlr_tvec_scores,
                      ridge_cvec_scores,
                      ridge_tvec_scores,
                      lasso_cvec_scores,
                      lasso_tvec_scores,
                      rf_cvec_scores,
                      rf_tvec_scores], axis=0)

In [50]:
print(f"Baseline prediction: {round(baseline_preds,2)} average monthly views")
print(f"Baseline score (RMSE): {round(baseline_rmse,4)} average monthly views")
print(f"Baseline score (R2): {round(baseline_r2,4)}")
display(scores_df)

Baseline prediction: 76736.59 average monthly views
Baseline score (RMSE): 122723.0166 average monthly views
Baseline score (R2): -0.0


Unnamed: 0,best_score,train_score_r2,train_score_rmse,test_score_r2,test_score_rmse
mlr_cvec,-1.5958,0.6885,67620.3163,-0.4772,154583.1807
mlr_tvec,-1.3865,0.7324,62672.1254,-0.4823,154848.85
ridge_cvec,0.2501,0.41,93064.9964,0.3324,103919.4555
ridge_tvec,0.2561,0.3919,94475.5027,0.3245,104535.3493
lasso_cvec,-0.0692,0.5574,80602.0086,0.2006,113713.2447
lasso_tvec,-0.2029,0.5968,76931.0618,0.1396,117975.598
rf_cvec,0.422,0.9152,35272.1267,0.5042,89554.3432
rf_tvec,0.4122,0.9187,34535.5658,0.4792,91788.2187


We select **Ridge Regression with Count Vectorizer** for model deployment. The difference between the randomised search best score and training score is small. The r2 and rmse scores on training data also outperform the baseline score. While the random forest regressor and multi linear regression algorithms boast rather high r2 scores on training data, these models are also at risk of overfitting and may not be able to generalise well to unseen data. In addition, as compared to random forest regressors, regression models allow for more granuality when interpreting feature importance in our predictions.

## 6.4 Model Deployment

In [52]:
# fit model on train set
ridge_cvec_est.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ct',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('vec_title',
                                                  CountVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.int64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
                                  

In [53]:
# make predictions on test set
predictions = ridge_cvec_est.predict(X_test)

In [60]:
# score on test set
r2 = ridge_cvec_est.score(X_test, y_test)
print("Test score (R2): " , r2)

rmse = np.sqrt(mean_squared_error(y_test, predictions))
print("Test score (RMSE): ", rmse)

Test score (R2):  0.33241066309384704
Test score (RMSE):  103919.45545237306


There is a slight drop in the r2 of the test set (0.3324) as compared to the r2 of the training data (0.4100). Nonetheless, the r2 and rmse scores of the test set outperform the baseline scores.

## 6.5 Statistical Inference

In [62]:
ridge_cvec_best_ridge = ridge_cvec_est.named_steps["ridge"]
ridge_cvec_best_ridge

Ridge(alpha=1464.9713983072863, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [66]:
ridge_cvec_best_cvec = ridge_cvec_est.named_steps["ct"]
ridge_cvec_best_cvec

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('vec_title',
                                 CountVectorizer(analyzer='word', binary=False,
                                                 decode_error='strict',
                                                 dtype=<class 'numpy.int64'>,
                                                 encoding='utf-8',
                                                 input='content',
                                                 lowercase=True, max_df=1.0,
                                                 max_features=750, min_df=1,
                                                 ngram_range=(1, 2),
                                                 preprocessor=None,
                                                 stop_words=['co...
                                  'num_related', 'title_len', 'description_len',
                                  't

In [68]:
ridge_cvec_coefs = pd.DataFrame(ridge_cvec_best_ridge.coef_)
ridge_cvec_coefs.rename(columns={0: "coef"}, inplace=True)
ridge_cvec_coefs.head()

Unnamed: 0,coef
0,-81.194834
1,-278.911103
2,139.525104
3,525.91639
4,-93.8753


In [70]:
title_words = [ "title_" + w for w in 
               ridge_cvec_best_cvec.named_transformers_["vec_title"].get_feature_names()]

transcript_words = ["transcript_" + w for w in 
                    ridge_cvec_best_cvec.named_transformers_["vec_transcript"].get_feature_names()]

In [71]:
ridge_cvec_features = (title_words + 
                       transcript_words + 
                       [c for c in X_train.columns if c != "p_title" and c != "p_transcript"])

ridge_cvec_coefs["features"] = ridge_cvec_features
ridge_cvec_coefs.head()

Unnamed: 0,coef,features
0,-81.194834,title_according
1,-278.911103,title_across
2,139.525104,title_act
3,525.91639,title_action
4,-93.8753,title_activist


In [73]:
ridge_cvec_coefs["coef_abs"] = ridge_cvec_coefs["coef"].abs()
ridge_cvec_coefs.head()

Unnamed: 0,coef,features,coef_abs
0,-81.194834,title_according,81.194834
1,-278.911103,title_across,278.911103
2,139.525104,title_act,139.525104
3,525.91639,title_action,525.91639
4,-93.8753,title_activist,93.8753


In [77]:
ridge_cvec_coefs.sort_values("coef_abs", ascending=False).head(30)

Unnamed: 0,coef,features,coef_abs
1065,21975.38971,published_year,21975.38971
1140,9971.935548,tag_animation,9971.935548
1206,9194.76942,tag_coronavirus,9194.76942
1056,8934.28948,has_citations,8934.28948
1448,7455.436648,tag_self,7455.436648
1121,6192.871321,tag_TED-Ed,6192.871321
1147,5947.504241,tag_arts,5947.504241
1051,-5736.231928,is_featured,5736.231928
1064,-4943.406715,published_month,4943.406715
1063,4654.462834,published_day,4654.462834


The predictor with the highest weight here is the published year as revealed during the EDA phase with videos published 2020 having the highest median average monthly viewership. This could perhaps be related to other features such as virus and coronavirus tags appearing as features with high weights and is unsurprising current virus pandemic. However, other talk features that contribute to high viewership are instances when the speaker has provided citations about their talks or if it is a featured talk. Beyond attributes of the talks, topics which garner high viewership human rights, innovation, education and sleep. In addition, where there are links for "follow-up actions" whereby the talk site includes resources for the community to participate in the cause or learn more about the topics often lead to higher viewership as well.

## 6.6 Recommendations and Conclusions 

The model is able to generalise well to unseen datat. Topics relating to human rights, innovation, education and sleep lead to higher viewership. In addition, where there are follow-up actions provided on the talk site, these tend to correspond to higher views as well.

Beyond this project, we can consider incorporating speaker-related description to understand the impact of these features on view count as well as how much they affect viewership. As TED also uploads talks from its partner events for which it provides licenses to but leaves the organisation of events and invited speakers up to the independent organisations, we can also how these satellite-held events that TED uploads to its website will perform.