# if a stand-up comedy will receive above or below average IMDb rating

1) Train weak learners: Random Forrest, Stochastic Gradient Descent.

2) Perform a grid search to find optimal parameters for an XGBoost classifier.

3) Put all three models into an ensemble.

In [1]:
import pandas as pd
import numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [4]:
df=pd.read_csv("frame4.csv")

In [6]:
df

Unnamed: 0,S No.,Tag,URL,Raw Transcript,Names,Title,Year,Transcript,runtime,rating,...,diversity_ratio,Culture,UK,Crimes,Situational,Immigrants,Relationships,Politics,cluster_LDA,cluster_tfidf
0,0,\t\t\t\tTom Segura: Sledgehammer (2023) | Tr...,https://scrapsfromtheloft.com/comedy/tom-segur...,"['[clacking]', '[soft bamboo flute notes play]...",\t\t\t\tTom Segura,\t\t\t\tTom Segura: Sledgehammer,2023,hey hey you tell me how have you been ...,61.0,7.1,...,0.326741,0.001729,0.013285,0.126248,0.014998,0.004348,0.830967,0.008424,5,1
1,1,\t\t\t\tEddie Izzard: Force Majeure Live (20...,https://scrapsfromtheloft.com/comedy/eddie-izz...,['London. Otherwise it would be a little bit t...,\t\t\t\tEddie Izzard,\t\t\t\tEddie Izzard: Force Majeure Live,2013,london otherwise it would be a little bit tric...,85.0,7.3,...,0.317978,0.001048,0.990023,0.000878,0.000503,0.000712,0.002136,0.004701,3,6
2,2,\t\t\t\tMarlon Wayans: You Know What It Is (...,https://scrapsfromtheloft.com/comedy/marlon-wa...,"['[Hip-hop music playing]', '♪♪', '[Cheers and...",\t\t\t\tMarlon Wayans,\t\t\t\tMarlon Wayans: You Know What It Is,2021,miami make some noise for mr marlon wayans...,58.0,4.0,...,0.271009,0.001495,0.002485,0.592501,0.001018,0.001128,0.400080,0.001292,0,5
3,3,\t\t\t\tKyle Kinane: Whiskey Icarus (2012) |...,https://scrapsfromtheloft.com/comedy/kyle-kina...,"['Whoo!', 'I’m Kyle Kinane. Kyle Christian Kin...",\t\t\t\tKyle Kinane,\t\t\t\tKyle Kinane: Whiskey Icarus,2012,whoo im kyle kinane kyle christian kinane yeah...,60.0,7.9,...,0.341096,0.105144,0.001627,0.099213,0.000956,0.003323,0.786317,0.003420,5,1
4,4,\t\t\t\tKyle Kinane: Loose in Chicago (2016)...,https://scrapsfromtheloft.com/comedy/kyle-kina...,"['[indistinct chatter]', 'Ladies and gentlemen...",\t\t\t\tKyle Kinane,\t\t\t\tKyle Kinane: Loose in Chicago,2016,ladies and gentlemen kyle kinane well thank...,77.0,7.1,...,0.360311,0.210421,0.001484,0.102548,0.001601,0.006167,0.656735,0.021043,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,421,\t\t\t\tLOUIS C.K.: 2017 – Full transcript\t...,https://scrapsfromtheloft.com/comedy/louis-c-k...,['Louis C.K Netflix special filmed in Washingt...,\t\t\t\tLOUIS C.K.,\t\t\t\tLOUIS C.K.: 2017,2017,louis ck netflix special filmed in washington ...,,,...,0.287472,0.004457,0.001332,0.001433,0.001133,0.001186,0.942120,0.048339,5,4
394,422,\t\t\t\tGeorge Carlin: Jamming in New York (...,https://scrapsfromtheloft.com/comedy/george-ca...,['Jammin’ in New York is George Carlin’s 14th ...,\t\t\t\tGeorge Carlin,\t\t\t\tGeorge Carlin: Jamming in New York,1992,jammin in new york is george carlins album an...,,,...,0.371251,0.996354,0.000596,0.000585,0.000545,0.000675,0.000627,0.000619,1,1
395,424,\t\t\t\tReggie Watts: Spatial (2016) – Full ...,https://scrapsfromtheloft.com/comedy/reggie-wa...,"['Hello, I’m Thomas. I’m so glad to meet you M...",\t\t\t\tReggie Watts,\t\t\t\tReggie Watts: Spatial,2016,hello im thomas im so glad to meet you mum im...,,,...,0.389581,0.001625,0.868851,0.002999,0.001553,0.001522,0.121899,0.001552,3,3
396,425,\t\t\t\tGEORGE CARLIN: COMPLAINTS AND GRIEVA...,https://scrapsfromtheloft.com/comedy/george-ca...,['Complaints and Grievances is a HBO stand-up ...,\t\t\t\tGEORGE CARLIN,\t\t\t\tGEORGE CARLIN: COMPLAINTS AND GRIEVA...,2001,complaints and grievances is a hbo standup spe...,,,...,0.390789,0.995896,0.000746,0.000716,0.000604,0.000631,0.000755,0.000653,1,1


### One-hot features for cluster assignments

In [7]:
cluster_LDA_dummies = pd.get_dummies(df['cluster_LDA'])
LDA_columns = [str(column) + '_LDA' for column in cluster_LDA_dummies.columns]
cluster_LDA_dummies.columns = LDA_columns

cluster_tfidf_dummies = pd.get_dummies(df['cluster_tfidf'])
tfidf_columns = [str(column) + '_tfidf' for column in cluster_tfidf_dummies.columns]
cluster_tfidf_dummies.columns = tfidf_columns

cluster_df = pd.merge(cluster_LDA_dummies, cluster_tfidf_dummies, right_index=True, left_index=True)
cluster_df.head()

Unnamed: 0,0_LDA,1_LDA,2_LDA,3_LDA,4_LDA,5_LDA,6_LDA,0_tfidf,1_tfidf,2_tfidf,3_tfidf,4_tfidf,5_tfidf,6_tfidf
0,False,False,False,False,False,True,False,False,True,False,False,False,False,False
1,False,False,False,True,False,False,False,False,False,False,False,False,False,True
2,True,False,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,True,False,False,True,False,False,False,False,False
4,False,False,False,False,False,True,False,False,True,False,False,False,False,False


In [8]:
df = pd.merge(df, cluster_df, right_index=True, left_index=True)
df.columns

Index(['S No.', 'Tag', 'URL', 'Raw Transcript', 'Names', 'Title', 'Year',
       'Transcript', 'runtime', 'rating', 'language', 'rating_type', 'words',
       'word_count', 'f_words', 's_words', 'diversity', 'diversity_ratio',
       'Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships',
       'Politics', 'cluster_LDA', 'cluster_tfidf', '0_LDA', '1_LDA', '2_LDA',
       '3_LDA', '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf',
       '3_tfidf', '4_tfidf', '5_tfidf', '6_tfidf'],
      dtype='object')

### Split data into training and testing sets and train models.

- Train Random Forest model

- Train SGD model

- Perform grid search and train XGB model

- Create and ensemble of three classifiers

## Only LDA Topic assignments to train the model

In [9]:
X = np.array(df[['Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships', 'Politics']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(351, 7)
(351,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [11]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.5849056603773585


In [12]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.5660377358490566


In [13]:
xgb = XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(xgb,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(X_train, y_train)

In [14]:
best_xgb = grid.best_estimator_.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print(f'Best XGB score: {best_xgb.score(X_test, y_test)}')

Best params: {'colsample_bytree': 0.3, 'eta': 0.05, 'gamma': 0.4, 'max_depth': 3, 'min_child_weight': 7}
Best XGB score: 0.5283018867924528


In [16]:
# Ensemble
estimators = [('rf', rf), ('sgd', sgd), ('xgb', best_xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.5660377358490566


## Only Cluster assignments to train the model

In [17]:
X = np.array(df[['0_LDA', '1_LDA', '2_LDA', '3_LDA',
       '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf', '3_tfidf',
       '4_tfidf', '5_tfidf', '6_tfidf']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(351, 14)
(351,)


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [18]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.6888888888888889


In [19]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.49056603773584906


In [20]:
# XGBoosting
xgb = XGBClassifier().fit(X_train, y_train)
print(f'XGB score: {xgb.score(X_test, y_test)}')

XGB score: 0.5283018867924528


In [21]:
# Ensemble
estimators = [('rf', rf), ('sgd', sgd), ('xgb', xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.39622641509433965


## Both cluster assignments and LDA probabilities

In [22]:
X = np.array(df[['Culture', 'UK', 'Crimes', 'Situational', 'Immigrants', 'Relationships', 'Politics', '0_LDA', '1_LDA', '2_LDA', '3_LDA',
                 '4_LDA', '5_LDA', '6_LDA', '0_tfidf', '1_tfidf', '2_tfidf', '3_tfidf',
                 '4_tfidf', '5_tfidf', '6_tfidf']].loc[df.rating > 0])
y = np.array(df.rating_type.loc[df.rating > 0])
print(X.shape)
print(y.shape)

(351, 21)
(351,)


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

In [25]:
# Random Forrest
rf = RandomForestClassifier(n_estimators=101).fit(X_train, y_train)
print(f'RF score: {rf.score(X_test, y_test)}')

RF score: 0.5660377358490566


In [26]:
# SGD
sgd = linear_model.SGDClassifier(loss='modified_huber').fit(X_train, y_train)
print(f'SGD score: {sgd.score(X_test, y_test)}')

SGD score: 0.5471698113207547


In [27]:
xgb = XGBClassifier()
parameters = {
     "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
     "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
     "min_child_weight" : [ 1, 3, 5, 7 ],
     "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
     "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
     }

grid = GridSearchCV(xgb,
                    parameters, n_jobs=4,
                    scoring="neg_log_loss",
                    cv=3)

grid.fit(X_train, y_train)

In [28]:
best_xgb = grid.best_estimator_.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print(f'Best XGB score: {best_xgb.score(X_test, y_test)}')

Best params: {'colsample_bytree': 0.3, 'eta': 0.05, 'gamma': 0.4, 'max_depth': 3, 'min_child_weight': 7}
Best XGB score: 0.49056603773584906


In [29]:
# Ensemble
estimators = [('rf', rf), ('sgd', sgd), ('xgb', best_xgb)]

ensemble = VotingClassifier(estimators, voting='soft')
ensemble.fit(X_train, y_train)
print('Voting Classifier, Ensemble Acc: {}'.format(ensemble.score(X_test, y_test)))

Voting Classifier, Ensemble Acc: 0.5849056603773585


### The Random Forest performed the best at 0.68 accuracy when taking only cluster assignments.