In [None]:
# from google.colab import drive; drive.mount('/content/drive')   # OK to enable, if kaggle.json is stored in Google Drive

In [None]:
!pip -q install -U sentence-transformers > log    # install sentence BERT text encoder
!pip -q install -U --force-reinstall --no-deps kaggle >> log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                               # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >>log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
!cp kaggle.json ~/.kaggle/kaggle.json >> log      # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                  # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v oct312022moviegenres   # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log              # download competition dataset as a zip file
!unzip -o *.zip >> log                            # Kaggle dataset is copied as a single file and needs to be unzipped.
!kaggle competitions leaderboard --show           # print public leaderboard

cp: cannot stat '/content/drive/MyDrive/kaggle.json': No such file or directory
- competition is now set to: oct312022moviegenres
Using competition: oct312022moviegenres
No results found


In [None]:
# Clear all user-defined names from the interactive namespace without asking for confirmation.
%reset -f
from IPython.core.interactiveshell import InteractiveShell as IS
IS.ast_node_interactivity = "all"    # allows multiple outputs from a cell
import pandas as pd, numpy as np, matplotlib.pyplot as plt, plotly, time
from sklearn.model_selection import train_test_split  # the only allowed function from sklearn
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import RidgeClassifier, RidgeClassifierCV
from sklearn.neighbors import RadiusNeighborsClassifier
from sentence_transformers import SentenceTransformer as SBERT  # to encode multilingual text into numeric vectors

# Display options. Keys are fully qualified ('display.*'): the short names 'max_rows', 'max_columns'
# and 'precision' also match 'styler.*' options in pandas >= 1.4, where set_option raises OptionError
# on an ambiguous pattern.
pd.set_option('display.max_rows', 5)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 30)
pd.set_option('display.precision', 2)
np.set_printoptions(linewidth=100, precision=2, edgeitems=5, suppress=True)

def ToCSV(df, fname):
  """Save `df`, rounded to 2 decimals, as '<fname>.csv' with the index written to a column labelled 'id'.

  Args:
    df:    DataFrame to save.
    fname: output path WITHOUT the '.csv' extension.
  Returns:
    Whatever DataFrame.to_csv returns (None when writing to a path).
  """
  return df.round(2).to_csv(f'{fname}.csv', index_label='id')

class Timer():
  """Wall-clock stopwatch that reports whether a runtime limit was exceeded.

  Attributes:
    t0:  time.time() reading taken when the timer was created.
    lim: runtime limit in seconds.
  """
  def __init__(self, lim:'RunTimeLimit'=60):
    """Start the clock and announce the limit `lim` (in seconds, 60 by default)."""
    self.t0, self.lim = time.time(), lim
    print(f'⏳ started. You have {lim} sec. Good luck!')

  def ShowTime(self):
    """Print the seconds elapsed since creation; bold red with a warning once the limit is exceeded."""
    elapsed = time.time() - self.t0     # sampled once, so the printed value and the limit check always agree
    msg = f'Runtime is {elapsed:.0f} sec'
    if elapsed - 1 > self.lim:          # 1 sec of grace before the warning is shown
      msg = '\033[91m\033[1m' + msg + f' > {self.lim} sec limit!!!\033[0m'   # ANSI bold red ... reset
    print(msg)

# Load the competition table, build one text description per movie, and split into training and test parts.
XY = pd.read_csv('MovieGenresXY.csv')
nClass = 20                                         # number of output classes/genres
YCols = XY.columns[-nClass:]                        # 20 output columns - movie genre indicators (taken before 'desc' is appended)
XNumCols = XY.select_dtypes(include=np.number).drop(YCols, axis=1).columns  # numeric column names
XY[XNumCols] = XY[XNumCols].fillna(0)               # fill numeric columns with zeros
# Blank out missing text BEFORE concatenating: str + NaN is NaN, so a single missing field would turn the
# whole description into NaN (and then into the literal 'na' below), discarding the fields that are present.
txt = XY[['overview', 'title', 'original_language', 'keywords', 'tagline']].fillna('')
XY['desc'] = txt.overview + '.' + txt.title + '. ' + txt.original_language + '. ' + txt.keywords + '. ' + txt.tagline
vX = XY.query('Action!=Action').drop(YCols, axis=1) # test inputs, movie attributes (NaN != NaN selects rows with missing labels)
tXY = XY.query('Action==Action')                    # training I/O (rows whose labels are present)
tX, tY = tXY.drop(YCols, axis=1), tXY[YCols]        # split into training I/O
vX, tX = vX.fillna('na'), tX.fillna('na')           # remaining textual fields are filled with NA text
XY

Unnamed: 0,budget,homepage,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,desc
0,16000000,,"[{""id"": 907, ""name"": ""japa...",en,Hachi: A Dog's Tale,A drama based on the true ...,43.22,"[{""name"": ""Grand Army Ente...","[{""iso_3166_1"": ""GB"", ""nam...",2009-06-13,47801389,93.0,"[{""iso_639_1"": ""en"", ""name...",Released,"A true story of faith, dev...",Hachi: A Dog's Tale,7.7,1717,,,,,,,,,,,,,,,,,,,,,A drama based on the true ...
1,5500000,,"[{""id"": 520, ""name"": ""chic...",en,The Sting,Set in the 1930's this int...,28.50,"[{""name"": ""Universal Pictu...","[{""iso_3166_1"": ""US"", ""nam...",1973-12-25,159616327,129.0,"[{""iso_639_1"": ""en"", ""name...",Released,...all it takes is a littl...,The Sting,7.9,622,,,,,,,,,,,,,,,,,,,,,Set in the 1930's this int...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4801,15000000,http://www.biglebowskiblur...,"[{""id"": 418, ""name"": ""whit...",en,The Big Lebowski,"Jeffrey ""The Dude"" Lebowsk...",49.23,"[{""name"": ""Gramercy Pictur...","[{""iso_3166_1"": ""GB"", ""nam...",1998-03-06,46189568,117.0,"[{""iso_639_1"": ""en"", ""name...",Released,Times like these call for ...,The Big Lebowski,7.8,2926,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Jeffrey ""The Dude"" Lebowsk..."
4802,13000000,,"[{""id"": 4118, ""name"": ""bal...",en,Save the Last Dance,A white midwestern girl mo...,9.97,"[{""name"": ""MTV Films"", ""id...","[{""iso_3166_1"": ""US"", ""nam...",2001-01-12,91038276,112.0,"[{""iso_639_1"": ""en"", ""name...",Released,The Only Person You Need T...,Save the Last Dance,6.3,352,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,A white midwestern girl mo...


In [None]:
tmr = Timer()    # start the runtime clock with the default 60 sec limit; tmr.ShowTime() reports the elapsed time

⏳ started. You have 60 sec. Good luck!


<hr color=red>

<font size=5>⏳</font> <strong><font color=orange size=5>Your Code, Documentation, Ideas and Timer - All Start Here...</font></strong>

**Student's Section** (between ⏳ symbols): add your code and documentation here.

## **Task 1. Preprocessing Pipeline**
 
Explain the elements of your preprocessing pipeline, such as feature engineering, subsampling, clustering, or dimensionality reduction.
1. Why did you choose these elements? (Something in EDA, prior experience,...? Btw, EDA is not required)
1. How do you evaluate the effectiveness of these elements? 
1. What else have you tried that worked or didn't? 

**Student's answer:**

## **Task 2. Modeling Approach**
Explain your modeling approach, i.e. ideas you tried and why you thought they would be helpful. 

1. How did these decisions guide you in modeling?
1. How do you evaluate the effectiveness of these elements? 
1. What else have you tried that worked or didn't? 

**Student's answer:**

Below is a baseline model that produces the result on Kaggle leaderboard (LB).

[SBERT](https://www.sbert.net) generates a 384-dimensional text embedding vector for each movie's description. See [more models](https://www.sbert.net/docs/pretrained_models.html). Use a GPU runtime in Colab for a 10-100x speedup.

In [None]:
%time sbert = SBERT('paraphrase-MiniLM-L6-v2')  # load SBERT embeddings model to encode textual fields

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

CPU times: user 3.6 s, sys: 533 ms, total: 4.13 s
Wall time: 15 s


In [None]:
%time tXEmb = np.c_[tX[XNumCols].values, sbert.encode(tX.desc.tolist())] # Set GPU runtime in Colab for 10-100x speed up
%time vXEmb = np.c_[vX[XNumCols].values, sbert.encode(vX.desc)]
print(f'Train embedding matrix size:', tXEmb.shape)
pd.DataFrame(tXEmb[:3,:20], index=tXY.title[:3]).style.background_gradient(cmap='coolwarm')  # show movie description and a few of its embedding features

CPU times: user 5.23 s, sys: 1.06 s, total: 6.29 s
Wall time: 10.7 s
CPU times: user 3.24 s, sys: 31.8 ms, total: 3.27 s
Wall time: 3.92 s
Train embedding matrix size: (2401, 390)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
The Midnight Meat Train,15000000.0,19.91,3533227.0,98.0,6.0,290.0,-0.08,0.18,-0.04,0.39,0.05,-0.13,0.45,-0.02,0.34,0.02,0.32,0.01,0.12,-0.15
London to Brighton,0.0,3.06,0.0,85.0,6.5,34.0,-0.04,-0.11,0.04,-0.23,0.09,0.25,0.39,0.14,-0.18,0.01,0.3,-0.44,0.13,0.26
The Wash,7000000.0,2.45,10229331.0,93.0,5.3,26.0,0.01,-0.12,0.19,-0.0,-0.15,-0.01,0.19,0.29,0.18,-0.09,-0.12,-0.24,0.1,0.16


In [None]:
m = RidgeClassifier(random_state=0)  # multi-label model: tY holds one indicator column per genre (YCols)
%time m.fit(tXEmb, tY)               # fitting to training I/O

CPU times: user 59.3 ms, sys: 52.5 ms, total: 112 ms
Wall time: 138 ms


  return linalg.solve(A, Xy, sym_pos=True, overwrite_a=True).T


RidgeClassifier(random_state=0)

In [None]:
# Predict genre indicators for the test movies. Don't shuffle observations: keep original order with index 0,1,2,...
pY = pd.DataFrame(m.predict(vXEmb), columns=YCols, index=range(len(vX)))
# Preview: the first 10 predictions labelled by movie title, coloured row-wise.
preview = pd.DataFrame(pY.values[:10,:], columns=YCols, index=vX.title[:10])
preview.style.background_gradient(cmap='coolwarm', axis=1)
# Baseline submission with 0-1 values for class membership; the row index is written as the 'id' column.
pY.to_csv('Baseline (Python).csv', index_label='id')

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Hachi: A Dog's Tale,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
The Sting,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
The Book of Life,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
"Dude, Where’s My Car?",1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bolt,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
Saw IV,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
Hall Pass,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Ladies Man,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Jingle All the Way,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Abraham Lincoln: Vampire Hunter,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# **References:**

1. Remember to cite your sources here as well! At the least, your textbook should be cited. Google Scholar allows you to effortlessly copy/paste an APA citation format for books and publications. Also cite StackOverflow, package documentation, and other meaningful internet resources to help your peers learn from these (and to avoid plagiarism claims).

<font size=5>⌛</font> <strong><font color=orange size=5>Do not exceed competition's runtime limit!</font></strong>

<hr color=red>


In [None]:
tmr.ShowTime()    # print the runtime elapsed since `tmr` was created (measures Colab's runtime). Do not remove. Keep as the last cell in your notebook.

Runtime is 30 sec


# 💡**Starter Ideas**

1. Tune model hyperparameters
1. Try linear and non-linear feature normalization: shift/scale, log, divide features by features (investigate scatterplot matrix)
1. Try higher order feature interactions and polynomial features for the original numeric features. Then identify key features or select key principal components. The final model can be trained on a larger or even full training sample. You can use [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html) to reduce the feature set
1. Do a thorough EDA: look for feature augmentations that result in suitable decision boundaries between pairs of classes (for example, linear boundary for linear models).
1. Evaluate predictions and focus on poorly predicted "groups":
  1. Strongest misclassifications. E.g. the model is very confident about the wrong label
  1. Evaluate predictions near decision boundaries. Is there a way to separate mixed points near the boundary by, perhaps, introducing additional dimensions or interactions?
1. Do scatter plots show piecewise linear shape? Can a separate linear model be used on each support, or can the pattern be linearized via transformations?
1. Clean up textual fields to remove uninformative text. For example, you can use [`json`](https://docs.python.org/3/library/json.html) and [`re`](https://docs.python.org/3/library/re.html) to retrieve just keywords from the lists of [JSON](https://en.wikipedia.org/wiki/JSON) keywords. This may speed up embeddings and lower noise in output coefficients.
   1. Example: `'[{"id": 907, "name": "japanese"}, ...'` $\to$ `'japanese, loyalty, friendship, ...'`
1. Fill input `NA`'s with more suitable statistic (for example, column or group mean or median)
1. Replace extreme numeric values (such as zeros) with some statistic (such as mean/median) or a modeled value or `NA` (if the predictive model can handle `NA` inputs)
1. Consider embedding other textual fields, if they appear to relate to genres. For example, some countries or companies may focus on documentaries or on action films
1. Try to find a more [suitable SBERT embedding](https://www.sbert.net/docs/pretrained_models.html) (small, fast, trained on related text)
1. Consider embedding "important" (for prediction) textual fields separately and concatenating or summing their vectors.
1. Learn about [TMDB](https://www.themoviedb.org/) dataset and [related models](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=tmdb+machine+learning+model&btnG=).
