2. Build 2 models and unit-test them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement Doc2Vec with NNs (*) - done
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: zero to a small number of bugs expected
    - LSTM: a moderate number of bugs expected
    - Transformers: a high number of bugs expected

4. Test models

In [1]:
import pandas as pd
# Training split of the genre classification dataset.
train_csv_path = 'Genre Classification Dataset/train_dataframe.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

Unnamed: 0,title,year,genre,description
0,Oscar et la dame rose,2009,drama,Listening in to a conversation between his doc...
1,Cupid,1997,thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful",1980,adult,As the bus empties the students for their fiel...
3,The Secret Sin,1915,drama,To help their unemployed father make ends meet...
4,The Unrecovered,2007,drama,The film's title refers not only to the un-rec...


In [2]:
# Test split of the genre classification dataset.
test_csv_path = 'Genre Classification Dataset/test_dataframe.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,title,year,genre,description
0,Edgar's Lunch,1998,thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá,1977,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track,2010,documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu,2015,drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai,1955,drama,Before he was known internationally as a marti...


In [3]:
# All genre labels, in alphabetical order. The position of a
# genre in this list is its numeric class id.
GENRES_NAMES = [
    'action', 'adult', 'adventure', 'animation', 'biography',
    'comedy', 'crime', 'documentary', 'drama', 'family',
    'fantasy', 'game-show', 'history', 'horror', 'music',
    'musical', 'mystery', 'news', 'reality-tv', 'romance',
    'sci-fi', 'short', 'sport', 'talk-show', 'thriller',
    'war', 'western',
]

# Reverse lookup: genre name -> numeric class id.
GENRES_NUMBERS = {name: idx for idx, name in enumerate(GENRES_NAMES)}

In [4]:
def join_text(row):
    """
    Build the single text field fed to the classifier.

    row: mapping-like object (e.g. a DataFrame row) with the keys
         'title', 'year' and 'description'

    returns: str of the form "<title>. <year>. <description>"

    Every field is passed through ``str()`` before joining, because
    ``str.join`` raises ``TypeError`` on non-string items (for example a
    year that pandas parsed as an integer).
    """
    fields = ('title', 'year', 'description')
    return '. '.join(str(row[field]) for field in fields)
    
# Add the combined "title. year. description" column to both splits.
for frame in (df_train, df_test):
    frame['joined_text'] = frame.apply(join_text, axis=1)

In [5]:
from tqdm import tqdm
import spacy

# Blank English pipeline; used below only to turn raw texts into Doc objects.
nlp = spacy.blank("en")

def make_save_docs(data, outfile):
    """
    Convert (text, label) pairs into spaCy docs annotated for text
    classification and serialize them to disk.

    Each text is run through the module-level ``nlp`` pipeline. Every genre
    in ``GENRES_NAMES`` is written into ``doc.cats``: ``True`` for the genre
    equal to the label, ``False`` for all the others.

    data: list(tuple(text, label)) -- both elements must be strings
    outfile: path the ``DocBin`` containing all docs is written to

    returns: list(spacy.tokens.Doc)

    raises: AssertionError if a text or a label is not a string
    """

    docs = []
    db = spacy.tokens.DocBin()

    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total=len(data)):

        assert isinstance(doc.text, str), f"{doc.text} isn't of string type"
        # `assert type(label)` was always truthy and never validated anything;
        # a non-string label (e.g. NaN) would silently yield all-False cats.
        assert isinstance(label, str), f"{label} isn't of string type"

        # One boolean per known genre; only the matching genre is True.
        for genre in GENRES_NAMES:
            doc.cats[genre] = label == genre

        docs.append(doc)
        db.add(doc)

    db.to_disk(outfile)

    return docs

# Fixed seed so the shuffled order, and therefore the serialized .spacy
# files, are identical on every Restart & Run All.
SHUFFLE_SEED = 42

datatrain = list(
    df_train[['joined_text', 'genre']]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .itertuples(index=False, name=None)
)
datatest = list(
    df_test[['joined_text', 'genre']]
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .itertuples(index=False, name=None)
)

train_data = make_save_docs(datatrain, 'train.spacy')
# NOTE(review): the test split is serialized as the dev set used during
# training -- confirm a separate hold-out exists for the final evaluation.
dev_data = make_save_docs(datatest, 'dev.spacy')

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 54214/54214 [00:25<00:00, 2155.74it/s]
100%|██████████| 54200/54200 [00:22<00:00, 2370.16it/s]


In [8]:
# Expand the partial base_config.cfg into a complete training config (config.cfg).
!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [9]:
# Train the pipeline described by config.cfg on train.spacy, evaluating on
# dev.spacy; results are written to ./output.
# NOTE(review): this cell invokes `python3` while the fill-config cell invokes
# `python` -- confirm both resolve to the same environment as the kernel.
!python3 -m spacy train config.cfg --output ./output --paths.train train.spacy --paths.dev dev.spacy

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2023-01-11 15:16:14,144] [INFO] Set up nlp object from config
[2023-01-11 15:16:14,150] [INFO] Pipeline: ['tok2vec', 'textcat']
[2023-01-11 15:16:14,151] [INFO] Created vocabulary
[2023-01-11 15:16:15,248] [INFO] Added vectors: en_core_web_lg
[2023-01-11 15:16:16,893] [INFO] Finished initializing nlp object
[2023-01-11 15:17:09,192] [INFO] Initialized pipeline components: ['tok2vec', 'textcat']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ------------  ----------  ------
  0       0          0.00          0.04        0.00    0.00
  0     200         16.62          7.56        1.77    0.02
  0     400         26.88          7.18        1.46    0.01
  0     600          6.61          7.15    