In [7]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import os
import pandas as pd

folder = 'Genre Classification Dataset'
train_file = 'train_data.txt'
test_file = 'test_data_solution.txt'

def get_dataframe(file_name):
    data = []

    with open(os.path.join(folder, file_name)) as f:

        lines = f.readlines()

        for idx, line in enumerate(lines):
            
            row = line.split(' ::: ')

            try:
                title = row[1].split(' (')[0]
                year = row[1].split(' (')[1].split(')')[0]
                genre = row[2]
                description = row[3]
                data.append([title, year, genre, description])
            except Exception as exp:
                print(exp)

    return pd.DataFrame(data, columns=['title', 'year', 'genre', 'description'])

df_train = get_dataframe(train_file)
df_test = get_dataframe(test_file)

In [10]:
print(f"Size of training dataset: {len(df_train)}, test dataset: {len(df_test)}")
df_train.head()

Size of training dataset: 54214, test dataset: 54200


Unnamed: 0,title,year,genre,description
0,Oscar et la dame rose,2009,drama,Listening in to a conversation between his doc...
1,Cupid,1997,thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful",1980,adult,As the bus empties the students for their fiel...
3,The Secret Sin,1915,drama,To help their unemployed father make ends meet...
4,The Unrecovered,2007,drama,The film's title refers not only to the un-rec...


In [11]:
df_test.head()

Unnamed: 0,title,year,genre,description
0,Edgar's Lunch,1998,thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá,1977,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track,2010,documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu,2015,drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai,1955,drama,Before he was known internationally as a marti...


In [12]:
input_features = ['title', 'year', 'description']
output_features = ['genre']

X_train = df_train[input_features]
y_train = df_train[output_features]

Plan for the model classifying genres based on title, year, description.

1. Vectorize the data:
    - genre2vec (straightforward)
    - title2vec (complex)
    - decription2vec (complex)
2. Build 2 models and unittest them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: 0-small # of bugs expected
    - LSTM: moderate # of bugs expected
    - Transformers: high # of bugs expected

4. Test models

In [13]:
def one_hot_encode_categories(dataframe, target):
    return dataframe.join(pd.get_dummies(dataframe[target])).drop(columns=target)

num_df_train = one_hot_encode_categories(df_train, 'genre')
num_df_test = one_hot_encode_categories(df_test, 'genre')

In [14]:
num_df_train.head()

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,news,reality-tv,romance,sci-fi,short,sport,talk-show,thriller,war,western
0,Oscar et la dame rose,2009,Listening in to a conversation between his doc...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Cupid,1997,A brother and sister with a past incestuous re...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"Young, Wild and Wonderful",1980,As the bus empties the students for their fiel...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Secret Sin,1915,To help their unemployed father make ends meet...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Unrecovered,2007,The film's title refers not only to the un-rec...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
num_df_test.head()

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,news,reality-tv,romance,sci-fi,short,sport,talk-show,thriller,war,western
0,Edgar's Lunch,1998,"L.R. Brane loves his life - his car, his apart...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,La guerra de papá,1977,"Spain, March 1964: Quico is a very naughty chi...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Off the Beaten Track,2010,One year in the life of Albin and his family o...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Meu Amigo Hindu,2015,"His father has died, he hasn't spoken with his...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Er nu zhai,1955,Before he was known internationally as a marti...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
import spacy
# If not working python run python3 -m spacy download en_core_web_sm
from nltk import sent_tokenize
# If not working run nltk.download()
import numpy as np

nlp = spacy.load("en_core_web_sm")

def lemmatize(description):
    sentences = sent_tokenize(description)
    description_lem = [word.lemma_ for sentence in sentences for word in nlp(sentence)]
    return description_lem

In [84]:
description_train_lem = [lemmatize(num_df_train.loc[i]['description']) for i in range(len(df_train[0:1000]))]
description_test_lem = [lemmatize(num_df_train.loc[i]['description']) for i in range(len(df_test[0:1000]))]

In [85]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

documents_train = [TaggedDocument(doc, [i]) for i, doc in enumerate(description_train_lem)]
documents_test = [TaggedDocument(doc, [i]) for i, doc in enumerate(description_test_lem)]
model = Doc2Vec(vector_size=50, min_count=2, epochs=40)

2023-01-05 22:52:14,419 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-01-05T22:52:14.419514', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'created'}


In [86]:
model.build_vocab(documents_train)


2023-01-05 22:52:14,461 : INFO : collecting all words and their counts
2023-01-05 22:52:14,461 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-01-05 22:52:14,473 : INFO : collected 13359 word types and 1000 unique tags from a corpus of 1000 examples and 120217 words
2023-01-05 22:52:14,473 : INFO : Creating a fresh vocabulary
2023-01-05 22:52:14,484 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 6326 unique words (47.35% of original 13359, drops 7033)', 'datetime': '2023-01-05T22:52:14.484397', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'prepare_vocab'}
2023-01-05 22:52:14,484 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 113184 word corpus (94.15% of original 120217, drops 7033)', 'datetime': '2023-01-05T22:52:14.484739', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by co

In [87]:
print(f"Word 'year' appeared {model.wv.get_vecattr('year', 'count')} times in the training corpus.")


Word 'year' appeared 257 times in the training corpus.


In [88]:
model.train(documents_train, total_examples=model.corpus_count, epochs=model.epochs)

2023-01-05 22:52:14,590 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 6326 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-05T22:52:14.590354', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'train'}
2023-01-05 22:52:14,639 : INFO : EPOCH 0: training on 120217 raw words (78378 effective words) took 0.0s, 1635059 effective words/s
2023-01-05 22:52:14,688 : INFO : EPOCH 1: training on 120217 raw words (78181 effective words) took 0.0s, 1651847 effective words/s
2023-01-05 22:52:14,739 : INFO : EPOCH 2: training on 120217 raw words (78091 effective words) took 0.0s, 1571120 effective words/s
2023-01-05 22:52:14,877 : INFO : EPOCH 3: training on 120217 raw words (78129 effective words) took 0.1s, 594106 effective words/s
2023-01-05 22:52:14,922 : INFO : EPOCH 4: training on 1

In [89]:
vector = model.infer_vector(['only', 'you', 'can', 'prevent', 'forest', '.', 'fires', 'also', 'you', 'need', 'to', 'learn', '.'])
print(vector)


[ 0.03039514 -0.09620716 -0.36460468 -0.1251611  -0.15589844 -0.21638525
 -0.06199964  0.2630613   0.08590225 -0.08173687  0.01497169  0.40753338
  0.05708568 -0.08460815 -0.36787924  0.11084662  0.27798635 -0.19433294
  0.29197362 -0.10215756  0.47113967  0.22969201  0.46374136 -0.33741337
  0.2220612  -0.1525057   0.29358184 -0.20131418 -0.08077633 -0.19337368
  0.19067386  0.2094886  -0.42541397  0.22059736 -0.5109805   0.18710195
  0.05757324 -0.43431732 -0.56124634 -0.38296095 -0.0530347   0.04964468
  0.28948095  0.3143342   0.5137067   0.16092952 -0.07840962 -0.3617826
 -0.08346063  0.03003765]


In [90]:
ranks = []
second_ranks = []
for doc_id in range(len(documents_train)):
    inferred_vector = model.infer_vector(documents_train[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])


In [91]:
import collections

counter = collections.Counter(ranks)
print(counter)


Counter({0: 1000})


In [94]:
import random
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(documents_test) - 1)
inferred_vector = model.infer_vector(list(documents_test[doc_id])[0])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(list(documents_test[doc_id])[0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MOST2', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents_train[sims[index][0]].words)))


Test Document (334): «in 1976 , William Wyler become the fourth recipient of the American Film Institute 's Lifetime Achievement Award , follow John Ford , James Cagney and Orson Welles . the winner of three Best Director Academy Awards ( and a record twelve nomination ) , Wyler have direct more oscar - win performance than any other director : Walter Brennan ( twice ) , Bette Davis , Fay Bainter , Greer Garson , Teresa Wright , Fredric March , Harold Russell , Olivia de Havilland , Audrey Hepburn , Burl Ives , Charlton Heston , Hugh Griffith and Barbra Streisand . among the film luminary who pay tribute to Wyler be Audrey Hepburn , Gregory Peck , Myrna Loy , Henry Fonda , James Stewart , Barbra Streisand , Charlton Heston , Eddie Albert , Merle Oberon , Walter Pidgeon , Greer Garson and Harold Russell . Film clip include : " the good year of our life , " " Roman Holiday , " " Ben - Hur , " " Mrs. Miniver , " " Funny Girl , " " Wuthering Heights , " and " the Heiress . " conspicously a

2. Build 2 models and unittest them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: 0-small # of bugs expected
    - LSTM: moderate # of bugs expected
    - Transformers: high # of bugs expected

4. Test models