In [40]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [41]:
import os
import pandas as pd

folder = 'Genre Classification Dataset'
train_file = 'train_data.txt'
test_file = 'test_data_solution.txt'

def get_dataframe(file_name):
    data = []

    with open(os.path.join(folder, file_name)) as f:

        lines = f.readlines()

        for idx, line in enumerate(lines):
            
            row = line.split(' ::: ')

            try:
                title = row[1].split(' (')[0]
                year = row[1].split(' (')[1].split(')')[0]
                genre = row[2]
                description = row[3]
                data.append([title, year, genre, description])
            except Exception as exp:
                print(exp)

    return pd.DataFrame(data, columns=['title', 'year', 'genre', 'description'])

df_train = get_dataframe(train_file)
df_test = get_dataframe(test_file)

In [62]:
import numpy as np

print(f"Size of training dataset: {len(df_train)}, test dataset: {len(df_test)}, unique labels: {len(np.unique(df_train['genre']))}")
df_train.head()

Size of training dataset: 54214, test dataset: 54200, unique labels: 27


Unnamed: 0,title,year,genre,description
0,Oscar et la dame rose,2009,drama,Listening in to a conversation between his doc...
1,Cupid,1997,thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful",1980,adult,As the bus empties the students for their fiel...
3,The Secret Sin,1915,drama,To help their unemployed father make ends meet...
4,The Unrecovered,2007,drama,The film's title refers not only to the un-rec...


In [43]:
df_test.head()

Unnamed: 0,title,year,genre,description
0,Edgar's Lunch,1998,thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá,1977,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track,2010,documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu,2015,drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai,1955,drama,Before he was known internationally as a marti...


In [44]:
input_features = ['title', 'year', 'description']
output_features = ['genre']

X_train = df_train[input_features]
y_train = df_train[output_features]

Plan for the model classifying genres based on title, year, description.

1. Vectorize the data:
    - genre2vec (straightforward)
    - title2vec (complex)
    - decription2vec (complex)
2. Build 2 models and unittest them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: 0-small # of bugs expected
    - LSTM: moderate # of bugs expected
    - Transformers: high # of bugs expected

4. Test models

In [69]:
def one_hot_encode_categories(dataframe, target):
    return dataframe.join(pd.get_dummies(dataframe[target])).drop(columns=target)

num_df_train = one_hot_encode_categories(df_train, 'genre')
num_df_test = one_hot_encode_categories(df_test, 'genre')

output_features_one_hot = np.unique(df_train[output_features])
print(output_features_one_hot.shape, output_features_one_hot)

(27,) ['action' 'adult' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'game-show' 'history' 'horror'
 'music' 'musical' 'mystery' 'news' 'reality-tv' 'romance' 'sci-fi'
 'short' 'sport' 'talk-show' 'thriller' 'war' 'western']


In [70]:
num_df_train.head()

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,news,reality-tv,romance,sci-fi,short,sport,talk-show,thriller,war,western
0,Oscar et la dame rose,2009,Listening in to a conversation between his doc...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Cupid,1997,A brother and sister with a past incestuous re...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"Young, Wild and Wonderful",1980,As the bus empties the students for their fiel...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Secret Sin,1915,To help their unemployed father make ends meet...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Unrecovered,2007,The film's title refers not only to the un-rec...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
num_df_test.head()

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,news,reality-tv,romance,sci-fi,short,sport,talk-show,thriller,war,western
0,Edgar's Lunch,1998,"L.R. Brane loves his life - his car, his apart...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,La guerra de papá,1977,"Spain, March 1964: Quico is a very naughty chi...",0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Off the Beaten Track,2010,One year in the life of Albin and his family o...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Meu Amigo Hindu,2015,"His father has died, he hasn't spoken with his...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Er nu zhai,1955,Before he was known internationally as a marti...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
import spacy
# If not working python run python3 -m spacy download en_core_web_sm
from nltk import sent_tokenize
# If not working run nltk.download()
import numpy as np

nlp = spacy.load("en_core_web_sm")

def lemmatize(description):
    sentences = sent_tokenize(description)
    description_lem = [word.lemma_ for sentence in sentences for word in nlp(sentence)]
    return description_lem

In [82]:
k = 1000
description_train_lem = num_df_train[0:k].apply(lambda x: lemmatize(x['description']), axis = 1)
description_test_lem = num_df_test[0:k].apply(lambda x: lemmatize(x['description']), axis = 1)

In [85]:
title_train_lem = num_df_train[0:k].apply(lambda x: lemmatize(x['title']), axis = 1)
title_test_lem = num_df_test[0:k].apply(lambda x: lemmatize(x['title']), axis = 1)

In [83]:
len(description_train_lem)

1000

In [86]:
df_train_preproc = num_df_train[:k].join(pd.DataFrame({'description_lem': description_train_lem})).join(pd.DataFrame({'title_lem': title_train_lem}))
df_test_preproc = num_df_test[:k].join(pd.DataFrame({'description_lem': description_test_lem})).join(pd.DataFrame({'title_lem': title_test_lem}))

input_features_preproc = ['description_lem', 'title', 'year']
output_features_preproc = output_features_one_hot

df_train_preproc

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,romance,sci-fi,short,sport,talk-show,thriller,war,western,description_lem,title_lem
0,Oscar et la dame rose,2009,Listening in to a conversation between his doc...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[listen, in, to, a, conversation, between, his...","[Oscar, et, la, dame, rise]"
1,Cupid,1997,A brother and sister with a past incestuous re...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,"[a, brother, and, sister, with, a, past, inces...",[cupid]
2,"Young, Wild and Wonderful",1980,As the bus empties the students for their fiel...,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[as, the, bus, empty, the, student, for, their...","[young, ,, wild, and, wonderful]"
3,The Secret Sin,1915,To help their unemployed father make ends meet...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[to, help, their, unemployed, father, make, en...","[the, Secret, sin]"
4,The Unrecovered,2007,The film's title refers not only to the un-rec...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[the, film, 's, title, refer, not, only, to, t...","[the, Unrecovered]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Dangerous Orphans,1985,"Harry, Moir and Rossi were like the three musk...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[Harry, ,, Moir, and, Rossi, be, like, the, th...","[dangerous, Orphans]"
996,Counting Backwards,2007,"For some, the lives we have are not always the...",0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,"[for, some, ,, the, life, we, have, be, not, a...","[count, backwards]"
997,Bubble Gum,2011/II,Jamshedpur-based Vedant Rawat lives a middle-c...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[Jamshedpur, -, base, Vedant, Rawat, live, a, ...","[bubble, gum]"
998,The Hammer of Mara: Mephisto's Wrath,2015,"Following the events of The Hammer of Mara, Ma...",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[follow, the, event, of, the, Hammer, of, Mara...","[the, Hammer, of, Mara, :, Mephisto, 's, wrath]"


In [87]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

documents_train_desc = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_train_preproc['description_lem'])]
documents_test_desc = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_test_preproc['description_lem'])]
documents_train_title = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_train_preproc['title_lem'])]
documents_test_title = [TaggedDocument(doc, [i]) for i, doc in enumerate(df_test_preproc['title_lem'])]

model_desc = Doc2Vec(vector_size=50, min_count=2, epochs=40)
model_title = Doc2Vec(vector_size=10, min_count=1, epochs=40)

2023-01-06 13:40:33,470 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>', 'datetime': '2023-01-06T13:40:33.469925', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'created'}
2023-01-06 13:40:33,471 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec<dm/m,d10,n5,w5,s0.001,t3>', 'datetime': '2023-01-06T13:40:33.471323', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'created'}


In [88]:
model_desc.build_vocab(documents_train_desc)
model_title.build_vocab(documents_train_title)

2023-01-06 13:40:52,919 : INFO : collecting all words and their counts
2023-01-06 13:40:52,921 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
2023-01-06 13:40:52,934 : INFO : collected 13359 word types and 1000 unique tags from a corpus of 1000 examples and 120217 words
2023-01-06 13:40:52,935 : INFO : Creating a fresh vocabulary
2023-01-06 13:40:52,946 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 retains 6326 unique words (47.35% of original 13359, drops 7033)', 'datetime': '2023-01-06T13:40:52.946904', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'prepare_vocab'}
2023-01-06 13:40:52,947 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=2 leaves 113184 word corpus (94.15% of original 120217, drops 7033)', 'datetime': '2023-01-06T13:40:52.947806', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by co

In [89]:
print(f"Word 'year' appeared {model_desc.wv.get_vecattr('year', 'count')} times in the training corpus.")
print(f"Word 'year' appeared {model_title.wv.get_vecattr('year', 'count')} times in the training corpus.")

Word 'year' appeared 257 times in the training corpus.
Word 'year' appeared 1 times in the training corpus.


In [90]:
model_desc.train(documents_train_desc, total_examples=model_desc.corpus_count, epochs=model_desc.epochs)

2023-01-06 13:41:27,231 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 6326 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-06T13:41:27.231189', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'train'}
2023-01-06 13:41:27,291 : INFO : EPOCH 0: training on 120217 raw words (78136 effective words) took 0.1s, 1332512 effective words/s
2023-01-06 13:41:27,343 : INFO : EPOCH 1: training on 120217 raw words (78307 effective words) took 0.1s, 1530861 effective words/s
2023-01-06 13:41:27,397 : INFO : EPOCH 2: training on 120217 raw words (78209 effective words) took 0.1s, 1505230 effective words/s
2023-01-06 13:41:27,449 : INFO : EPOCH 3: training on 120217 raw words (78367 effective words) took 0.1s, 1537972 effective words/s
2023-01-06 13:41:27,502 : INFO : EPOCH 4: training on 

In [91]:
model_title.train(documents_train_title, total_examples=model_title.corpus_count, epochs=model_title.epochs)

2023-01-06 13:41:45,853 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 3 workers on 2165 vocabulary and 10 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-01-06T13:41:45.853497', 'gensim': '4.3.0', 'python': '3.10.8 | packaged by conda-forge | (main, Nov 22 2022, 08:25:29) [Clang 14.0.6 ]', 'platform': 'macOS-11.7-arm64-arm-64bit', 'event': 'train'}
2023-01-06 13:41:45,869 : INFO : EPOCH 0: training on 3749 raw words (4015 effective words) took 0.0s, 308351 effective words/s
2023-01-06 13:41:45,885 : INFO : EPOCH 1: training on 3749 raw words (4030 effective words) took 0.0s, 372928 effective words/s
2023-01-06 13:41:45,902 : INFO : EPOCH 2: training on 3749 raw words (3987 effective words) took 0.0s, 339040 effective words/s
2023-01-06 13:41:45,915 : INFO : EPOCH 3: training on 3749 raw words (4011 effective words) took 0.0s, 373212 effective words/s
2023-01-06 13:41:45,927 : INFO : EPOCH 4: training on 3749 raw words (

In [92]:
vector = model_desc.infer_vector(['only', 'you', 'can', 'prevent', 'forest', '.', 'fires', 'also', 'you', 'need', 'to', 'learn', '.'])
print(vector)


[-0.08359868 -0.11832076 -0.30858186 -0.27466866 -0.4148984  -0.1669489
 -0.04561296  0.23579566 -0.09821264  0.09103481 -0.06887446  0.145755
  0.208039    0.04267541 -0.29062927  0.31337136  0.23823555 -0.40322718
  0.03623195 -0.18021305  0.49135962  0.22844674  0.37351978 -0.18910986
  0.05901324 -0.23063377  0.49833933 -0.3782788  -0.33989197 -0.33011428
  0.52409685  0.13391775 -0.3700377   0.36670953 -0.652365   -0.18384698
  0.04779305 -0.44073358 -0.29831997 -0.48069853  0.22338131  0.05054268
  0.18892121  0.27125472  0.34869316  0.11090005 -0.1925308  -0.52897936
 -0.18416573 -0.02294459]


In [93]:
vector = model_title.infer_vector(['only', 'you', 'can', 'prevent', 'forest'])
print(vector)

[ 0.04481063  0.13160218  0.13666932  0.02029407  0.04107441  0.08313217
  0.29314438  0.15591714 -0.45527855 -0.12219346]


In [94]:
ranks = []
second_ranks = []
for doc_id in range(len(documents_train_desc)):
    inferred_vector = model_desc.infer_vector(documents_train_desc[doc_id].words)
    sims = model_desc.dv.most_similar([inferred_vector], topn=len(model_desc.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])


In [95]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 1000})


In [96]:
import random
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(documents_test_desc) - 1)
inferred_vector = model_desc.infer_vector(list(documents_test_desc[doc_id])[0])
sims = model_desc.dv.most_similar([inferred_vector], topn=len(model_desc.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(list(documents_test_desc[doc_id])[0])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model_desc)
for label, index in [('MOST', 0), ('MOST2', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(documents_test_desc[sims[index][0]].words)))


Test Document (229): «Batimam and Robim be two close friend . their real name be Léo and Mário , but they 've get those nickname after dress up for mardi - gras as Batman and Robin . to get some money , they decide to rob a small supermarket , but they meet some reaction and Mário gets shoot by the owner , who be kill by Léo . they run away and find solace in an abandon house . Léo decide to go out to try to get some help for his friend , whose leg be wound , but to no avail . when he return , the two start an argument .»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec<dm/m,d50,n5,w5,mc2,s0.001,t3>:

MOST (413, 0.6061080098152161): «thirteen year old Audrey Shaw be new to Bellweather , California and certain she will never fit in when she meet Krista Rich , the pretty , popular girl she aspire to be " or be with " - she be just not sure . Set against the backdrop of abstinence education and present as Audrey 's real life documentary assemble entirely from her own video clip , instant messag

In [115]:
df_train_preproc['description_num'] = df_train_preproc.apply(lambda x: model_desc.infer_vector(x['description_lem']), axis=1)
df_test_preproc['description_num'] = df_test_preproc.apply(lambda x: model_desc.infer_vector(x['description_lem']), axis=1)

In [116]:
df_train_preproc['title_num'] = df_train_preproc.apply(lambda x: model_title.infer_vector(x['title_lem']), axis=1)
df_test_preproc['title_num'] = df_test_preproc.apply(lambda x: model_title.infer_vector(x['title_lem']), axis=1)

In [142]:
def numeric_year(row):
    try:
        return float(row['year'][0:4])
    except:
        return float(0)
    
df_train_preproc['year_num'] = df_train_preproc.apply(numeric_year, axis=1)
df_test_preproc['year_num'] = df_test_preproc.apply(numeric_year, axis=1)

In [143]:
df_train_preproc.head()

Unnamed: 0,title,year,description,action,adult,adventure,animation,biography,comedy,crime,...,sport,talk-show,thriller,war,western,description_lem,title_lem,description_num,title_num,year_num
0,Oscar et la dame rose,2009,Listening in to a conversation between his doc...,0,0,0,0,0,0,0,...,0,0,0,0,0,"[listen, in, to, a, conversation, between, his...","[Oscar, et, la, dame, rise]","[0.3168606, 0.73860466, -0.7193371, -0.3311353...","[0.100330554, 0.16899207, 0.34675777, 0.096694...",2009.0
1,Cupid,1997,A brother and sister with a past incestuous re...,0,0,0,0,0,0,0,...,0,0,1,0,0,"[a, brother, and, sister, with, a, past, inces...",[cupid],"[-0.618494, 0.57840586, -0.3653022, 0.4022861,...","[0.09445099, 0.11505941, 0.13119431, 0.0337071...",1997.0
2,"Young, Wild and Wonderful",1980,As the bus empties the students for their fiel...,0,1,0,0,0,0,0,...,0,0,0,0,0,"[as, the, bus, empty, the, student, for, their...","[young, ,, wild, and, wonderful]","[-1.0658501, 0.17734432, 1.025234, -1.0661346,...","[0.037205953, 0.12656404, 0.17395878, 0.086817...",1980.0
3,The Secret Sin,1915,To help their unemployed father make ends meet...,0,0,0,0,0,0,0,...,0,0,0,0,0,"[to, help, their, unemployed, father, make, en...","[the, Secret, sin]","[1.4273374, 2.2096736, -0.21864274, 0.13609058...","[0.07300471, 0.057478014, 0.12722696, 0.023133...",1915.0
4,The Unrecovered,2007,The film's title refers not only to the un-rec...,0,0,0,0,0,0,0,...,0,0,0,0,0,"[the, film, 's, title, refer, not, only, to, t...","[the, Unrecovered]","[-0.43177173, -0.079410814, 0.7588936, -1.1227...","[0.08164266, 0.061981197, 0.06609384, 0.068311...",2007.0


2. Build 2 models and unittest them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement Doc2Vec with NNs (*)
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: 0-small # of bugs expected
    - LSTM: moderate # of bugs expected
    - Transformers: high # of bugs expected

4. Test models

In [386]:
# Neural Netowrks
import torch
from torch import nn
from torch import functional as F

class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        self.layer1 = nn.Linear(61, 64)
        self.activ1 = nn.ReLU()
        self.layer2 = nn.Linear(64, 128)
        self.activ2 = nn.ReLU()
        self.layer3 = nn.Linear(128, 128)
        self.activ3 = nn.ReLU()
        self.layer4 = nn.Linear(128, 64)
        self.activ4 = nn.ReLU()
        self.layer5 = nn.Linear(64, 32)
        self.activ5 = nn.ReLU()
        self.layer6 = nn.Linear(32, 27)
        self.activ6 = nn.Softmax(dim = 1)

    def forward(self, x):

        x = self.activ1(self.layer1(x))
        x = self.activ2(self.layer2(x))
        x = self.activ3(self.layer3(x))
        x = self.activ4(self.layer4(x))
        x = self.activ5(self.layer5(x))
        x = self.activ6(self.layer6(x))

        return x

net = Net()

print(net)

Net(
  (layer1): Linear(in_features=61, out_features=64, bias=True)
  (activ1): ReLU()
  (layer2): Linear(in_features=64, out_features=128, bias=True)
  (activ2): ReLU()
  (layer3): Linear(in_features=128, out_features=128, bias=True)
  (activ3): ReLU()
  (layer4): Linear(in_features=128, out_features=64, bias=True)
  (activ4): ReLU()
  (layer5): Linear(in_features=64, out_features=32, bias=True)
  (activ5): ReLU()
  (layer6): Linear(in_features=32, out_features=27, bias=True)
  (activ6): Softmax(dim=1)
)


In [387]:
x = torch.rand(4, 61)
y = net.forward(x)
y, torch.sum(y) 

(tensor([[0.0296, 0.0420, 0.0298, 0.0425, 0.0399, 0.0369, 0.0393, 0.0385, 0.0301,
          0.0314, 0.0336, 0.0394, 0.0328, 0.0348, 0.0384, 0.0428, 0.0424, 0.0316,
          0.0363, 0.0409, 0.0412, 0.0338, 0.0390, 0.0369, 0.0402, 0.0413, 0.0343],
         [0.0296, 0.0422, 0.0299, 0.0426, 0.0398, 0.0369, 0.0392, 0.0385, 0.0300,
          0.0315, 0.0336, 0.0394, 0.0327, 0.0348, 0.0384, 0.0429, 0.0424, 0.0316,
          0.0362, 0.0409, 0.0412, 0.0340, 0.0390, 0.0369, 0.0401, 0.0414, 0.0343],
         [0.0295, 0.0421, 0.0298, 0.0425, 0.0398, 0.0369, 0.0393, 0.0384, 0.0300,
          0.0314, 0.0336, 0.0394, 0.0327, 0.0348, 0.0385, 0.0429, 0.0424, 0.0316,
          0.0363, 0.0409, 0.0412, 0.0341, 0.0390, 0.0369, 0.0401, 0.0414, 0.0344],
         [0.0295, 0.0422, 0.0298, 0.0426, 0.0398, 0.0370, 0.0393, 0.0385, 0.0300,
          0.0315, 0.0336, 0.0393, 0.0327, 0.0348, 0.0384, 0.0428, 0.0423, 0.0316,
          0.0363, 0.0409, 0.0412, 0.0341, 0.0389, 0.0369, 0.0402, 0.0413, 0.0344]],
        gra

In [388]:
for layer in net.parameters():
    print(layer)
    break

Parameter containing:
tensor([[-0.1191, -0.0734,  0.0677,  ..., -0.0490, -0.0699,  0.0940],
        [-0.0222,  0.0005,  0.0694,  ..., -0.0671,  0.0346, -0.1096],
        [-0.0573,  0.0684,  0.0705,  ...,  0.0504, -0.1097, -0.0275],
        ...,
        [-0.0507,  0.0517,  0.0286,  ..., -0.0593,  0.1070,  0.0243],
        [-0.0912,  0.0592,  0.0491,  ...,  0.0282,  0.0592, -0.0282],
        [-0.0817,  0.1048, -0.0535,  ..., -0.0323, -0.0079,  0.0350]],
       requires_grad=True)


In [389]:
net.zero_grad()

In [390]:
def preprocess_dataframe(dataframe):
    numpy_array = []
    for _, row in dataframe.iterrows():
        numpy_array.append([v for v in np.hstack(row)])
        
    return np.array(numpy_array)

In [391]:
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler

class MyDataset(Dataset):

    def __init__(self, X, y) -> None:
        super().__init__()
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

input_features_preproc = ['title_num', 'description_num', 'year_num']
X_train = preprocess_dataframe(df_train_preproc[input_features_preproc])
y_train = preprocess_dataframe(df_train_preproc[output_features_preproc])
X_test = preprocess_dataframe(df_test_preproc[input_features_preproc])
y_test = preprocess_dataframe(df_test_preproc[output_features_preproc])


scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

dataset_train = MyDataset(X_train_scaled, y_train)
dataset_test = MyDataset(X_test_scaled, y_test)


In [392]:
from torch.utils.data import DataLoader

trainloader = DataLoader(dataset_train, batch_size = 64, shuffle=True)
testloader = DataLoader(dataset_test, batch_size = 64, shuffle=True)

In [393]:
X, y = next(iter(trainloader))

In [394]:
X[0], y[0]

(tensor([ 0.9863,  1.6536,  0.7074, -0.1097,  0.1007,  1.1804,  1.3120,  2.0470,
         -1.4191, -2.0231,  0.3889, -0.2735,  0.1379,  0.5786, -0.4416,  2.1631,
         -0.6613, -0.5515, -0.1823, -0.3288, -0.2831,  1.4633,  0.0908,  1.2914,
         -0.2179, -0.3454, -1.0150, -0.2527,  0.9187, -0.0833,  0.7888, -0.0844,
          1.3809,  0.1413,  0.9130,  0.3744,  0.7089,  0.8619,  0.1826,  0.1132,
         -0.0904,  0.3597, -0.3438, -1.3123, -0.1977,  0.3000, -0.9360,  0.7821,
         -1.1398,  0.1302, -0.4121,  0.1427, -0.3574, -0.8915,  0.2864,  0.1160,
         -0.0805, -0.6023, -0.6189,  0.2046,  0.2544], dtype=torch.float64),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 0]))

In [395]:
X.size(), y.size()

(torch.Size([64, 61]), torch.Size([64, 27]))

In [402]:
def train(model, loss_fn, dataloader, optimizer, epochs):

    for epoch in range(epochs):
        
        epoch_loss = 0

        for batch, (X, y) in enumerate(dataloader):
            
            optimizer.zero_grad()

            y_p = model(X.float())

            loss = loss_fn(y_p.float(), y.float())

            if np.random.uniform(0, 1) < 0.01:
                print(y[0].float(), y_p[0].float(), model(X.float())[0], loss.item())
            
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        print(f"Epoch {epoch} - loss: {epoch_loss/len(dataloader)}")

net = Net()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adagrad(params=net.parameters(), lr = 1e-3, lr_decay = 1e-5, weight_decay=1e-5)

train(net, loss_fn, trainloader, optimizer, 100)

Epoch 0 - loss: 3.294591471552849
Epoch 1 - loss: 3.291183188557625
Epoch 2 - loss: 3.2728959023952484
Epoch 3 - loss: 3.1912759989500046
Epoch 4 - loss: 3.1298499554395676
Epoch 5 - loss: 3.1113830357789993
Epoch 6 - loss: 3.103951781988144
Epoch 7 - loss: 3.0928209722042084
tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.]) tensor([6.5951e-04, 1.5937e-02, 1.8563e-04, 9.1542e-04, 1.7343e-03, 5.7758e-03,
        2.7016e-04, 6.4914e-01, 2.5943e-01, 3.8451e-04, 1.5999e-03, 1.0658e-04,
        4.1865e-03, 2.1503e-02, 3.0244e-03, 1.8397e-03, 9.9605e-05, 2.3898e-03,
        6.7627e-03, 7.3271e-04, 1.4461e-03, 7.8234e-03, 6.7100e-03, 9.5654e-04,
        5.5625e-03, 6.1749e-04, 2.0053e-04], grad_fn=<SelectBackward0>) tensor([6.5951e-04, 1.5937e-02, 1.8563e-04, 9.1542e-04, 1.7343e-03, 5.7758e-03,
        2.7016e-04, 6.4914e-01, 2.5943e-01, 3.8451e-04, 1.5999e-03, 1.0658e-04,
        4.1865e-03, 2.1503e-02, 3.0244e-03, 1.

In [365]:
v = [0, 1, 0, 0]
w = [0.3, 0.3, 0.2, 0.2]
x = torch.tensor(v).float()
y = torch.tensor(w).float()
nn.CrossEntropyLoss()(x, y)

tensor(1.4437)

In [335]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, input)
output

tensor(-4.5704, grad_fn=<DivBackward1>)