In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
import pandas as pd

folder = 'Genre Classification Dataset'
train_file = 'train_data.txt'
test_file = 'test_data_solution.txt'

def get_dataframe(file_name):
    data = []

    with open(os.path.join(folder, file_name)) as f:

        lines = f.readlines()

        for idx, line in enumerate(lines):
            
            row = line.split(' ::: ')

            try:
                title = row[1].split(' (')[0]
                year = row[1].split(' (')[1].split(')')[0]
                genre = row[2]
                description = row[3]
                data.append([title, year, genre, description])
            except Exception as exp:
                print(exp)

    return pd.DataFrame(data, columns=['title', 'year', 'genre', 'description'])

df_train = get_dataframe(train_file)
df_test = get_dataframe(test_file)

In [4]:
print(f"Size of training dataset: {len(df_train)}, test dataset: {len(df_test)}")
df_train.head()

Size of training dataset: 54214, test dataset: 54200


Unnamed: 0,title,year,genre,description
0,Oscar et la dame rose,2009,drama,Listening in to a conversation between his doc...
1,Cupid,1997,thriller,A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful",1980,adult,As the bus empties the students for their fiel...
3,The Secret Sin,1915,drama,To help their unemployed father make ends meet...
4,The Unrecovered,2007,drama,The film's title refers not only to the un-rec...


In [5]:
df_test.head()

Unnamed: 0,title,year,genre,description
0,Edgar's Lunch,1998,thriller,"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá,1977,comedy,"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track,2010,documentary,One year in the life of Albin and his family o...
3,Meu Amigo Hindu,2015,drama,"His father has died, he hasn't spoken with his..."
4,Er nu zhai,1955,drama,Before he was known internationally as a marti...


In [6]:
input_features = ['title', 'year', 'description']
output_features = ['genre']

X_train = df_train[input_features]
y_train = df_train[output_features]

Plan for the model classifying genres based on title, year, description.

1. Vectorize the data:
    - genre2vec (straightforward)
    - title2vec (complex)
    - decription2vec (complex)
2. Build 2 models and unittest them:
    - ready-to-use model from a popular library
    - implement LSTM with PyTorch
    - implement a model using transformers ^^

3. Train & debug models:
    - ready-to-use: 0-small # of bugs expected
    - LSTM: moderate # of bugs expected
    - Transformers: high # of bugs expected

4. Test models

In [16]:
import numpy as np

np.unique(df_train['genre']) == np.unique(df_test['genre'])

num_df_train = df_train.copy()
num_df_test = df_test.copy()

In [25]:
from sklearn.preprocessing import OneHotEncoder

genre_encoder = OneHotEncoder()

genre_encoder.fit(df_train['genre'].to_numpy().reshape(-1, 1))

num_df_train['genre'] = genre_encoder.transform(df_train['genre'].to_numpy().reshape(-1, 1))
num_df_test['genre'] = genre_encoder.transform(df_test['genre'].to_numpy().reshape(-1, 1))

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [13]:
np.unique(num_df_train['genre'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26])

In [46]:
genre_encoder = OneHotEncoder(sparse_output=False)
x = genre_encoder.fit_transform(np.expand_dims(df_train['genre'], axis = 1))

TypeError: OneHotEncoder.__init__() got an unexpected keyword argument 'sparse_output'

In [45]:
x[0]

<1x27 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [48]:
import sklearn
sklearn.__version__

'1.1.3'

In [51]:
!pip install -U sklearn

Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25ldone
[?25h  Created wheel for sklearn: filename=sklearn-0.0.post1-py3-none-any.whl size=2935 sha256=3fcee337527343f6e83abc58e92244a9ffc59995b8de40adcb507f66488852bb
  Stored in directory: /Users/pawelnarkiewicz/Library/Caches/pip/wheels/6f/25/c5/e20b5ae82502257329c72904a6507fe4cfb6fd1cfbcb4c4420
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post1
