# Classification task

In [None]:
# installation of packages
!pip install nltk

In [None]:
# installation of NLTK data
import nltk

# 'punkt' / 'punkt_tab': models behind word_tokenize. Recent NLTK releases load
#     'punkt_tab' instead of the pickled 'punkt', so both are fetched to work
#     regardless of the installed version.
# 'wordnet' / 'omw-1.4': data for WordNetLemmatizer ('omw-1.4' is additionally
#     required by some NLTK releases when WordNet is loaded).
# 'stopwords': the English stop-word list.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

## Data pre-processing

In [None]:
# Loading data
import pandas as pd

# relative path of the CSV file holding the dataset
DATASET_PATH = '../data/dataset.csv'

df = pd.read_csv(DATASET_PATH)
# preview of the first rows
df.head()

In [None]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

# defining tokenizer which performs lemmatization and skips stop words and non-alphabetic tokens
class LemmaTokenizer:
    """Callable tokenizer: lemmatize tokens, drop stop words and non-alphabetic tokens.

    Instances are meant to be passed as ``tokenizer=`` to ``TfidfVectorizer``.
    A document is split with NLTK's ``word_tokenize``; every purely alphabetic
    token that is not an English stop word is replaced by its WordNet lemma.
    """

    def __init__(self):
        # A set gives O(1) membership tests for the per-token stop-word check.
        self.stops = set(stopwords.words('english'))
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmas kept for ``doc`` (one text string)."""
        lemmas = []
        for token in word_tokenize(doc):
            # Cheap filters first, so rejected tokens are never lemmatized.
            # The raw token must be checked against the stop list as well:
            # the lemmatizer can mangle a stop word into a string that is no
            # longer in the list (e.g. "was" -> "wa", "has" -> "ha"), which a
            # lemma-only check would let through.
            if not token.isalpha() or token in self.stops:
                continue
            lemma = self.wnl.lemmatize(token)  # computed once per token
            if lemma not in self.stops:
                lemmas.append(lemma)
        return lemmas
    
# defining TF-IDF vectorizer
# min_df=0.01 drops every term that appears in fewer than 1% of the summaries: such rare terms
# (the long tail described by Zipf's law) add sparse features on which the classifier can overfit
# token_pattern=None: the default regex is not used once a custom tokenizer is supplied, and
# leaving it set makes scikit-learn emit a UserWarning
vectorizer = TfidfVectorizer(tokenizer=LemmaTokenizer(), token_pattern=None, min_df=0.01)

In [None]:
# extracting the corpus (one summary per work) from the dataset
summaries = df['summary']
corpus = summaries.to_numpy()

# fitting the vectorizer on the corpus and getting the normalized TF-IDF weights
X = vectorizer.fit_transform(corpus)

# genres assigned to works
genres = df['genres']
Y = genres.to_numpy()

print(X.shape, Y.shape)