<a href="https://colab.research.google.com/github/sidharkal/-movie-genre-prediction/blob/main/movie_genre_prediction_with_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate



In [2]:
from huggingface_hub import login
login(token="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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
from datasets import load_dataset
data = load_dataset("datadrivenscience/movie-genre-prediction")
data



  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 54000
    })
    test: Dataset({
        features: ['id', 'movie_name', 'synopsis', 'genre'],
        num_rows: 36000
    })
})

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import confusion_matrix
import datasets
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
from transformers import create_optimizer
import tensorflow as tf
from transformers.keras_callbacks import KerasMetricCallback
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [5]:
train = pd.DataFrame(data['train'])
test = pd.DataFrame(data['test'])
train.head()

Unnamed: 0,id,movie_name,synopsis,genre
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy
1,50185,Entity Project,A director and her friends renting a haunted h...,horror
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54000 entries, 0 to 53999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54000 non-null  int64 
 1   movie_name  54000 non-null  object
 2   synopsis    54000 non-null  object
 3   genre       54000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [7]:
train.shape,test.shape

((54000, 4), (36000, 4))

In [8]:
label2id  = {'fantasy':0, 'horror':1, 'family':2, 'scifi':3, 'action':4, 'crime':4,
           'adventure':5, 'mystery':6, 'romance':7, 'thriller':8}

id2label  = {value: key for key, value in label2id.items()}

id2label

{0: 'fantasy',
 1: 'horror',
 2: 'family',
 3: 'scifi',
 4: 'crime',
 5: 'adventure',
 6: 'mystery',
 7: 'romance',
 8: 'thriller'}

In [9]:
class_train = [ label2id[i] for i in list(data['train']['genre'])]
class_test = [ label2id[i] for i in list(data['test']['genre'])]

In [10]:
train['data'] = train['movie_name'] + str(':') + train['synopsis']
train['label'] =  class_train

test['data'] = test['movie_name'] + str(':') + test['synopsis']
test['label'] =  class_test

In [11]:
train.head()

Unnamed: 0,id,movie_name,synopsis,genre,data,label
0,44978,Super Me,A young scriptwriter starts bringing valuable ...,fantasy,Super Me:A young scriptwriter starts bringing ...,0
1,50185,Entity Project,A director and her friends renting a haunted h...,horror,Entity Project:A director and her friends rent...,1
2,34131,Behavioral Family Therapy for Serious Psychiat...,This is an educational video for families and ...,family,Behavioral Family Therapy for Serious Psychiat...,2
3,78522,Blood Glacier,Scientists working in the Austrian Alps discov...,scifi,Blood Glacier:Scientists working in the Austri...,3
4,2206,Apat na anino,Buy Day - Four Men Widely - Apart in Life - By...,action,Apat na anino:Buy Day - Four Men Widely - Apar...,4


In [12]:
train.shape

(54000, 6)

In [13]:
X_train, X_val, y_train, y_val = train_test_split(train['data'], train['label'], test_size=0.33, random_state=42,stratify =train['label'])

In [14]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

d = {'train':Dataset.from_dict({'label':y_train,'message':X_train}),
     'Val':Dataset.from_dict({'label':y_val,'message':X_val}),
     'test':Dataset.from_dict({'label':test['label'],'message':test['data']})
     }

dataset_huggingface = DatasetDict(d)
dataset_huggingface

DatasetDict({
    train: Dataset({
        features: ['label', 'message'],
        num_rows: 36180
    })
    Val: Dataset({
        features: ['label', 'message'],
        num_rows: 17820
    })
    test: Dataset({
        features: ['label', 'message'],
        num_rows: 36000
    })
})

In [15]:
dataset_huggingface["test"][0]

{'label': 4,
 'message': "A Death Sentence:12 y.o. Ida's dad'll die without a DKK1,500,000 operation. Ida plans to steal the money from the bank, her mom installed alarm systems in. She'll need her climbing skills, her 2 friends and 3 go-karts."}

In [16]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [17]:
def preprocess_function(examples):
    return tokenizer(examples["message"], truncation=True)

In [18]:
tokenized_data = dataset_huggingface.map(preprocess_function, batched=True)

Map:   0%|          | 0/36180 [00:00<?, ? examples/s]

Map:   0%|          | 0/17820 [00:00<?, ? examples/s]

Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [20]:
accuracy = evaluate.load("accuracy")

In [21]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }

In [22]:
batch_size = 64
num_epochs = 5
batches_per_epoch = len(tokenized_data["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [23]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                             num_labels=9, id2label=id2label, label2id=label2id)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [24]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_data["train"],
    shuffle=True,
    batch_size=64,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_data["Val"],
    shuffle=False,
    batch_size=64,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
model.compile(optimizer=optimizer)

In [26]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

callbacks = [metric_callback]

In [27]:
history = model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)

Epoch 1/2
Epoch 2/2


In [28]:
tf_test_set = model.prepare_tf_dataset(
    tokenized_data["test"],
    shuffle=False,
    batch_size=64,
    collate_fn=data_collator,
)

In [29]:
prediction =  model.predict(tf_test_set)



In [30]:
prediction[0].shape

(36000, 9)

In [31]:
predict = []
for i in range(test.shape[0]):
  max = np.argmax(prediction[0][i])
  predict.append(id2label[max])

In [32]:
test.head()

Unnamed: 0,id,movie_name,synopsis,genre,data,label
0,16863,A Death Sentence,"12 y.o. Ida's dad'll die without a DKK1,500,00...",action,A Death Sentence:12 y.o. Ida's dad'll die with...,4
1,48456,Intermedio,A group of four teenage friends become trapped...,action,Intermedio:A group of four teenage friends bec...,4
2,41383,30 Chua Phai Tet,A guy left his home for 12 years till he came ...,action,30 Chua Phai Tet:A guy left his home for 12 ye...,4
3,84007,Paranoiac,A man long believed dead returns to the family...,action,Paranoiac:A man long believed dead returns to ...,4
4,40269,Ordinary Happiness,"After a deadly accident, Paolo comes back on E...",action,"Ordinary Happiness:After a deadly accident, Pa...",4


In [33]:
solution = pd.DataFrame({'id':test['id'],'genre':predict})
solution.head()

Unnamed: 0,id,genre
0,16863,family
1,48456,horror
2,41383,family
3,84007,mystery
4,40269,scifi


In [34]:
solution.to_csv("solution.csv",index=False)