In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec

# Column names for the fields that follow the leading numeric id; the id itself
# becomes the index via index_col=0.
columns = ['Movie Name', 'Genre', 'Description']

# The separator is a regex (hence engine='python'). The optional whitespace on
# both sides of ':::' is consumed as part of the separator so that titles and,
# more importantly, the Genre labels used as classification targets are stored
# without stray leading/trailing spaces.
# NOTE(review): the raw file appears to pad fields as ' ::: ' -- confirm against
# the data; if there is no padding this pattern behaves exactly like ':::'.
df = pd.read_csv(
    './Genre Classification Dataset/train_data.txt',
    sep=r'\s*:::\s*',
    engine='python',
    names=columns,
    index_col=0,
)
df.head()


Unnamed: 0,Movie Name,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [2]:
# Only hit the network when the stop-word corpus is not already installed, so a
# Restart & Run All works offline and does not print download logs every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# A set gives constant-time membership checks inside preprocess_text.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise a raw description into lower-cased, stop-word-free text.

    Every character that is not a letter, digit or underscore is replaced by a
    space, the result is lower-cased and split on whitespace, and tokens found
    in the module-level ``stop_words`` set are discarded.

    Args:
        text: Raw description. Any value that is not a ``str`` (for example a
            NaN standing in for a missing description) is treated as empty
            instead of raising inside ``re.sub``.

    Returns:
        str: The surviving tokens joined by single spaces, or ``''`` when
        nothing survives.
    """
    if not isinstance(text, str):
        return ''
    # split() with no arguments already collapses runs of whitespace, so no
    # separate whitespace-squeezing pass is needed after the substitution.
    tokens = re.sub(r'\W', ' ', text).lower().split()
    return ' '.join(token for token in tokens if token not in stop_words)

# Run the text normaliser over every description, element by element, and keep
# the result next to the raw text.
df['cleaned_description'] = df['Description'].map(preprocess_text)

# Side-by-side preview of raw vs. cleaned text.
df.loc[:, ['Description', 'cleaned_description']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Description,cleaned_description
1,Listening in to a conversation between his do...,listening conversation doctor parents 10 year ...
2,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...
3,As the bus empties the students for their fie...,bus empties students field trip museum natural...
4,To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...
5,The film's title refers not only to the un-re...,film title refers un recovered bodies ground z...


In [3]:
# One token list per cleaned description: the training corpus for Word2Vec.
sentences = list(map(str.split, df['cleaned_description']))

# min_count=1 keeps every token in the vocabulary, including ones seen once.
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def get_sentence_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Args:
        sentence: Whitespace-separated text.
        model: Object exposing ``wv``, which must support ``word in wv``,
            ``wv[word]`` and a ``vector_size`` attribute (e.g. a trained
            gensim ``Word2Vec`` model).

    Returns:
        numpy.ndarray: 1-D vector of length ``model.wv.vector_size``. When no
        word of the sentence is in the vocabulary (including an empty
        sentence) a zero vector of that same length is returned.
    """
    keyed_vectors = model.wv
    word_vectors = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not word_vectors:
        # Size the fallback from the model instead of a hard-coded 100 so the
        # result always stacks with real embeddings, whatever dimension the
        # model was trained with.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(word_vectors, axis=0)

# Store one averaged Word2Vec vector per cleaned description.
df['embedding'] = df['cleaned_description'].apply(get_sentence_vector, model=word2vec_model)

# Preview the cleaned text alongside its embedding.
df.loc[:, ['cleaned_description', 'embedding']].head()


Unnamed: 0,cleaned_description,embedding
1,listening conversation doctor parents 10 year ...,"[-0.22700901, 0.42165866, 0.28614032, -0.08872..."
2,brother sister past incestuous relationship cu...,"[-0.07480178, 0.09665694, 0.35352898, -0.22687..."
3,bus empties students field trip museum natural...,"[-0.34999132, 0.31772104, 0.16926265, -0.03500..."
4,help unemployed father make ends meet edith tw...,"[-0.31947488, 0.32419622, 0.2662369, -0.240247..."
5,film title refers un recovered bodies ground z...,"[-0.22302276, 0.4126873, 0.2550957, 0.14733137..."


In [4]:
# Stack the per-row embedding vectors into a single 2-D feature matrix.
X = np.vstack(df['embedding'].to_numpy())
y = df['Genre']

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Report the shape of every partition as a sanity check.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)


X_train shape: (25656, 100)
X_test shape: (6414, 100)
y_train shape: (25656,)
y_test shape: (6414,)


In [5]:
# Fit a linear-kernel SVM on the averaged Word2Vec embeddings.
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = svm_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
# Per-genre breakdown: overall accuracy alone hides genres the model never
# predicts. Precision is undefined for such genres; zero_division=0 reports it
# as 0.0 explicitly instead of emitting an undefined-metric warning.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5054568132210789
Classification Report:
                precision    recall  f1-score   support

      action        0.29      0.07      0.12       137
       adult        0.00      0.00      0.00        69
   adventure        0.00      0.00      0.00        95
   animation        0.00      0.00      0.00        58
   biography        0.00      0.00      0.00        33
      comedy        0.41      0.48      0.44       894
       crime        0.00      0.00      0.00        64
 documentary        0.62      0.85      0.72      1563
       drama        0.47      0.75      0.58      1604
      family        0.00      0.00      0.00        98
     fantasy        0.00      0.00      0.00        41
   game-show        0.46      0.61      0.52        18
     history        0.00      0.00      0.00        15
      horror        0.36      0.27      0.31       239
       music        0.51      0.43      0.47        76
     musical        0.00      0.00      0.00        29
     myster

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
