# Importing libraries and dataset

In [1]:
# imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score


In [2]:
# loading and saving data in variables

# Load the train dataset
train_path = r"C:\Users\excel\Downloads\archive (1)\Genre-Classification-Dataset/train_data.txt"
train_data = pd.read_csv(train_path, sep=":::", names=['id','title', 'genre', 'description'], engine="python")
#train_data.head()

# Load the test dataset
test_path = r"C:\Users\excel\Downloads\archive (1)\Genre-Classification-Dataset/test_data.txt"
test_data = pd.read_csv(test_path, sep=":::", names=['id', 'title', 'description'], engine="python")
#test_data.head()

# Load the test solutions
test_solutions_path = r"C:\Users\excel\Downloads\archive (1)\Genre-Classification-Dataset/test_data_solution.txt"
test_solutions = pd.read_csv(test_solutions_path, sep=":::", names=['id', 'title', 'genre', 'description'], engine="python")
#test_solutions.head()

In [3]:
#!pip install ntlk

In [4]:
# preprocessing the decription text for faster processing:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\excel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\excel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\excel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\excel\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Preprocessing:

In [6]:
# Function to clean text: remove stopwords, punctuation and lemmatize
def clean_txt(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    
    # Tokenize the text into words
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    # Remove stopwords and words with length less than 3
    filtered_words = [word for word in words if word not in stop_words and len(word) > 2]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    # Join the lemmatized words back into a string
    clean_text = " ".join(lemmatized_words)
    
    return clean_text



train_data['clean_Description'] = train_data['description'].apply(clean_txt)
test_data['clean_test_Desciption'] = test_data['description'].apply(clean_txt)

In [7]:
train_data

Unnamed: 0,id,title,genre,description,clean_Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...,listening conversation doctor parent 10yearold...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...,bus empty student field trip museum natural hi...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...,help unemployed father make end meet edith twi...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...,film title refers unrecovered body ground zero...
...,...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...,shortlived nbc live sitcom centered bonino wor...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...,next generation exploitation sister kapa bay s...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g...",bestaan echt standup comedy growing facing fea...
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...,walter vivian live country difficult time keep...


In [8]:
# Vectorization (TF-IDF):
vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(train_data['clean_Description'])
X_test = vectorizer.transform(test_data['clean_test_Desciption'])

In [22]:
# train-test split form the training dataset:
X = X_train
y = train_data['genre']

# we don't need a train test split, since we already have distinguished training data and testing data

# Model training

In [23]:
# SVM Model
svm_model = SVC(kernel='linear', C=1, probability=True, class_weight='balanced')
svm_model.fit(X_train, y_train)

# Model Evaluation

In [24]:
print(f"y_test data type: {test_solutions.dtypes}")
print(f"y_pred data type: {type(y_pred)}")

y_test = test_solutions['genre']
print(f"y_test data type: {type(y_test)}")


y_test data type: id              int64
title          object
genre          object
description    object
dtype: object
y_pred data type: <class 'numpy.ndarray'>
y_test data type: <class 'pandas.core.series.Series'>


In [25]:
# Predictions
y_pred = svm_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5500

Classification Report:
                precision    recall  f1-score   support

      action        0.32      0.52      0.40      1314
       adult        0.49      0.54      0.51       590
   adventure        0.32      0.34      0.33       775
   animation        0.34      0.19      0.24       498
   biography        0.08      0.01      0.02       264
      comedy        0.54      0.58      0.56      7446
       crime        0.19      0.22      0.21       505
 documentary        0.75      0.71      0.73     13096
       drama        0.66      0.54      0.59     13612
      family        0.24      0.26      0.25       783
     fantasy        0.28      0.13      0.18       322
   game-show        0.79      0.63      0.70       193
     history        0.12      0.04      0.06       243
      horror        0.58      0.66      0.62      2204
       music        0.41      0.71      0.52       731
     musical        0.24      0.17      0.20       276
     mystery        0.