In [4]:
%pip install nltk

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
from nltk.corpus import stopwords
import re



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\sahil\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [5]:
nltk.download('stopwords')

# Load train and test data
train_df = pd.read_csv(r'archive/Genre Classification Dataset/train_data.txt', sep=':::', header=None, names=['ID', 'Title', 'Genre', 'Description'], engine='python')
test_df = pd.read_csv(r'archive/Genre Classification Dataset/test_data.txt', sep=':::', header=None, names=['ID', 'Title', 'Description'], engine='python')



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Clean text function
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)



In [7]:
# Apply cleaning to the data
train_df['Description'] = train_df['Description'].apply(clean_text)
test_df['Description'] = test_df['Description'].apply(clean_text)

# Handle missing values
train_df['Genre'].fillna('unknown', inplace=True)
train_df = train_df[train_df['Description'].str.strip().notna()]
test_df = test_df[test_df['Description'].str.strip().notna()]

# Print shapes of the cleaned data
print("Train Data Shape After Cleaning:", train_df.shape)
print("Test Data Shape After Cleaning:", test_df.shape)



Train Data Shape After Cleaning: (54214, 4)
Test Data Shape After Cleaning: (54200, 3)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['Genre'].fillna('unknown', inplace=True)


In [8]:
# Prepare data for training
X_train = train_df['Description']
y_train = train_df['Genre'].str.strip()

# Transform text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)



In [10]:
# Train the model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Prepare and predict on the test data
X_test = test_df['Description'].tolist()
X_test_tfidf = tfidf.transform(X_test)
predictions = model.predict(X_test_tfidf)

# Add predictions to the test DataFrame
test_df['Predicted Genre'] = predictions

# Display results
print(test_df[['ID', 'Title', 'Predicted Genre']].head())


   ID                          Title Predicted Genre
0   1          Edgar's Lunch (1998)            drama
1   2      La guerra de papá (1977)            drama
2   3   Off the Beaten Track (2010)      documentary
3   4        Meu Amigo Hindu (2015)            drama
4   5             Er nu zhai (1955)            drama
