In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd

# Load the train/test files; fields within each record are delimited by ':::'.
# A separator longer than one character is interpreted by pandas as a regular
# expression and is only supported by the Python parser, so that engine is
# requested explicitly instead of relying on the implicit fallback (whose
# ParserWarning is hidden by the warnings filter above).
# NOTE(review): the preview shows an extra leading column used as the index, so
# each row presumably carries one more field than `names` lists — confirm.
# NOTE(review): if the fields are padded with spaces around ':::', values such
# as the genre labels keep that padding — verify and strip if so.
train_data = pd.read_csv(
    '../Codes - Datasets/train_data.txt',
    sep=':::',
    names=['movie', 'genre', 'plot'],
    engine='python',
)
test_data = pd.read_csv(
    '../Codes - Datasets/test_data.txt',
    sep=':::',
    names=['movie', 'plot'],
    engine='python',
)
train_data.head()

Unnamed: 0,movie,genre,plot
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [2]:
import re

def clean_text(text):
    """Normalize a piece of text for bag-of-words vectorization.

    Lower-cases the text, drops every character that is not an ASCII letter,
    a digit or whitespace, and collapses whitespace runs to a single space.

    Args:
        text: Raw text. ``None`` and float NaN (how pandas represents a
            missing field) are treated as empty text; any other non-string
            value is converted with ``str`` first.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Missing values would otherwise crash on `.lower()`; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    # Remove special characters and punctuation (non-ASCII letters go too).
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # Collapse runs of whitespace and trim both ends.
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Add the normalized plot text to both frames.
train_data['clean_plot'] = train_data['plot'].apply(clean_text)
test_data['clean_plot'] = test_data['plot'].apply(clean_text)

# Only the final expression of a cell is rendered automatically, so the
# training preview is shown explicitly with display(); otherwise it would be
# evaluated and silently discarded.
display(train_data[['movie', 'genre', 'clean_plot']].head(10))
test_data[['movie', 'clean_plot']].head(10)

Unnamed: 0,movie,clean_plot
1,Edgar's Lunch (1998),lr brane loves his life his car his apartment ...
2,La guerra de papá (1977),spain march 1964 quico is a very naughty child...
3,Off the Beaten Track (2010),one year in the life of albin and his family o...
4,Meu Amigo Hindu (2015),his father has died he hasnt spoken with his b...
5,Er nu zhai (1955),before he was known internationally as a marti...
6,Riddle Room (2016),emily burns is being held captive in a room wi...
7,L'amica (1969),the beautiful but neglected wife of a brillian...
8,Ina Mina Dika (1989),vasu inamdar ina suffers from a disorder where...
9,Equinox Special: Britain's Tornados (2005),an insight into the tornados that hit kensal r...
10,Press (2011),press is a story of young people overwhelmed b...


In [3]:
# TF-IDF vectorization of the cleaned plot text (features, not labels)
from sklearn.feature_extraction.text import TfidfVectorizer

# Text columns that feed the vectorizer.
train_plots = train_data['clean_plot']
test_plots = test_data['clean_plot']

# TF-IDF features: English stop words removed, vocabulary capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# The vectorizer is fitted on the training plots only; the test plots are
# transformed with that fitted vectorizer.
X_train = vectorizer.fit_transform(train_plots)
X_test = vectorizer.transform(test_plots)

In [4]:
from sklearn.preprocessing import LabelEncoder

# Map each genre string to an integer class id; the fitted encoder is kept so
# predictions can be mapped back to genre names later.
genre_labels = train_data['genre']
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(genre_labels)

In [None]:
#model training
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights balanced across genres and a raised
# iteration cap.
logreg_params = {'class_weight': 'balanced', 'max_iter': 1000}
model = LogisticRegression(**logreg_params)

# Fit on every training row; as the last expression of the cell, the fitted
# estimator is also rendered as the cell output.
model.fit(X_train, y_train)

In [None]:
#prediction on test_data
# Encoded class ids predicted for the test features.
y_pred = model.predict(X_test)

# Decode the ids back to genre names and store them as a new column.
test_data['predicted_genre'] = predicted_genres = label_encoder.inverse_transform(y_pred)

In [None]:
# Show the first rows of the test set next to their predicted genre.
preview_columns = ['movie', 'plot', 'predicted_genre']
test_data[preview_columns].head()

In [None]:
# Write the title, plot and predicted genre of every test row to a CSV file,
# leaving out the DataFrame index.
export_columns = ['movie', 'plot', 'predicted_genre']
test_data[export_columns].to_csv('predicted_genres.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the training rows for validation, stratified by class id and
# seeded so the split is reproducible.
# NOTE(review): X_train comes from a vectorizer fitted on all training rows,
# so the validation rows contributed to the TF-IDF vocabulary/weights — the
# score below may be slightly optimistic.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Train on 80% using a separate estimator with the same hyper-parameters.
# Re-fitting `model` itself would silently replace the estimator trained on all
# rows, leaving it out of sync with the predictions already stored from it.
val_model = LogisticRegression(**model.get_params())
val_model.fit(X_train_split, y_train_split)

# Predict on the 20% validation set
y_val_pred = val_model.predict(X_val_split)

# Accuracy and per-genre report
acc = accuracy_score(y_val_split, y_val_pred)
print(f"Validation Accuracy: {acc:.2f}")

print("\nClassification Report:")
print(classification_report(y_val_split, y_val_pred, target_names=label_encoder.classes_))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Multinomial Naive Bayes fitted on the training portion of the split
# (fit returns the estimator itself).
nb_model = MultinomialNB().fit(X_train_split, y_train_split)

# Score the held-out validation rows.
y_val_pred_nb = nb_model.predict(X_val_split)
nb_acc = accuracy_score(y_val_split, y_val_pred_nb)

# Accuracy first, then the per-genre report, one item per line.
print(
    f"Naive Bayes Validation Accuracy: {nb_acc:.2f}",
    "\nNaive Bayes Classification Report:",
    classification_report(y_val_split, y_val_pred_nb, target_names=label_encoder.classes_),
    sep="\n",
)


In [None]:
from sklearn.svm import LinearSVC

# Linear SVM with class weights balanced across genres and a raised iteration
# cap. random_state is pinned (same seed as the train/validation split) so the
# solver's internal shuffling, and therefore the reported scores, are
# reproducible from run to run.
svm_model = LinearSVC(class_weight='balanced', max_iter=10000, random_state=42)
svm_model.fit(X_train_split, y_train_split)

# Predict on validation data
y_val_pred_svm = svm_model.predict(X_val_split)

# Evaluate: overall accuracy followed by the per-genre report
svm_acc = accuracy_score(y_val_split, y_val_pred_svm)
print(f"SVM Validation Accuracy: {svm_acc:.2f}")
print("\nSVM Classification Report:")
print(classification_report(y_val_split, y_val_pred_svm, target_names=label_encoder.classes_))


In [None]:
# Encoded test-set predictions from the Naive Bayes and SVM models.
# NOTE(review): both models were fitted on X_train_split (the 80% portion) —
# consider refitting on all of X_train before producing final predictions.
y_pred_nb_test, y_pred_svm_test = nb_model.predict(X_test), svm_model.predict(X_test)

# Decode the class ids to genre names, one column per model.
test_data['predicted_genre_nb'] = label_encoder.inverse_transform(y_pred_nb_test)
test_data['predicted_genre_svm'] = label_encoder.inverse_transform(y_pred_svm_test)

# Preview the three models' predictions side by side (nothing is saved here).
comparison_columns = ['movie', 'plot', 'predicted_genre', 'predicted_genre_nb', 'predicted_genre_svm']
test_data[comparison_columns].head()
