In [6]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to C:\Users\Sukhendu
[nltk_data]     chakraborty\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Sukhendu
[nltk_data]     chakraborty\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to C:\Users\Sukhendu
[nltk_data]     chakraborty\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [None]:
# Load Training Data
train_data = pd.read_csv("train_data.txt", sep=' ::: ', engine='python', names=['ID', 'Title', 'Genre', 'Description'])

test_data = pd.read_csv("test_data.txt", sep=' ::: ', engine='python', names=['ID', 'Title', 'Description'])

test_solution = pd.read_csv("test_data_solution.txt", sep=' ::: ', engine='python', names=['ID', 'Title', 'Genre', 'Description'])

print(f"Training on {train_data.shape[0]} movies.")
print(f"Testing on {test_data.shape[0]} movies.")

Training on 54214 movies.
Testing on 54200 movies.


In [None]:
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Lowercase all text
    text = text.lower()
    
    # Remove special characters, URLs, and mentions using Regex
    text = re.sub(r'@\S+', '', text) 
    text = re.sub(r'http\S+', '', text) 
    text = re.sub(r'[^a-zA-Z\s]', '', text) 
    

    words = nltk.word_tokenize(text)
    cleaned_words = [stemmer.stem(w) for w in words if w not in stop_words]
    
    return " ".join(cleaned_words)

In [None]:

train_data['Clean_Description'] = train_data['Description'].apply(clean_text)


test_data['Clean_Description'] = test_data['Description'].apply(clean_text)

# Check the difference (using train_data instead of data)
print("\nOriginal Text (Example):", train_data['Description'].iloc[0])
print("Cleaned Text (Example):", train_data['Clean_Description'].iloc[0])

Cleaning training data...
Cleaning test data...

Original Text (Example): Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.
Cleaned Text (Example): list convers doct par yearold osc learn nobody cour tel week liv fury refus speak anyon exceiv straighttalking ros lady pink meet hospit stair christmas approach ros us fantast expery profess wrestl imagin wit charm allow osc liv lif lov ful company friend pop corn einstein bacon child sweetheart peggy blu


In [10]:
# 1. Clean the text (using the function we defined earlier)
train_data['Clean_Description'] = train_data['Description'].apply(clean_text)
test_data['Clean_Description'] = test_data['Description'].apply(clean_text)

# 2. Fit TF-IDF on Training data only, then transform both
tfidf = TfidfVectorizer(max_features=5000)

X_train = tfidf.fit_transform(train_data['Clean_Description'])
y_train = train_data['Genre']

X_test = tfidf.transform(test_data['Clean_Description'])
y_test = test_solution['Genre'] # We take the true labels from the solution file

print("Data preparation complete.")

Data preparation complete.


In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete.")

Model training complete.


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 0.5100553505535055

Classification Report:

              precision    recall  f1-score   support

      action       0.60      0.07      0.13      1314
       adult       0.54      0.07      0.12       590
   adventure       0.75      0.06      0.11       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.40      0.45      7446
       crime       0.00      0.00      0.00       505
 documentary       0.55      0.86      0.67     13096
       drama       0.45      0.82      0.58     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.96      0.13      0.24       193
     history       0.00      0.00      0.00       243
      horror       0.71      0.32      0.44      2204
       music       0.83      0.08      0.15       731
     musical       0.00      0.00      0.00       276
     mystery       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
def predict_movie_genre(plot):
    
    cleaned_plot = clean_text(plot)
    vectorized_plot = tfidf.transform([cleaned_plot])
    
    #Predict
    prediction = model.predict(vectorized_plot)
    return prediction[0]

#DEMO
sample_plot = "A group of friends go to a haunted house and get chased by a ghost."
print(f"Test Plot: {sample_plot}")
print(f"Predicted Genre: {predict_movie_genre(sample_plot)}")

Test Plot: A group of friends go to a haunted house and get chased by a ghost.
Predicted Genre: horror
