In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load your dataset (replace 'your_dataset.csv' with your actual file)
data = pd.read_csv('IMDB-Movie-Data.csv')

In [22]:
data.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [23]:
data.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')

In [24]:
# Drop the specified columns
columns_to_drop = ["Rank", "Director", "Actors", "Year", "Runtime (Minutes)", "Rating", "Votes", "Revenue (Millions)", "Metascore"]
data = data.drop(columns=columns_to_drop)

data.head()
data.columns


Index(['Title', 'Genre', 'Description'], dtype='object')

In [25]:
# Combine movie name and description into a single feature
data['text'] = data['Title'] + ' ' + data['Description']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['Genre'], test_size=0.2, random_state=42)

# Vectorize the text features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vec, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.06


In [26]:
# Encode labels - Fit on all unique labels to avoid unseen labels in test set
all_labels = pd.concat([y_train, y_test]).unique() # Combine and get unique labels
label_encoder = LabelEncoder()
label_encoder.fit(all_labels) # Fit on all unique labels
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Create a pipeline with TfidfVectorizer and MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', MultinomialNB())
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Function to predict genre
def predict_genre(text):
    prediction = pipeline.predict([text])
    genre = label_encoder.inverse_transform(prediction)[0]
    return genre

In [27]:
# Create dropdown for movie selection
movie_dropdown = widgets.Dropdown(
    options=df['Title'].tolist(),
    description='Movie:',
    disabled=False,
)

# Create text area for description (initially empty)
description_text = widgets.Textarea(
    value='',
    placeholder='Movie description will appear here',
    description='Description:',
    disabled=True
)

# Create button to trigger prediction
predict_button = widgets.Button(
    description='Predict Genre',
    disabled=False,
    button_style='info',
    tooltip='Click to predict genre',
    icon='search'
)

# Function to handle movie selection change
def on_movie_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        selected_movie = change['new']
        description = df[df['Title'] == selected_movie]['Description'].iloc[0]
        description_text.value = description

# Function to handle prediction button click
def on_predict_click(b):
    text = movie_dropdown.value + ' ' + description_text.value
    predicted_genre = predict_genre(text)
    print(f"Predicted Genre: {predicted_genre}")

# Register event handlers
movie_dropdown.observe(on_movie_change)
predict_button.on_click(on_predict_click)

# Display widgets
display(movie_dropdown, description_text, predict_button)


Accuracy: 0.06


Dropdown(description='Movie:', options=('Guardians of the Galaxy', 'Prometheus', 'Split', 'Sing', 'Suicide Squ…

Textarea(value='', description='Description:', disabled=True, placeholder='Movie description will appear here'…

Button(button_style='info', description='Predict Genre', icon='search', style=ButtonStyle(), tooltip='Click to…

Predicted Genre: Drama
