<a href="https://colab.research.google.com/github/shravyagulaigari/Codsoft-task-1/blob/main/Codsoft_Task_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1. Install necessary libraries
!pip install -q pandas scikit-learn gradio nltk

# 2. Import libraries
import logging
import os
import re

import gradio as gr
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# 3. Unzip the dataset (Assumes archive (5).zip is uploaded)
print("Unzipping archive...")
!unzip -q "archive (5).zip" -d .

# 4. Define paths and load the training data
BASE_DIR = 'Genre Classification Dataset/'
train_path = os.path.join(BASE_DIR, 'train_data.txt')

# The data uses ':::' as a separator and has no header row
LOAD_PARAMS = {'sep': ':::', 'engine': 'python', 'header': None}
train_df = pd.read_csv(train_path, **LOAD_PARAMS)
train_df.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']

# Display the first few rows and check data size
print(f"\nTraining Data Loaded. Shape: {train_df.shape}")
print(train_df.head(2))

# Use the DESCRIPTION and GENRE columns for training
X_train = train_df['DESCRIPTION']
y_train = train_df['GENRE']

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Unzipping archive...

Training Data Loaded. Shape: (54214, 4)
   ID                           TITLE       GENRE  \
0   1   Oscar et la dame rose (2009)       drama    
1   2                   Cupid (1997)    thriller    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  


In [2]:
# --- Preprocessing Parameters ---
# Created once at module level; clean_text() reads both names on every call.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Cleans and processes a single plot summary text.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, remove English stop words, and lemmatize the remaining tokens.

    Args:
        text: The raw plot summary. Values that are not strings (for example
            None, or the float NaN pandas uses for a missing cell) are treated
            as an empty summary instead of raising AttributeError.

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when nothing survives or the input was not a string.
    """
    # Missing values are not strings and have no .lower(); map them to "".
    if not isinstance(text, str):
        return ""

    # Lowercase, then remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Tokenize on whitespace, drop stop words, and lemmatize in a single pass
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]

    return " ".join(tokens)

# Run the cleaning pipeline over every plot summary
X_train_cleaned = X_train.map(clean_text)
print("\nPlot summaries cleaned and preprocessed.")


# --- Multi-Label Setup ---
# Every GENRE cell is split on commas and each piece is whitespace-trimmed,
# giving one list of genre names per movie (a one-element list when the cell
# holds a single genre).
y_train_list = y_train.map(
    lambda raw_genres: [genre.strip() for genre in raw_genres.split(',')]
)

# Convert the per-movie genre lists into a binary indicator matrix
mlb = MultiLabelBinarizer()
y_train_binarized = mlb.fit_transform(y_train_list)

genre_count = len(mlb.classes_)
print(f"Total Unique Genres Found: {genre_count}")
print(f"Binarized Genres Shape: {y_train_binarized.shape}")


Plot summaries cleaned and preprocessed.
Total Unique Genres Found: 27
Binarized Genres Shape: (54214, 27)


In [3]:
# --- Feature Extraction: TF-IDF ---
# Unigrams and bigrams, with the vocabulary limited to 25,000 features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=25000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_cleaned)

print(f"TF-IDF Matrix Shape (Movies x Features): {X_train_tfidf.shape}")


# --- Model Training: Logistic Regression (OvR) ---
# The logistic-regression estimator is wrapped in a one-vs-rest classifier
genre_estimator = LogisticRegression(C=5.0, solver='liblinear', random_state=42)
classifier = OneVsRestClassifier(genre_estimator)

print("\nStarting Model Training (Logistic Regression with OvR)...")
classifier.fit(X_train_tfidf, y_train_binarized)
print("Model Training Complete!")

TF-IDF Matrix Shape (Movies x Features): (54214, 25000)

Starting Model Training (Logistic Regression with OvR)...
Model Training Complete!


In [4]:
def predict_genre(plot_summary: str) -> dict:
    """
    Takes a plot summary and predicts the top 5 movie genres with confidence scores.

    Args:
        plot_summary: Free-text plot description entered by the user.

    Returns:
        A ``{label: score}`` dictionary suitable for Gradio's gr.Label output:
        - the five highest-scoring genres with their scores as floats, or
        - ``{"Please enter a plot summary.": 0}`` when the input is empty or
          contains only whitespace, or
        - ``{"Prediction Error": 0}`` when preprocessing or inference fails
          (the exception is logged with its traceback rather than raised).
    """
    # Reject blank input, including whitespace-only text, which would otherwise
    # be cleaned to an empty document and still be scored.
    if not plot_summary or (isinstance(plot_summary, str) and not plot_summary.strip()):
        return {"Please enter a plot summary.": 0}

    try:
        # 1. Preprocess the text and vectorize it with the fitted TF-IDF vocabulary
        cleaned_input = clean_text(plot_summary)
        input_tfidf = tfidf_vectorizer.transform([cleaned_input])

        # 2. Per-genre scores for the single input row.
        # NOTE(review): the classifier was fitted on a binary indicator matrix, so
        # these look like independent per-genre scores that need not sum to 1 - confirm.
        probs = classifier.predict_proba(input_tfidf)[0]

        # 3. Rank genres by score (highest first)
        ranked = sorted(zip(mlb.classes_, probs), key=lambda item: item[1], reverse=True)

        # 4. Keep the top 5 and format for Gradio Label output {label: confidence};
        # float() turns each score into a plain Python float.
        return {label: float(prob) for label, prob in ranked[:5]}

    except Exception:
        # Best-effort UI callback: keep returning the fallback label, but record
        # the failure with its traceback instead of silently discarding it.
        logging.exception("Genre prediction failed")
        return {"Prediction Error": 0}

In [5]:
# --- Define Example Plots ---
# Sample plot summaries; each one is wrapped in its own single-element list below.
_example_plots = (
    "A retired CIA agent hunts down the men who kidnapped his daughter in Paris, using his 'very particular set of skills'.",
    "Two slackers spend their summer debating philosophy, questioning consumerism, and eventually finding love in a small Texas town.",
    "A historical documentary exploring the rise and fall of the Roman Empire through recently discovered archaeological evidence and expert interviews.",
    "A detective with a dark past investigates a series of brutal murders, which appear to be connected to a sinister, ancient cult.",
)
examples = [[plot] for plot in _example_plots]

# --- Create the Gradio Interface with Enhanced UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="Movie Genre Predictor") as demo:
    # The clapper-board emoji in the heading is written as the escape \U0001F3AC
    # so it stays intact even if this file is saved or read with the wrong
    # text encoding (it had previously been garbled into mojibake).
    gr.Markdown(
        """
        # \U0001F3AC Movie Genre Predictor
        Enter a plot summary below and the machine learning model will predict the **top 5 most likely genres** based on text analysis of the plot summary.
        """
    )

    with gr.Row():
        # Input Component (larger textbox)
        plot_input = gr.Textbox(
            lines=8,
            label="Enter Movie Plot Summary",
            placeholder="e.g., A young wizard attends a magical school and discovers his destiny to fight an evil sorcerer."
        )

        # Output Component (Label for displaying probabilities)
        genre_output = gr.Label(
            label="Top 5 Predicted Genres (Confidence Score)",
            num_top_classes=5
        )

    # Button to trigger prediction: passes the textbox content to predict_genre
    # and renders the returned {label: score} dictionary in the Label component
    predict_button = gr.Button("Predict Genre")
    predict_button.click(fn=predict_genre, inputs=plot_input, outputs=genre_output)

    # Examples below the main components
    gr.Examples(
        examples=examples,
        inputs=plot_input,
        label="Try these example plots:"
    )

# Launch the interface
# setting share=True generates a public, temporary link for external access
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c82ad25d2e0495820d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


