<a href="https://colab.research.google.com/github/rashmika-ks/CODSOFT/blob/main/ML_Task1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas scikit-learn nltk



In [2]:
import pandas as pd
import re

def load_train_data(filepath):
    """Parse a labelled ``ID ::: TITLE ::: GENRE ::: DESCRIPTION`` text file.

    Each line is stripped and split on the literal separator ``' ::: '`` at
    most three times, so a description that itself contains the separator is
    kept whole instead of causing the record to be discarded.

    Args:
        filepath: Path to a UTF-8 encoded text file, one record per line.

    Returns:
        pandas.DataFrame with columns ``ID`` (int), ``Title``, ``Genre`` and
        ``Description``. Lines with fewer than four fields (including blank
        lines) are skipped; an input with no valid lines yields an empty frame
        that still has these four columns.

    Raises:
        ValueError: If the first field of a four-field line is not an integer.
        OSError: If the file cannot be opened.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than buffered in full first.
        for line in f:
            # maxsplit=3: anything after the third separator is description text.
            parts = line.strip().split(' ::: ', 3)
            if len(parts) != 4:
                continue
            movie_id, title, genre, description = parts
            records.append((int(movie_id), title, genre, description))
    return pd.DataFrame(records, columns=["ID", "Title", "Genre", "Description"])

def load_test_data(filepath):
    """Parse an unlabelled ``ID ::: TITLE ::: DESCRIPTION`` text file.

    Each line is stripped and split on the literal separator ``' ::: '`` at
    most twice, so a description that itself contains the separator is kept
    whole instead of causing the record to be discarded. Consequently any
    text after the second separator (including further separators) is treated
    as part of the description.

    Args:
        filepath: Path to a UTF-8 encoded text file, one record per line.

    Returns:
        pandas.DataFrame with columns ``ID`` (int), ``Title`` and
        ``Description``. Lines with fewer than three fields (including blank
        lines) are skipped; an input with no valid lines yields an empty frame
        that still has these three columns.

    Raises:
        ValueError: If the first field of a three-field line is not an integer.
        OSError: If the file cannot be opened.
    """
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        # Iterate the handle directly so the file is streamed line by line
        # rather than buffered in full first.
        for line in f:
            # maxsplit=2: anything after the second separator is description text.
            parts = line.strip().split(' ::: ', 2)
            if len(parts) != 3:
                continue
            movie_id, title, description = parts
            records.append((int(movie_id), title, description))
    return pd.DataFrame(records, columns=["ID", "Title", "Description"])

# Parse the labelled and unlabelled splits; both paths are relative to the
# notebook's working directory.
train_df = load_train_data(filepath='train_data.txt')
test_df = load_test_data(filepath='test_data.txt')


In [3]:
from sklearn.preprocessing import LabelEncoder

# Map genre names to integer class ids; the fitted encoder is kept so that
# predictions can be decoded back into genre names later.
label_encoder = LabelEncoder()
train_df['GenreEncoded'] = label_encoder.fit_transform(train_df['Genre'])

# The model input is the title and the description joined by a single space,
# built the same way for the training and the test frame.
for frame in (train_df, test_df):
    frame['Text'] = frame['Title'] + ' ' + frame['Description']


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, English stop words removed, vocabulary
# capped at 10,000 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
)

# Training target: the integer-encoded genre.
y_train = train_df['GenreEncoded']

# The vocabulary is fitted on the training text only; the test text is
# projected onto that same vocabulary.
X_train = tfidf.fit_transform(train_df['Text'])
X_test = tfidf.transform(test_df['Text'])


In [5]:
from sklearn.linear_model import LogisticRegression

# Logistic regression over the TF-IDF features, allowing up to 1000 solver
# iterations.
model = LogisticRegression(max_iter=1000)
# Left as the cell's final expression so the fitted estimator is displayed.
model.fit(X=X_train, y=y_train)


In [6]:
# Predict integer class ids for the test features ...
predictions_encoded = model.predict(X_test)
# ... then decode them to genre names and attach them to the test frame.
test_df['PredictedGenre'] = predicted_genres = label_encoder.inverse_transform(
    predictions_encoded
)

In [11]:
# Columns in the order they appear in each exported line.
output = test_df[['ID', 'Title', 'PredictedGenre', 'Description']]

# Build one "ID ::: Title ::: Genre ::: Description" string per row from plain
# tuples. Unlike DataFrame.apply(axis=1), this does not hand back the frame
# itself when there are no rows (which would make the join below emit the
# column names), and it avoids constructing a Series for every row.
output_formatted = pd.Series(
    [
        f"{movie_id} ::: {title} ::: {genre} ::: {description}"
        for movie_id, title, genre, description in output.itertuples(index=False, name=None)
    ],
    index=output.index,
    dtype=object,
)

# One record per line, no trailing newline (same layout as before).
with open('predicted_test_data.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(output_formatted))


In [13]:
import joblib

# Persist the three fitted objects that inference needs: the classifier, the
# TF-IDF vectorizer and the label encoder.
joblib.dump(value=model, filename='genre_classifier_model.pkl')
joblib.dump(value=tfidf, filename='tfidf_vectorizer.pkl')
# Final expression of the cell, so its return value is what the cell displays.
joblib.dump(value=label_encoder, filename='label_encoder.pkl')

['label_encoder.pkl']

In [14]:
import joblib

# Restore the persisted classifier, vectorizer and label encoder, rebinding
# the notebook-level names that predict_genre reads.
# NOTE: joblib files are pickle-based; only load files you created or trust.
model = joblib.load(filename='genre_classifier_model.pkl')
tfidf = joblib.load(filename='tfidf_vectorizer.pkl')
label_encoder = joblib.load(filename='label_encoder.pkl')

def predict_genre(title, description):
    """Return the predicted genre name for one movie.

    The title and description are joined with a single space, vectorized with
    the notebook-level ``tfidf``, classified with the notebook-level ``model``
    and decoded with the notebook-level ``label_encoder``.

    Args:
        title: Movie title as a string.
        description: Plot summary as a string.

    Returns:
        The decoded genre label for the single input.
    """
    combined_text = ' '.join([title, description])
    # The vectorizer expects an iterable of documents, hence the one-item list.
    encoded_predictions = model.predict(tfidf.transform([combined_text]))
    first_label = encoded_predictions[0]
    return label_encoder.inverse_transform([first_label])[0]

In [15]:
# Ask for a title and a plot summary on stdin, then print the model's guess.
title = input("Enter movie title: ")
description = input("Enter movie plot summary: ")
print("Predicted Genre:", predict_genre(title=title, description=description))


Enter movie title: Mini-Skirt Love (1967)
Enter movie plot summary: Billy is a teenager and enthusiastic photo hobbyist who takes pictures of his mother carrying on with another man, but is too naive to realize his mother is engaging in more than innocent horseplay. When he innocently shows his pictures to Daddy, he unwittingly starts a melee that ends in Daddy's accidental death and Mommy's mental collapse and confinement in a hospital. Enter Aunt Janet, Daddy's sister, to take care of Billy. Supposedly a frigid woman who "can't stand men", Aunt Janet is nevertheless loving and affectionate with young Billy, and when she sees the fatal snapshots, she even takes their relationship to the next level. But can their happy life together survive the return from hospital of a still-disturbed Mommy?
Predicted Genre: drama


In [16]:
# Interactive check: read a title and a plot summary from stdin and print the
# genre that predict_genre returns for them.
title = input("Enter movie title: ")
description = input("Enter movie plot summary: ")
print("Predicted Genre:", predict_genre(title=title, description=description))


Enter movie title: Something About Ryan (2009)
Enter movie plot summary: "Something About Ryan" chronicles the misadventures of Ryan Nuzzolo, a live-at-home slacker from Staten Island, who doesn't want to grow up and move out of his parent's house. Ryan's lifestyle includes hanging in front of the TV, attending a community college, and partying all the time with his best friend and neighbor, Anthony. The problem is that Ryan doesn't look for trouble-it has a habit of finding him.
Predicted Genre: comedy
