# MOVIE GENRE

In [None]:
# IPython shell escape: upgrade pip itself before installing any packages.
!python -m pip install --upgrade pip


In [None]:
!pip install nltk
!pip install string
!pip install re

In [None]:
import nltk
import string
import re

import pandas as pd
# import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, classification_report

# Data Collection and Processing
IMDb began as a movie database on the Usenet group "rec.arts.movies" in 1990 and moved to the web in 1993.

- No. of Attributes: 4
- No. of Instances: 54214

In [None]:
# Training data, ':::'-delimited (regex separators need the python engine).
# The file is expected to carry four fields per record, in line with the
# "No. of Attributes: 4" note and with the Id column of the test file. With
# only three names pandas silently turned the leading ID field into the index,
# so all four fields are named explicitly here.
# NOTE(review): confirm the field order against the raw file.
train_data = pd.read_csv(
    '/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt',
    sep=':::',
    names=['Id', 'Movie_title', 'Genre', 'Description'],
    engine='python',
)

In [None]:
# Descriptive statistics for the training frame's columns.
train_data.describe()

In [None]:
# (rows, columns) of the training frame.
train_data.shape

In [None]:
# Column dtypes, non-null counts and memory usage of the training frame.
train_data.info()

In [None]:
# Preview the first 10 training rows.
train_data.head(10)

In [None]:
# Number of missing values in each training column.
train_data.isnull().sum()

In [None]:
# Test data: read with the same ':::' separator as the training file, but
# with an Id column and no Genre column.
test_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt"
test_data = pd.read_csv(
    test_path,
    names=['Id', 'Movie_title', 'Description'],
    sep=':::',
    engine='python',
)
# Preview the first rows of the loaded test frame.
test_data.head()

# Exploratory Data Analysis (EDA) & Data Visualization

In [None]:
# Horizontal count plot of training rows per genre, most frequent genre first.
plt.figure(figsize=(10, 10))
sns.countplot(
    y="Genre",
    data=train_data,
    order=train_data["Genre"].value_counts().index,
)
plt.title("Genre Distribution")

# Data Preprocessing and Text Cleaning

In [None]:
# Module-level NLP helpers: a Lancaster stemmer and NLTK's English stop-word list.
# NOTE(review): neither name is referenced by the visible code in this
# notebook; clean_text looks the stop words up itself. Confirm whether
# stemming was meant to be applied before removing these.
# Presumably the NLTK 'stopwords' corpus must already be available in the
# environment for this line to succeed; verify.
stemmer = LancasterStemmer()
stop_words=stopwords.words("english")

# Patterns used by clean_text, compiled once and applied in this order.
_HANDLE_RE = re.compile(r'@\S+')             # '@' up to the next whitespace
_URL_RE = re.compile(r'http\S+')             # http/https URLs
_PIC_LINK_RE = re.compile(r'pic\.\S+')       # "pic.<host>/..." links (literal dot)
_NON_LETTER_RE = re.compile(r"[^a-zA-Z+']")  # anything but letters, '+' and "'"
_LONE_LETTER_RE = re.compile(r'\s+[a-zA-Z]\s+')  # single letters between spaces
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# English stop words, loaded lazily on first use and then reused.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize a free-text description into space-separated keyword tokens.

    Steps, in order: lowercase; drop ``@...`` runs (handles, and the part of
    an e-mail address from ``@`` onwards); drop ``http...`` URLs and
    ``pic.<...>`` links; replace every character that is not a letter, ``+``
    or ``'`` with a space; drop isolated single letters; strip the remaining
    punctuation; tokenize with NLTK; keep only tokens longer than two
    characters that are not English stop words.

    Args:
        text: The raw description. Anything that is not a ``str`` (for
            example the ``NaN`` pandas uses for a missing value) is treated
            as empty text instead of raising.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives the filtering or the input is not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _HANDLE_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # The dot is escaped on purpose: an unescaped 'pic.' also matched ordinary
    # words such as "picture" or "epic battle" and deleted them.
    text = _PIC_LINK_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    # The trailing space lets a single letter at the very end match as well.
    text = _LONE_LETTER_RE.sub(' ', text + ' ')
    # Apostrophes and '+' survived the previous step; remove them here.
    text = text.translate(_PUNCT_TABLE)

    stop_set = _english_stopwords()
    # Tokens contain no whitespace, so joining on one space already yields a
    # normalized string; no further whitespace squeezing is required.
    return " ".join(
        token
        for token in nltk.word_tokenize(text)
        if token not in stop_set and len(token) > 2
    )

# Store the cleaned form of every description in a new 'New_text' column,
# for both the training frame and the test frame.
train_data['New_text'] = train_data['Description'].map(clean_text)
test_data['New_text'] = test_data['Description'].map(clean_text)

In [None]:
# Preview the test frame, now including the cleaned 'New_text' column.
test_data.head()

In [None]:
# Turn the cleaned descriptions into TF-IDF feature matrices.
# The vectorizer is fitted on the whole training frame, so its vocabulary and
# IDF weights also reflect the rows that are later held out for validation.
# The test descriptions are only transformed with the already-fitted vectorizer.
# NOTE: X_train here covers every training row; the name is rebound to the
# 80% split in the train/validation cell further down.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(train_data['New_text'])
X_test = tfidf.transform(test_data['New_text'])

In [None]:
# Split the TF-IDF features and genre labels into training (80%) and
# validation (20%) sets; random_state=0 keeps the split reproducible.
# NOTE(review): X_train is rebound to the training split here, so running this
# cell a second time without re-running the TF-IDF cell would take X from the
# already-split matrix while y still has one label per original row.
X = X_train
y = train_data['Genre']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Model Training & Evaluation

In [None]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training split.
nb = MultinomialNB()
nb.fit(X_train, y_train)

In [None]:
# Predict genres for the held-out validation rows.
y_pred = nb.predict(X_val)

In [None]:
# Share of validation rows whose predicted genre equals the true genre.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Validation Accuracy:", accuracy)

In [None]:
# Per-genre precision, recall, F1 and support on the validation split.
print(classification_report(y_val, y_pred))

# Predictions on Test Dataset

In [None]:
# Predict a genre for every test row with the fitted Naive Bayes model and
# store the predictions in a new 'Predicted_Genre' column of test_data.
X_test_predictions = nb.predict(X_test)
test_data['Predicted_Genre'] = X_test_predictions

In [None]:
# Write the test frame, including the predicted genres, to
# 'predicted_genres.csv' in the working directory; index=False leaves the
# row index out of the file.
test_data.to_csv('predicted_genres.csv', index=False)

# Print the test frame with the predicted genres.
print(test_data)