In [None]:
# Install the Kaggle library
!pip install kaggle

# Upload your Kaggle API token (kaggle.json file)
from google.colab import files
files.upload()

# Move the kaggle.json file to the .kaggle folder
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset from Kaggle
!kaggle datasets download -d hijest/genre-classification-dataset-imdb

# Unzip the dataset
!unzip genre-classification-dataset-imdb.zip


In [None]:
import pandas as pd

# Load the training dataset
df_train = pd.read_csv('/content/Genre Classification Dataset/train_data.txt', delimiter=':::', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])

# Load the test dataset (for later evaluation)
df_test = pd.read_csv('/content/Genre Classification Dataset/test_data.txt', delimiter=':::', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])

# Check the structure of the training dataset
df_train.head()


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split the dataset into features (X) and labels (y)
X = df_train['DESCRIPTION']
y = df_train['GENRE']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer (convert text to numeric form)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit and transform the training data, transform validation data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)


In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train the Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred_nb = nb_model.predict(X_val_tfidf)

# Evaluate the model
print("Naive Bayes Accuracy:", accuracy_score(y_val, y_pred_nb))
print("Classification Report:\n", classification_report(y_val, y_pred_nb))


Naive Bayes Accuracy: 0.5231946878170248
Classification Report:
                precision    recall  f1-score   support

      action        0.58      0.08      0.14       263
       adult        0.88      0.06      0.12       112
   adventure        0.29      0.03      0.05       139
   animation        0.00      0.00      0.00       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.44      0.47      1443
       crime        0.00      0.00      0.00       107
 documentary        0.58      0.88      0.70      2659
       drama        0.46      0.83      0.59      2697
      family        1.00      0.01      0.01       150
     fantasy        0.00      0.00      0.00        74
   game-show        1.00      0.15      0.26        40
     history        0.00      0.00      0.00        45
      horror        0.73      0.36      0.48       431
       music        0.77      0.12      0.20       144
     musical        0.00      0.00      0.00        50

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
from sklearn.linear_model import LogisticRegression

# Train the Logistic Regression classifier
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred_lr = lr_model.predict(X_val_tfidf)

# Evaluate the model
print("Logistic Regression Accuracy:", accuracy_score(y_val, y_pred_lr))
print("Classification Report:\n", classification_report(y_val, y_pred_lr))


Logistic Regression Accuracy: 0.5794521811306834
Classification Report:
                precision    recall  f1-score   support

      action        0.53      0.27      0.35       263
       adult        0.71      0.21      0.33       112
   adventure        0.42      0.14      0.21       139
   animation        0.61      0.11      0.18       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.58      0.55      1443
       crime        0.43      0.03      0.05       107
 documentary        0.66      0.85      0.74      2659
       drama        0.54      0.78      0.64      2697
      family        0.41      0.07      0.12       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.94      0.42      0.59        40
     history        0.00      0.00      0.00        45
      horror        0.63      0.56      0.59       431
       music        0.63      0.47      0.54       144
     musical        1.00      0.02      0.04  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [9]:
from sklearn.svm import SVC

# Train the SVM classifier
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

# Make predictions on the validation set
y_pred_svm = svm_model.predict(X_val_tfidf)

# Evaluate the model
print("SVM Accuracy:", accuracy_score(y_val, y_pred_svm))
print("Classification Report:\n", classification_report(y_val, y_pred_svm))


SVM Accuracy: 0.5853546066586738
Classification Report:
                precision    recall  f1-score   support

      action        0.49      0.36      0.41       263
       adult        0.67      0.39      0.49       112
   adventure        0.44      0.22      0.29       139
   animation        0.54      0.14      0.23       104
   biography        0.00      0.00      0.00        61
      comedy        0.52      0.60      0.56      1443
       crime        0.17      0.02      0.03       107
 documentary        0.68      0.83      0.74      2659
       drama        0.55      0.75      0.64      2697
      family        0.28      0.07      0.12       150
     fantasy        0.17      0.01      0.03        74
   game-show        0.92      0.55      0.69        40
     history        0.00      0.00      0.00        45
      horror        0.64      0.61      0.62       431
       music        0.63      0.53      0.58       144
     musical        0.60      0.06      0.11        50
     my

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
def predict_genre(user_input):
    # Transform the user's input with the TF-IDF vectorizer
    user_input_tfidf = tfidf_vectorizer.transform([user_input])

    # Predictions from the models
    nb_prediction = nb_model.predict(user_input_tfidf)
    lr_prediction = lr_model.predict(user_input_tfidf)
    svm_prediction = svm_model.predict(user_input_tfidf)

    # Output predictions from all models
    print(f"Naive Bayes Prediction: {nb_prediction[0]}")
    print(f"Logistic Regression Prediction: {lr_prediction[0]}")
    print(f"SVM Prediction: {svm_prediction[0]}")

# Example of calling the function
movie_description = input("Enter a movie description: ")
predict_genre(movie_description)


Enter a movie description: A superhero battles evil forces to save the world.
Naive Bayes Prediction:  drama 
Logistic Regression Prediction:  horror 
SVM Prediction:  fantasy 
