In [4]:

import os
os.listdir()
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data=pd.read_csv("/description.txt")
data

def load_data(file_path):
  with open(file_path, 'r', encoding='utf-8') as f:
    data = f.readlines()
  data = [line.strip().split(" ::: ") for line in data]
  return data

train_data = load_data("/train_data.txt")
train_df = pd.DataFrame(train_data, columns=['ID', 'Title', 'Genre', 'Description'])

test_data = load_data("/test_data.txt")
test_df = pd.DataFrame(test_data, columns=['ID', 'Title', 'Description'])

test_solution = load_data("/test_data_solution.txt")
test_solution_df = pd.DataFrame(test_solution, columns=['ID', 'Title', 'Genre', 'Description'])

print("Train Data:")
train_df
print("\nTest Data:")
test_data
print("\nTest Solution:")

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)

X_train_tfidf = vectorizer.fit_transform(train_df ["Description"])
X_test_tfidf = vectorizer.transform(test_df ["Description"])

print(f"Training data shape: {X_train_tfidf.shape}")
print(f"Test data shape: {X_test_tfidf.shape}")

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Genre'])
print(f"Unique genres in training data: {label_encoder.classes_}")

from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(max_iter=200)
lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)
predicted_genres = label_encoder.inverse_transform(y_pred)

test_df['Predicted_Genre'] = predicted_genres
test_df[['Title', 'Predicted_Genre']]

test_df['Predicted_Genre'] = predicted_genres
merged_df = pd.merge(test_solution_df[['ID', 'Genre']], test_df[['ID', 'Predicted_Genre']], on='ID')
merged_df

from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(merged_df['Genre'], merged_df['Predicted_Genre'])
print(f"Accuracy: {accuracy:.4f}")

print("\nClassification Report:")
print(classification_report(merged_df['Genre'], merged_df['Predicted_Genre']))

from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred_nb = nb_model.predict(X_test_tfidf)
predicted_genres_nb = label_encoder.inverse_transform(y_pred_nb)
test_df['Predicted_Genre_NB'] = predicted_genres_nb
merged_df_nb = pd.merge(test_solution_df, test_df[['ID', 'Predicted_Genre_NB']], on='ID')

from sklearn.metrics import  accuracy_score, classification_report

accuracy_nb = accuracy_score(merged_df_nb['Genre'], merged_df_nb['Predicted_Genre_NB'])
print(f"Naive Bayes Accuracy: {accuracy_nb}")

print("Naive Bayes Classification Report:")
print(classification_report(merged_df_nb['Genre'], merged_df_nb['Predicted_Genre_NB'], target_names=label_encoder.classes_))

from sklearn.svm import SVC
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)

y_pred_svm = svm_model.predict(X_test_tfidf)
predicted_genres_svm = label_encoder.inverse_transform(y_pred_svm)
test_df['Predicted_Genre_SVM'] = predicted_genres_svm
merged_df_svm = pd.merge(test_solution_df, test_df[['ID', 'Predicted_Genre_SVM']], on='ID')

from sklearn.metrics import accuracy_score, classification_report

accuracy_svm = accuracy_score(merged_df_svm['Genre'], merged_df_svm['Predicted_Genre_SVM'])
print(f"SVM Accuracy: {accuracy_svm}")

print("SVM Classification Report")
print(classification_report(merged_df_svm['Genre'], merged_df_svm['Predicted_Genre_SVM'], target_names=label_encoder.classes_))

zoner_Description = [
    'Explosive fight scenes in the city streets',
    'A haunted mansion that trap its visitors',
    'A brave adventurer is search of lost treasure',
    'A forbidden romance in the 1920s',
    'A daring rescue mission with a love interest'
]

test_data_tfidf = vectorizer.transform(zoner_Description)

y_pred_lr = lr_model.predict(test_data_tfidf)
predicted_genres_lr = label_encoder.inverse_transform(y_pred_lr)

y_pred_nb = nb_model.predict(test_data_tfidf)
predicted_genres_nb = label_encoder.inverse_transform(y_pred_nb)

y_pred_svm = svm_model.predict(test_data_tfidf)
predicted_genres_svm = label_encoder.inverse_transform(y_pred_svm)

print("Predicted Genres using Logistic Regression : ", predicted_genres_lr)
print("Predicted Genres using Naive Bayes         : ", predicted_genres_nb)
print("Predicted Genres using SVM                 : ", predicted_genres_svm)
print()
for i, message in enumerate(zoner_Description):
  print(f"Story : {message}")
  print(f"Status :\tNaive Bayes Prediction   : {predicted_genres_nb[i]}")
  print(f"\t\tLogistic Regression Prediction : {predicted_genres_lr[i]}")
  print(f"\t\tSVM Prediction                 : {predicted_genres_svm[i]}")
  print("="*100)





Train Data:

Test Data:

Test Solution:
Training data shape: (54214, 10000)
Test data shape: (54200, 10000)
Unique genres in training data: ['action' 'adult' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'game-show' 'history' 'horror'
 'music' 'musical' 'mystery' 'news' 'reality-tv' 'romance' 'sci-fi'
 'short' 'sport' 'talk-show' 'thriller' 'war' 'western']
Accuracy: 0.5945

Classification Report:
              precision    recall  f1-score   support

      action       0.51      0.30      0.37      1314
       adult       0.65      0.24      0.35       590
   adventure       0.67      0.16      0.25       775
   animation       0.61      0.04      0.08       498
   biography       0.00      0.00      0.00       264
      comedy       0.54      0.60      0.57      7446
       crime       0.41      0.03      0.06       505
 documentary       0.68      0.87      0.76     13096
       drama       0.55      0.78      0.65     13612
      fam