In [1]:
import pandas as pd

# Load the dataset
train_data = pd.read_csv('/content/train_data.txt', sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")

test_data = pd.read_csv('/content/test_data.txt', sep=":::", names=["TITLE", "GENRE", "DESCRIPTION"], engine="python")

print(train_data.head())


                                TITLE       GENRE  \
1       Oscar et la dame rose (2009)       drama    
2                       Cupid (1997)    thriller    
3   Young, Wild and Wonderful (1980)       adult    
4              The Secret Sin (1915)       drama    
5             The Unrecovered (2007)       drama    

                                         DESCRIPTION  
1   Listening in to a conversation between his do...  
2   A brother and sister with a past incestuous r...  
3   As the bus empties the students for their fie...  
4   To help their unemployed father make ends mee...  
5   The film's title refers not only to the un-re...  


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [9]:
"""Dropping missing values"""

# Drop rows with missing values
train_data.dropna()

"""Splitting the dataset"""

X_train, X_test, y_train, y_test = train_test_split(train_data['DESCRIPTION'], train_data['GENRE'], test_size=0.2, random_state=42)

In [10]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [11]:
"""Vectorization"""

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [12]:
# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train_encoded)


In [13]:


"""Evaluation of model"""

y_pred = nb_classifier.predict(X_test_tfidf)

# Decode numerical predictions to original genre labels
y_pred_decoded = label_encoder.inverse_transform(y_pred)



In [14]:
"""Checking out accuracy"""

accuracy = accuracy_score(y_test, y_pred_decoded)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.46


In [15]:
"""Report

"""

# Display classification report
print(classification_report(y_test, y_pred_decoded))

               precision    recall  f1-score   support

      action        0.00      0.00      0.00        58
       adult        0.00      0.00      0.00        21
   adventure        0.00      0.00      0.00        31
   animation        0.00      0.00      0.00        13
   biography        0.00      0.00      0.00        10
      comedy        0.54      0.24      0.33       267
       crime        0.00      0.00      0.00        22
 documentary        0.54      0.85      0.66       477
       drama        0.39      0.86      0.54       472
      family        0.00      0.00      0.00        30
     fantasy        0.00      0.00      0.00        15
   game-show        0.00      0.00      0.00         5
     history        0.00      0.00      0.00         8
      horror        0.67      0.03      0.05        72
       music        0.00      0.00      0.00        16
     musical        0.00      0.00      0.00         7
     mystery        0.00      0.00      0.00        11
        n

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
