In [1]:
import pandas as pd
import nltk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score

**DATASET**

In [2]:
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:  
        for line in file:
            parts = line.strip().split(' ::: ')
            data.append(parts)
    return data

train_data = load_data('train_data.txt')
test_data = load_data('test_data.txt')

train_df = pd.DataFrame(train_data, columns=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
test_df = pd.DataFrame(test_data, columns=['ID', 'TITLE', 'DESCRIPTION'])


In [3]:
train_df

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-rec...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on B...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The siste...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about gr..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and have...


**PREPROCESSING**

In [4]:
nltk.download('punkt')

# Clean text
def clean_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    text = [word for word in text if word.isalpha()]
    return " ".join(text)

train_df['clean_description'] = train_df['DESCRIPTION'].apply(clean_text)
test_df['clean_description'] = test_df['DESCRIPTION'].apply(clean_text)

# Encode genres
mlb = MultiLabelBinarizer()
train_df['GENRE'] = train_df['GENRE'].apply(lambda x: x.split('|'))
y_train = mlb.fit_transform(train_df['GENRE'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prath\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**FEATURE EXTRACTION**

In [5]:
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df['clean_description'])
X_test = tfidf.transform(test_df['clean_description'])

In [6]:
print(X_train)

  (0, 490)	0.1403776056840205
  (0, 4374)	0.15598773758116588
  (0, 744)	0.11520399741596749
  (0, 3373)	0.13924142542631313
  (0, 1838)	0.08349781824443517
  (0, 3098)	0.02660054763728637
  (0, 889)	0.10980504512696239
  (0, 1850)	0.11215488707796836
  (0, 2702)	0.07233637462768372
  (0, 2634)	0.05776587607896355
  (0, 147)	0.1441610450575966
  (0, 728)	0.1610409452005096
  (0, 4930)	0.17106368004051503
  (0, 2228)	0.14549123234929123
  (0, 3466)	0.12416015513794433
  (0, 1595)	0.11873751804425668
  (0, 2090)	0.10536503712178738
  (0, 4737)	0.12625362710167598
  (0, 234)	0.15381966702980837
  (0, 762)	0.13757709299914903
  (0, 273)	0.09095939418512568
  (0, 2162)	0.12314889195528972
  (0, 3119)	0.04260225352875185
  (0, 2834)	0.09538799635037257
  (0, 3318)	0.17645537317345292
  :	:
  (54213, 3009)	0.1964837748476788
  (54213, 1552)	0.15108494104893466
  (54213, 1557)	0.1505292985120582
  (54213, 4229)	0.2022046707870239
  (54213, 2397)	0.12529450466729458
  (54213, 2126)	0.1861890794

**MODEL TRAINING**

In [7]:
# Naive Bayes
nb = MultiOutputClassifier(MultinomialNB())
nb.fit(X_train, y_train)

In [8]:
# Logistic Regression
lr = MultiOutputClassifier(LogisticRegression())
lr.fit(X_train, y_train)

In [9]:
# Naive Bayes
y_pred_nb = nb.predict(X_train)
print("Naive Bayes Classification Report:")
print(classification_report(y_train, y_pred_nb))
print("Accuracy:", accuracy_score(y_train, y_pred_nb))

# Logistic Regression
y_pred_lr = lr.predict(X_train)
print("Logistic Regression Classification Report:")
print(classification_report(y_train, y_pred_lr))
print("Accuracy:", accuracy_score(y_train, y_pred_lr))

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1315
           1       0.20      0.00      0.00       590
           2       0.80      0.01      0.01       775
           3       0.00      0.00      0.00       498
           4       0.00      0.00      0.00       265
           5       0.72      0.07      0.13      7447
           6       0.00      0.00      0.00       505
           7       0.74      0.65      0.69     13096
           8       0.68      0.39      0.49     13613
           9       0.00      0.00      0.00       784
          10       0.00      0.00      0.00       323
          11       1.00      0.03      0.06       194
          12       0.00      0.00      0.00       243
          13       0.92      0.05      0.09      2204
          14       0.86      0.08      0.14       731
          15       0.00      0.00      0.00       277
          16       0.00      0.00      0.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.2741911683329029
Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.08      0.14      1315
           1       0.80      0.11      0.19       590
           2       0.87      0.06      0.12       775
           3       0.88      0.01      0.03       498
           4       0.00      0.00      0.00       265
           5       0.82      0.34      0.48      7447
           6       0.67      0.00      0.01       505
           7       0.84      0.73      0.78     13096
           8       0.75      0.51      0.61     13613
           9       1.00      0.02      0.04       784
          10       0.00      0.00      0.00       323
          11       0.96      0.34      0.50       194
          12       0.00      0.00      0.00       243
          13       0.89      0.35      0.51      2204
          14       0.85      0.30      0.45       731
          15       1.00      0.00      0.01       277
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Predict genres for test data using the best performing model
y_pred_test = lr.predict(X_test)  # Assuming Logistic Regression performed best

# Convert predictions back to genre labels
y_pred_test_labels = mlb.inverse_transform(y_pred_test)
test_df['PREDICTED_GENRE'] = y_pred_test_labels

# Save predictions to a CSV file
test_df[['ID', 'TITLE', 'PREDICTED_GENRE']].to_csv('test_predictions.csv', index=False)