In [1]:
# Bag-of-Words (BoW) + Logistic Regression (LR) for movie genre prediction (multi-label text classification)
# Data: the "IMDB Movies Analysis" dataset from Kaggle (kaggle.com)

In [2]:
# Importing python libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier

# Data collection, feature selection and cleaning in one pipeline:
#   1. read the raw CSV 'movies_initial.csv',
#   2. keep only the label column ('genre') and the text column ('fullplot'),
#   3. discard every row in which either of those two values is missing.
data = (
    pd.read_csv('movies_initial.csv')
    .loc[:, ['genre', 'fullplot']]
    .dropna(subset=['genre', 'fullplot'])
)

print(data)

                          genre  \
0            Documentary, Short   
1                         Short   
2      Animation, Comedy, Short   
3            Documentary, Short   
4            Documentary, Short   
...                         ...   
46004                     Drama   
46006               Documentary   
46007                    Horror   
46008  Comedy, Fantasy, Romance   
46010                    Sci-Fi   

                                                fullplot  
0      Performing on what looks like a small wooden s...  
1      A stationary camera looks at a large anvil wit...  
2      One night, Arlequin come to see his lover Colo...  
3      A man (Edison's assistant) takes a pinch of sn...  
4      A man opens the big gates to the Lumi�re facto...  
...                                                  ...  
46004  A post modern theater adaptation of a classic ...  
46006  Musician Jonny Greenwood travels to Rajasthan,...  
46007  A cash strapped student who starts workin

In [3]:

# --- Train / dev / test partition ---------------------------------------------
# Hold out 20% of the rows as the test set, then carve 10% of the remaining
# rows off as the dev set. Both splits use the same fixed seed.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=111)
train_data, dev_data = train_test_split(train_data, test_size=0.1, random_state=111)

# --- Bag-of-Words features ----------------------------------------------------
# The vocabulary is learned from the training plots only; the dev and test
# plots are encoded with that already-fitted vocabulary.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_data['fullplot'])
X_dev, X_test = (
    vectorizer.transform(part['fullplot']) for part in (dev_data, test_data)
)


# --- Multi-label targets ------------------------------------------------------
def _genre_lists(frame):
    """Return the 'genre' column of `frame` with each entry split on ', ' into a list."""
    return frame['genre'].apply(lambda genres: genres.split(', '))


# The binarizer is fitted on the training genres and reused for dev and test,
# turning every list of genre names into one row of a binary indicator matrix.
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(_genre_lists(train_data))
y_dev = mlb.transform(_genre_lists(dev_data))
y_test = mlb.transform(_genre_lists(test_data))

print(train_data, dev_data, test_data)

                           genre  \
19237                   Thriller   
9466                       Drama   
41672                   Thriller   
35707             Comedy, Family   
2660      Drama, Romance, Comedy   
...                          ...   
30807  Horror, Mystery, Thriller   
16825                     Comedy   
19354    Horror, Mystery, Sci-Fi   
27418     Comedy, Drama, Romance   
41530       Action, Crime, Drama   

                                                fullplot  
19237  Sydney Banks learns the hard way that being dr...  
9466   Peter Falk is a blue collar man trying to deal...  
41672  A political thriller during the Wounded Knee i...  
35707  Third grader Judy Moody sets out to have the m...  
2660      Sea-going roustabout falls for meek librarian.  
...                                                  ...  
30807  A young woman, Marnie Watson, is granted early...  
16825  An ancient Greek philosopher's comical vision ...  
19354  On the Harrington High School

In [4]:
# Initialize and train the MultiOutputClassifier with Logistic Regression as the
# base estimator (one classifier is fitted per genre column of y_train).
model = MultiOutputClassifier(LogisticRegression(max_iter=10000))
model.fit(X_train, y_train)


def _evaluate_split(split_name, X_split, y_true):
    """Predict on one data split and print its accuracy and classification report.

    Parameters
    ----------
    split_name : str
        Name shown in the report header (e.g. "Dev" or "Test").
    X_split
        Feature matrix for the split, as produced by the fitted vectorizer.
    y_true
        Binary indicator matrix of the true genres for the split.

    Returns
    -------
    tuple
        ``(y_hat, accuracy)``: the predicted indicator matrix and the accuracy
        score computed against ``y_true``.
    """
    y_hat = model.predict(X_split)

    # NOTE: for multi-label indicator targets accuracy_score is *subset*
    # accuracy: a sample only counts as correct when every genre matches, so
    # this number is much stricter than the per-genre scores in the report.
    accuracy = accuracy_score(y_true, y_hat)
    print(f"Accuracy: {accuracy:.2f}")

    # zero_division=0 keeps the reported value (0.0) for genres that are never
    # predicted or have no true samples, but suppresses the warning that
    # scikit-learn otherwise emits for each such undefined metric.
    print(f"\nClassification Report for {split_name}:")
    print(classification_report(y_true, y_hat,
                                target_names=mlb.classes_,
                                zero_division=0))
    return y_hat, accuracy


# Evaluate on the dev set first, then on the held-out test set. After this cell
# `y_pred` and `accuracy` hold the test-set values, as before.
y_pred, accuracy = _evaluate_split("Dev", X_dev, y_dev)
y_pred, accuracy = _evaluate_split("Test", X_test, y_test)

Accuracy: 0.19

Classification Report for Dev:
              precision    recall  f1-score   support

      Action       0.72      0.48      0.58       468
       Adult       0.00      0.00      0.00         1
   Adventure       0.57      0.35      0.43       306
   Animation       0.63      0.35      0.45       133
   Biography       0.56      0.26      0.36       153
      Comedy       0.64      0.57      0.61      1013
       Crime       0.62      0.43      0.51       421
 Documentary       0.81      0.59      0.68       279
       Drama       0.72      0.69      0.71      1690
      Family       0.48      0.26      0.34       182
     Fantasy       0.45      0.21      0.29       170
   Film-Noir       0.25      0.03      0.06        32
     History       0.39      0.14      0.21       126
      Horror       0.76      0.51      0.61       357
       Music       0.58      0.33      0.42        98
     Musical       0.46      0.13      0.20        99
     Mystery       0.50      0.31 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
