Create a machine learning model that can predict the genre of a
movie based on its plot summary or other textual information. You
can use techniques like TF-IDF or word embeddings with classifiers such as Naive Bayes, Logistic Regression, or Support Vector
Machines.

In [28]:
import os 
os.listdir()

['Genre Classification Dataset', 'Model.ipynb', 'Model2.ipynb', '__pycache__']

In [29]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
data=pd.read_csv("Genre Classification Dataset/description.txt")
data

Unnamed: 0,Train data:
0,ID ::: TITLE ::: GENRE ::: DESCRIPTION
1,ID ::: TITLE ::: GENRE ::: DESCRIPTION
2,ID ::: TITLE ::: GENRE ::: DESCRIPTION
3,ID ::: TITLE ::: GENRE ::: DESCRIPTION
4,Test data:
5,ID ::: TITLE ::: DESCRIPTION
6,ID ::: TITLE ::: DESCRIPTION
7,ID ::: TITLE ::: DESCRIPTION
8,ID ::: TITLE ::: DESCRIPTION
9,Source:


In [30]:
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.readlines()
    data = [line.strip().split(' :::') for line in data]
    return data

In [31]:
train_data = load_data("Genre Classification Dataset/train_data.txt")
train_df = pd.DataFrame(train_data, columns=['ID', 'Titles', 'Genres', 'Description'])

test_data = load_data("Genre Classification Dataset/test_data.txt")
test_df = pd.DataFrame(test_data, columns=['ID', 'Titles', 'Description'])

test_solution = load_data("Genre Classification Dataset/test_data_solution.txt")
test_solution_df = pd.DataFrame(test_solution, columns=['ID', 'Title' ,'Genres', 'Description'])

In [32]:
print("Train Data:")
train_df 

Train Data:


Unnamed: 0,ID,Titles,Genres,Description
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...,...
54209,54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [33]:
print("Test Data:")
test_df

Test Data:


Unnamed: 0,ID,Titles,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [34]:
print("Test Solution:")
test_solution_df

Test Solution:


Unnamed: 0,ID,Title,Genres,Description
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)

X_train = vectorizer.fit_transform(train_df['Description'])
X_test = vectorizer.transform(test_df['Description'])

print('Training data shape:', X_train.shape)
print('Test data shape:', X_test.shape)

Training data shape: (54214, 10000)
Test data shape: (54200, 10000)


In [36]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['Genres'])
print(f"Unique genre in the training data : {label_encoder.classes_}")

Unique genre in the training data : [' action' ' adult' ' adventure' ' animation' ' biography' ' comedy'
 ' crime' ' documentary' ' drama' ' family' ' fantasy' ' game-show'
 ' history' ' horror' ' music' ' musical' ' mystery' ' news' ' reality-tv'
 ' romance' ' sci-fi' ' short' ' sport' ' talk-show' ' thriller' ' war'
 ' western']


In [37]:
from sklearn.linear_model import LogisticRegression

# Use the 'liblinear' solver for small datasets or binary classification problems
lr_model = LogisticRegression(max_iter=200, solver='liblinear')
lr_model.fit(X_train, y_train)

y_pred = lr_model.predict(X_test)
y_pred_genre = label_encoder.inverse_transform(y_pred)

test_df['Predicted_Genres'] = y_pred_genre
test_df[['Titles', 'Predicted_Genres']]

Unnamed: 0,Titles,Predicted_Genres
0,Edgar's Lunch (1998),drama
1,La guerra de papá (1977),drama
2,Off the Beaten Track (2010),documentary
3,Meu Amigo Hindu (2015),drama
4,Er nu zhai (1955),drama
...,...,...
54195,"""Tales of Light & Dark"" (2013)",drama
54196,Der letzte Mohikaner (1965),drama
54197,Oliver Twink (2007),comedy
54198,Slipstream (1973),drama


In [38]:
test_df['Predicted_Genres'] = y_pred_genre

mergeed_df = pd.merge(test_solution_df[['ID', 'Genres']], test_df[['ID', 'Predicted_Genres']], on='ID')
mergeed_df

Unnamed: 0,ID,Genres,Predicted_Genres
0,1,thriller,drama
1,2,comedy,drama
2,3,documentary,documentary
3,4,drama,drama
4,5,drama,drama
...,...,...,...
54195,54196,horror,drama
54196,54197,western,drama
54197,54198,adult,comedy
54198,54199,drama,drama


In [39]:
from sklearn.metrics import accuracy_score, classification_report

accuracy_score = accuracy_score(mergeed_df['Genres'], mergeed_df['Predicted_Genres'])
print(f"Accuracy: {accuracy_score}")

print("\nClassification Report:")
print(classification_report(mergeed_df['Genres'], mergeed_df['Predicted_Genres']))

Accuracy: 0.5896494464944649

Classification Report:
              precision    recall  f1-score   support

      action       0.52      0.25      0.34      1314
       adult       0.64      0.21      0.32       590
   adventure       0.73      0.15      0.25       775
   animation       0.58      0.03      0.05       498
   biography       0.00      0.00      0.00       264
      comedy       0.54      0.59      0.56      7446
       crime       0.47      0.01      0.03       505
 documentary       0.66      0.88      0.76     13096
       drama       0.54      0.80      0.64     13612
      family       0.48      0.05      0.10       783
     fantasy       0.64      0.02      0.04       322
   game-show       0.91      0.48      0.63       193
     history       0.00      0.00      0.00       243
      horror       0.67      0.55      0.60      2204
       music       0.70      0.40      0.51       731
     musical       1.00      0.00      0.01       276
     mystery       1.00     

In [40]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [41]:
# Predict using X_test
y_pred_nb = nb_model.predict(X_test)

# Decode the predicted labels
y_pred_genre_nb = label_encoder.inverse_transform(y_pred_nb)

# Assign predictions to the test_df
test_df['Predicted_Genres_NB'] = y_pred_genre_nb

# Merge the dataframes on 'ID'
merged_df_nb = pd.merge(
    test_solution_df[['ID', 'Genres']],
    test_df[['ID', 'Predicted_Genres_NB']],
    on='ID'
)

# Display or process the merged dataframe
print(merged_df_nb.head())


  ID        Genres Predicted_Genres_NB
0  1      thriller               drama
1  2        comedy               drama
2  3   documentary         documentary
3  4         drama               drama
4  5         drama               drama


In [42]:
from sklearn.metrics import accuracy_score, classification_report

# Ensure the correct variable name is used
accuracy_score_nb = accuracy_score(merged_df_nb['Genres'], merged_df_nb['Predicted_Genres_NB'])
print(f"Accuracy: {accuracy_score_nb}")

print("Naive Bayes Classification Report:")
print(classification_report(merged_df_nb['Genres'], merged_df_nb['Predicted_Genres_NB'], target_names=label_encoder.classes_))


Accuracy: 0.5092619926199262
Naive Bayes Classification Report:
              precision    recall  f1-score   support

      action       0.56      0.03      0.06      1314
       adult       0.46      0.02      0.04       590
   adventure       0.77      0.04      0.08       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.40      0.46      7446
       crime       0.00      0.00      0.00       505
 documentary       0.56      0.89      0.69     13096
       drama       0.44      0.84      0.58     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       1.00      0.02      0.04       193
     history       0.00      0.00      0.00       243
      horror       0.77      0.23      0.35      2204
       music       0.89      0.02      0.05       731
     musical       0.00      0.00      0.00       276
     mystery     

In [43]:
# Import required libraries
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import make_classification

# Generate a synthetic dataset (replace this with your actual dataset)
X, y = make_classification(n_samples=10000, n_features=50, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert test data to a DataFrame for adding predictions later
test_df = pd.DataFrame(X_test, columns=[f'feature_{i}' for i in range(X_test.shape[1])])

# Create a pipeline for preprocessing and model training
pipeline = Pipeline([
    ('scaler', StandardScaler()),       # Step 1: Standardize the features
    ('pca', PCA(n_components=20)),      # Step 2: Reduce dimensionality to 20 components
    ('svm', LinearSVC(random_state=42)) # Step 3: Train a linear SVM model
])

# Train the SVM model
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Apply inverse transformation to predictions if needed (for scaled data)
# Since SVM predictions are class labels, no inverse transform is required. If you were predicting continuous values, you'd use scaler.inverse_transform here.

# Add predictions to the test DataFrame
test_df['Predicted'] = y_pred
test_df['Actual'] = y_test

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display the first few rows of the test DataFrame with predictions
print("\nTest DataFrame with Predictions:")
print(test_df.head())


Model accuracy: 88.47%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.90      0.89      1512
           1       0.89      0.87      0.88      1488

    accuracy                           0.88      3000
   macro avg       0.88      0.88      0.88      3000
weighted avg       0.88      0.88      0.88      3000


Test DataFrame with Predictions:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0  -0.916370   0.482381  -0.142257  -0.048758   0.699252  -0.100785   
1  -1.956234   0.014410  -0.682846   0.433892   0.795823  -0.898909   
2  -1.328933   1.093392  -0.030002  -0.732371  -1.007102  -0.636887   
3  -1.154228  -0.377765  -0.104047   0.868201  -0.484904   0.051578   
4  -0.657078   0.421698   0.457115  -0.595041   1.146915  -0.546578   

   feature_6  feature_7  feature_8  feature_9  ...  feature_42  feature_43  \
0  -1.007662  -1.577179  -0.133816   0.177513  ...    0.950591    0.715035   
1  -0.683

In [46]:
import unittest
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import numpy as np
from sklearn.model_selection import train_test_split

# Dummy data generation
np.random.seed(42)
X = np.random.rand(100, 10000)  # 100 samples, 10,000 features
y = np.random.randint(0, 2, 100)  # Binary target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(kernel='linear', probability=True))
])
pipeline.fit(X_train, y_train)


# Unit tests
class TestModels(unittest.TestCase):

    def test_logistic_regression(self):
        # Test Logistic Regression model
        y_pred = lr_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        self.assertGreater(accuracy, 0.5, "Logistic Regression accuracy is too low")

    def test_naive_bayes(self):
        # Test Naive Bayes model
        y_pred_nb = nb_model.predict(X_test)
        accuracy_nb = accuracy_score(y_test, y_pred_nb)
        self.assertGreater(accuracy_nb, 0.5, "Naive Bayes accuracy is too low")

    def test_svm(self):
        # Test SVM model
        y_pred_svm = pipeline.predict(X_test)
        accuracy_svm = accuracy_score(y_test, y_pred_svm)
        self.assertGreater(accuracy_svm, 0.5, "SVM accuracy is too low")


if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


FFF
FAIL: test_logistic_regression (__main__.TestModels.test_logistic_regression)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\SHAH DHRUV\AppData\Local\Temp\ipykernel_7700\3886810001.py", line 40, in test_logistic_regression
    self.assertGreater(accuracy, 0.5, "Logistic Regression accuracy is too low")
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: 0.45 not greater than 0.5 : Logistic Regression accuracy is too low

FAIL: test_naive_bayes (__main__.TestModels.test_naive_bayes)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "C:\Users\SHAH DHRUV\AppData\Local\Temp\ipykernel_7700\3886810001.py", line 46, in test_naive_bayes
    self.assertGreater(accuracy_nb, 0.5, "Naive Bayes accuracy is too low")
    ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: 0.5 not