In [5]:
#Imports
import numpy as np
import pandas as pd

In [7]:
#Read data
def parse_movie_file(filepath):
    """
    Parses a text file with movie data in the format:
    <id> ::: <title> ::: <genre> ::: <description>
    
    Parameters:
        filepath (str): Path to the .txt file
    
    Returns:
        pd.DataFrame: DataFrame with columns ['ID', 'Title', 'Genre', 'Description']
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    
    for line in lines:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            movie_id, title, genre, description = parts
            data.append({
                "ID": int(movie_id),
                "Title": title,
                "Genre": genre,
                "Description": description
            })
    
    return pd.DataFrame(data)

In [9]:
def parse_movie_file_flexible(filepath):
    """
    Parses a text file with movie data, handling cases where genre may be missing.
    Format expected:
    <id> ::: <title> ::: <genre> ::: <description>
    OR
    <id> ::: <title> ::: <description>   (no genre)
    
    Returns a DataFrame with columns: ID, Title, Genre, Description
    Genre will be None if missing.
    """
    data = []
    with open(filepath, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        parts = line.strip().split(" ::: ")
        if len(parts) == 4:
            movie_id, title, genre, description = parts
        elif len(parts) == 3:
            movie_id, title, description = parts
            genre = None
        else:
            # skip malformed lines or handle as needed
            continue

        try:
            movie_id = int(movie_id)
        except ValueError:
            # Skip lines where ID is not an integer
            continue

        data.append({
            "ID": movie_id,
            "Title": title,
            "Genre": genre,
            "Description": description
        })

    return pd.DataFrame(data)


In [11]:
training_data = parse_movie_file('./data/Genre Classification Dataset/train_data.txt')
x_test = parse_movie_file_flexible('./data/Genre Classification Dataset/test_data.txt')

In [12]:
x_train = training_data.drop(['ID','Genre'],axis=1)

In [13]:
x_train

Unnamed: 0,Title,Description
0,Oscar et la dame rose (2009),Listening in to a conversation between his doc...
1,Cupid (1997),A brother and sister with a past incestuous re...
2,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fiel...
3,The Secret Sin (1915),To help their unemployed father make ends meet...
4,The Unrecovered (2007),The film's title refers not only to the un-rec...
...,...,...
54209,"""Bonino"" (1953)",This short-lived NBC live sitcom centered on B...
54210,Dead Girls Don't Cry (????),The NEXT Generation of EXPLOITATION. The siste...
54211,Ronald Goedemondt: Ze bestaan echt (2008),"Ze bestaan echt, is a stand-up comedy about gr..."
54212,Make Your Own Bed (1944),Walter and Vivian live in the country and have...


In [132]:
y_train = training_data['Genre']

In [19]:
y_train

0              drama
1           thriller
2              adult
3              drama
4              drama
            ...     
54209         comedy
54210         horror
54211    documentary
54212         comedy
54213        history
Name: Genre, Length: 54214, dtype: object

In [21]:
x_test = x_test.drop(['ID','Genre'],axis=1)

In [23]:
x_test

Unnamed: 0,Title,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Dar..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their ...
54197,Oliver Twink (2007),"A movie 169 years in the making. Oliver Twist,..."
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard ..."


In [134]:
y_test=parse_movie_file('./data/Genre Classification Dataset/test_data_solution.txt')['Genre']

In [27]:
y_test

0           thriller
1             comedy
2        documentary
3              drama
4              drama
            ...     
54195         horror
54196        western
54197          adult
54198          drama
54199          drama
Name: Genre, Length: 54200, dtype: object

# TEXT PROCESSING

In [30]:
import nltk

In [31]:
from nltk.corpus import stopwords

In [32]:
import string
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [44]:
ans = []

In [46]:
l1=list(x_train['Description'])[0:1000]

In [48]:
for i in l1:
    ans.append(text_process(i))

In [56]:
l1 = list(x_train['Description'])[1000:2000]
for i in l1:
    ans.append(text_process(i))

In [64]:
x_train=x_train[0:10000]

In [138]:
y_train=y_train[0:10000]

In [78]:
x_test=x_test[0:2000]

In [140]:
y_test=y_test[0:2000]

In [92]:
#TF
from sklearn.feature_extraction.text import CountVectorizer

In [94]:
bow_transformer = CountVectorizer(analyzer=text_process).fit(x_train['Description'])

In [96]:
bow_transformer

In [100]:
train_messages_bow = bow_transformer.transform(x_train['Description'])
test_message_bow = bow_transformer.transform(x_test['Description'])

In [102]:
from sklearn.feature_extraction.text import TfidfTransformer

In [106]:
tfidf_transformer = TfidfTransformer().fit(train_messages_bow)

In [108]:
train_tfidf = tfidf_transformer.transform(train_messages_bow)
test_tfidf = tfidf_transformer.transform(test_message_bow)

In [110]:
train_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 501203 stored elements and shape (10000, 65787)>

In [112]:
test_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 93185 stored elements and shape (2000, 65787)>

In [114]:
from sklearn.naive_bayes import MultinomialNB

In [116]:
model = MultinomialNB()

In [124]:
from sklearn.preprocessing import LabelEncoder
encode = LabelEncoder()
encode.fit(y_train)

In [126]:
y_train = encode.transform(y_train)
y_test = encode.transform(y_test)

In [130]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [142]:
pipeline.fit(x_train['Description'],y_train)

In [156]:
predictions = pipeline.predict(x_test['Description'])

In [148]:
from sklearn.metrics import classification_report

In [158]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

      action       0.00      0.00      0.00         0
       adult       0.00      0.00      0.00         0
   adventure       0.00      0.00      0.00         0
   animation       0.00      0.00      0.00         0
   biography       0.00      0.00      0.00         0
      comedy       0.02      0.83      0.04         6
       crime       0.00      0.00      0.00         0
 documentary       0.88      0.53      0.66       776
       drama       0.89      0.39      0.54      1218
      family       0.00      0.00      0.00         0
     fantasy       0.00      0.00      0.00         0
   game-show       0.00      0.00      0.00         0
     history       0.00      0.00      0.00         0
      horror       0.00      0.00      0.00         0
       music       0.00      0.00      0.00         0
     musical       0.00      0.00      0.00         0
     mystery       0.00      0.00      0.00         0
        news       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array(['documentary', 'drama'], dtype='<U11')

In [162]:
x_test

Unnamed: 0,Title,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apart..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty chi..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family o...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with his..."
4,Er nu zhai (1955),Before he was known internationally as a marti...
...,...,...
1995,L'absence (2009/II),"Adama Diop is a successful scientist, living i..."
1996,Blood Circus (2017),"Sean ""THE KILLIN'"" Dillon was on top of the wo..."
1997,Gangsta Brown Master of the Game (2016),This DVD consists of 22 fictional characters i...
1998,Treasure of the Golden Condor (1953),Jean-Paul rebels against his bondage to his un...


In [166]:
import joblib
joblib.dump(pipeline,'pipeline.pkl')

['pipeline.pkl']

In [168]:
#EXIT MODEL DONE AND SAVED