In [31]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Load the data
def load_data(file_path, test=False):
    """Read a ' ::: '-delimited text file into a DataFrame.

    Args:
        file_path: Path of the file to read.
        test: When truthy the file is read with three columns
            (ID, TITLE, DESCRIPTION); otherwise a GENRE column is expected
            between TITLE and DESCRIPTION.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the column names above.
    """
    # Only the column layout differs between the two file kinds.
    if test:
        column_names = ['ID', 'TITLE', 'DESCRIPTION']
    else:
        column_names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
    # A multi-character separator needs the python parsing engine.
    return pd.read_csv(file_path, delimiter=' ::: ', engine='python', names=column_names)

# Preprocess the data
def preprocess_data(data):
    """Return a copy of ``data`` with the DESCRIPTION column lower-cased.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same raw text and a frame displayed
    earlier does not silently change underneath the reader.

    Args:
        data: DataFrame containing a string ``DESCRIPTION`` column.

    Returns:
        A new DataFrame with the same columns, in the same order, whose
        ``DESCRIPTION`` values are lower-cased (missing values stay missing).
    """
    # `assign` builds a new frame instead of writing into the caller's object.
    return data.assign(DESCRIPTION=data['DESCRIPTION'].str.lower())

# Converting text to features using TF-IDF
def text_to_features(description, vectorizer=None, fit=False):
    """Turn text into a TF-IDF feature matrix.

    Args:
        description: Iterable of text documents.
        vectorizer: An already fitted vectorizer. Required when ``fit`` is
            False; ignored when ``fit`` is True (a fresh one is created).
        fit: When True, build a new ``TfidfVectorizer`` (English stop words,
            at most 5000 features) and fit it on ``description``. When False,
            only transform ``description`` with the supplied ``vectorizer``.

    Returns:
        A ``(X, vectorizer)`` tuple: the feature matrix and the vectorizer
        that produced it.

    Raises:
        ValueError: If ``fit`` is False and no ``vectorizer`` was supplied.
    """
    if fit:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        X = vectorizer.fit_transform(description)
    else:
        if vectorizer is None:
            # Without this guard the call fails with an opaque
            # "'NoneType' object has no attribute 'transform'".
            raise ValueError(
                "text_to_features requires a fitted vectorizer when fit=False"
            )
        X = vectorizer.transform(description)
    return X, vectorizer

# Train the model
def train_model(X, y, model_type):
    """Fit and return a classifier of the requested kind.

    Args:
        X: Feature matrix handed to the estimator's ``fit``.
        y: Target labels aligned with the rows of ``X``.
        model_type: ``'logistic_regression'`` (``LogisticRegression`` with
            ``max_iter=1000``) or ``'naive_bayes'`` (``MultinomialNB``).

    Returns:
        The fitted estimator.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    if model_type == 'logistic_regression':
        model = LogisticRegression(max_iter=1000)
    elif model_type == 'naive_bayes':
        model = MultinomialNB()
    else:
        # An unknown name used to fall through and crash on `model.fit`
        # with UnboundLocalError; fail early with an actionable message.
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'logistic_regression' or 'naive_bayes'"
        )
    model.fit(X, y)
    return model

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model against held-out labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        An ``(accuracy, report)`` tuple: the accuracy score and the text
        classification report (computed with ``zero_division=0``).
    """
    predicted = model.predict(X_test)
    return (
        accuracy_score(y_test, predicted),
        classification_report(y_test, predicted, zero_division=0),
    )

# Predicting Genre for Test Data
def predict_genre(model, X_test):
    """Return ``model.predict(X_test)`` — the model's predictions for ``X_test``."""
    return model.predict(X_test)


Loading and preprocessing the Training Data

In [32]:
train_file_path = './data/train_data.txt'

# Read the labelled training file and lower-case its descriptions in one step.
train_data = preprocess_data(load_data(train_file_path))
train_data.head()


Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...
1,2,Cupid (1997),thriller,a brother and sister with a past incestuous re...
2,3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fiel...
3,4,The Secret Sin (1915),drama,to help their unemployed father make ends meet...
4,5,The Unrecovered (2007),drama,the film's title refers not only to the un-rec...


Converting training text to features

In [33]:
# Fit a TF-IDF vectorizer on the training descriptions and keep it so the same
# vocabulary can be applied to other text later.
# NOTE(review): the vectorizer is fitted on every training row here, before the
# train/validation split further down, so the validation descriptions also
# shape the vocabulary and IDF weights — validation scores may be slightly
# optimistic. Consider splitting first and fitting on the training part only.
X_train, vectorizer = text_to_features(train_data['DESCRIPTION'], fit=True)
# Target labels come from the GENRE column.
y_train = train_data['GENRE']

Split training data for evaluation

In [34]:
# Hold out 20% of the rows for validation; random_state pins the shuffle so the
# split is the same on every run.
# NOTE(review): the split is not stratified — consider stratify=y_train if every
# genre should be represented proportionally in the validation part.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Train and evaluate Logistic Regression

In [35]:
# Fit logistic regression on the training split and score it on the held-out split.
lr_model = train_model(X_train_split, y_train_split, 'logistic_regression')
lr_accuracy, lr_report = evaluate_model(lr_model, X_val_split, y_val_split)
print("Logistic Regression - Accuracy:", lr_accuracy)
# Start the report on its own line: printed after the label on the same line,
# its header row is pushed right and no longer lines up with the columns.
print("Logistic Regression - Classification Report:", lr_report, sep="\n")

Logistic Regression - Accuracy: 0.5794521811306834
Logistic Regression - Classification Report:               precision    recall  f1-score   support

      action       0.52      0.26      0.35       263
       adult       0.75      0.21      0.33       112
   adventure       0.42      0.14      0.21       139
   animation       0.56      0.09      0.15       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.58      0.55      1443
       crime       0.29      0.02      0.04       107
 documentary       0.66      0.84      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.39      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.64      0.56      0.60       431
       music       0.62      0.47      0.54       144
     musical       1.00      0.02      

Train and evaluate Naive Bayes

In [36]:
# Fit multinomial naive Bayes on the training split and score it on the held-out split.
nb_model = train_model(X_train_split, y_train_split, 'naive_bayes')
nb_accuracy, nb_report = evaluate_model(nb_model, X_val_split, y_val_split)
print("Naive Bayes - Accuracy:", nb_accuracy)
# Start the report on its own line: printed after the label on the same line,
# its header row is pushed right and no longer lines up with the columns.
print("Naive Bayes - Classification Report:", nb_report, sep="\n")

Naive Bayes - Accuracy: 0.52310246241815
Naive Bayes - Classification Report:               precision    recall  f1-score   support

      action       0.57      0.08      0.14       263
       adult       0.88      0.06      0.12       112
   adventure       0.29      0.03      0.05       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.44      0.47      1443
       crime       0.00      0.00      0.00       107
 documentary       0.58      0.88      0.70      2659
       drama       0.46      0.83      0.59      2697
      family       1.00      0.01      0.01       150
     fantasy       0.00      0.00      0.00        74
   game-show       1.00      0.15      0.26        40
     history       0.00      0.00      0.00        45
      horror       0.73      0.38      0.50       431
       music       0.79      0.10      0.18       144
     musical       0.00      0.00      0.00        50
   

Loading and preprocessing Test Data

In [37]:
test_file_path = './data/test_data.txt'

# The test file has no GENRE column, hence test=True; descriptions are
# lower-cased by the same preprocessing step used for the training data.
test_data = preprocess_data(load_data(test_file_path, test=True))
test_data.head()

Unnamed: 0,ID,TITLE,DESCRIPTION
0,1,Edgar's Lunch (1998),"l.r. brane loves his life - his car, his apart..."
1,2,La guerra de papá (1977),"spain, march 1964: quico is a very naughty chi..."
2,3,Off the Beaten Track (2010),one year in the life of albin and his family o...
3,4,Meu Amigo Hindu (2015),"his father has died, he hasn't spoken with his..."
4,5,Er nu zhai (1955),before he was known internationally as a marti...


Converting test text to features

In [38]:
# Transform only (fit=False): reuse the vectorizer fitted on the training
# descriptions so the test matrix has the same feature columns the models were
# trained on. The second return value is that same vectorizer, so it is discarded.
X_test, _ = text_to_features(test_data['DESCRIPTION'], vectorizer=vectorizer, fit=False)

Predicting genre using the best model (In this case Logistic Regression)

In [39]:
# Label every test row with the logistic-regression model fitted earlier.
predictions = predict_genre(lr_model, X_test)
# Pair each prediction with the ID and title of the row it belongs to.
result = pd.DataFrame(
    {
        'ID': test_data['ID'],
        'TITLE': test_data['TITLE'],
        'PREDICTED GENRE': predictions,
    }
)
print(result)

          ID                           TITLE PREDICTED GENRE
0          1            Edgar's Lunch (1998)           short
1          2        La guerra de papá (1977)           drama
2          3     Off the Beaten Track (2010)     documentary
3          4          Meu Amigo Hindu (2015)           drama
4          5               Er nu zhai (1955)           drama
...      ...                             ...             ...
54195  54196  "Tales of Light & Dark" (2013)           drama
54196  54197     Der letzte Mohikaner (1965)          action
54197  54198             Oliver Twink (2007)          comedy
54198  54199               Slipstream (1973)           drama
54199  54200       Curitiba Zero Grau (2010)     documentary

[54200 rows x 3 columns]


Saving result to a file

In [40]:
# Use the default comma separator: '-' occurs inside genre labels such as
# 'game-show' (and in titles), and a '.csv' file is expected to be readable
# with default CSV settings. Fields that contain a comma are quoted by pandas.
result.to_csv('predicted_genres.csv', index=False)