In [3]:
import pandas as pd
def load_and_preprocess(filepath, is_test=False):
    """Load a ``' ::: '``-delimited, headerless text file and clean its descriptions.

    Parameters
    ----------
    filepath : str or path-like
        File to read. It is passed straight to ``pandas.read_csv``.
    is_test : bool, default False
        If True the file is labelled with three columns
        (``ID``, ``TITLE``, ``DESCRIPTION``); otherwise with four
        (``ID``, ``TITLE``, ``GENRE``, ``DESCRIPTION``).

    Returns
    -------
    pandas.DataFrame
        The parsed rows with the column names above. ``DESCRIPTION`` is
        lower-cased and stripped of every character that is neither a word
        character nor whitespace.

    Raises
    ------
    ValueError
        Raised by pandas when the number of parsed columns does not match the
        number of names selected by ``is_test``.
    """
    # A multi-character separator is only supported by the python parsing engine.
    data = pd.read_csv(filepath, delimiter=' ::: ', engine='python', header=None)
    if is_test:
        data.columns = ['ID', 'TITLE', 'DESCRIPTION']
    else:
        data.columns = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
    # Raw string: "\w" and "\s" are regex classes, not Python string escapes.
    # A non-raw literal here triggers an invalid-escape-sequence warning.
    data['DESCRIPTION'] = (
        data['DESCRIPTION']
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
    )
    return data


In [4]:
# Read the three corpus files. Only the unlabelled test file is loaded in
# test mode (three columns, no GENRE); the other two carry a GENRE column.
train_data = load_and_preprocess(filepath='train_data.txt', is_test=False)
test_data = load_and_preprocess(filepath='test_data.txt', is_test=True)
test_solution_data = load_and_preprocess(filepath='test_data_solution.txt', is_test=False)

In [10]:
# Show (rows, columns) for the unlabelled test set and for its solution file,
# one per line.
print(test_data.shape, test_solution_data.shape, sep='\n')

(54200, 3)
(54200, 4)


In [41]:
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
# Turn the cleaned descriptions into TF-IDF features. The vocabulary (limited
# to 5000 features) is fitted on the training set only and reused on the test set.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])
X_test = tfidf_vectorizer.transform(test_data['DESCRIPTION'])

# Train the model using Multinomial Naive Bayes
model = MultinomialNB()
model.fit(X_train, train_data['GENRE'])

# Predict the genres for the test set
y_pred = model.predict(X_test)

# Ground-truth labels come from the solution file. They must be bound before
# the shape printout below; otherwise a fresh kernel raises NameError.
# NOTE(review): assumes test_data and test_solution_data rows are in the same
# order - confirm.
y_test = test_solution_data['GENRE']
print("Shape of test data:", y_test.shape)
print("Shape of predicted labels:", y_pred.shape)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Shape of test data: (54200,)
Shape of predicted labels: (54200,)
Accuracy: 0.5121217712177122
Classification Report:
               precision    recall  f1-score   support

      action       0.59      0.06      0.11      1314
       adult       0.45      0.03      0.06       590
   adventure       0.72      0.04      0.08       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.52      0.39      0.45      7446
       crime       0.00      0.00      0.00       505
 documentary       0.56      0.88      0.69     13096
       drama       0.45      0.83      0.58     13612
      family       1.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.97      0.16      0.27       193
     history       0.00      0.00      0.00       243
      horror       0.73      0.27      0.40      2204
       music       0.73      0.10      0.18       731
     musical     

In [50]:
# LogisticRegression is not imported by any other visible cell, so bring it
# into scope here to keep this cell runnable on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# Rebuild the TF-IDF features (vocabulary fitted on the training set only,
# limited to 5000 features) so this cell does not depend on the previous model cell.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(train_data['DESCRIPTION'])
X_test = tfidf_vectorizer.transform(test_data['DESCRIPTION'])

# Train the model using Logistic Regression
model = LogisticRegression(max_iter=200)
model.fit(X_train, train_data['GENRE'])

# Predict the genres for the test set
y_pred = model.predict(X_test)

# Evaluate the model against the labels from the solution file.
# NOTE(review): assumes test_data and test_solution_data rows are in the same
# order - confirm.
y_test = test_solution_data['GENRE']
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5836715867158672
Classification Report:
               precision    recall  f1-score   support

      action       0.47      0.28      0.35      1314
       adult       0.59      0.23      0.33       590
   adventure       0.57      0.16      0.25       775
   animation       0.48      0.06      0.10       498
   biography       0.00      0.00      0.00       264
      comedy       0.53      0.59      0.55      7446
       crime       0.33      0.03      0.06       505
 documentary       0.67      0.85      0.75     13096
       drama       0.54      0.77      0.64     13612
      family       0.49      0.08      0.14       783
     fantasy       0.55      0.06      0.10       322
   game-show       0.89      0.50      0.64       193
     history       0.00      0.00      0.00       243
      horror       0.64      0.57      0.60      2204
       music       0.64      0.43      0.52       731
     musical       0.25      0.02      0.03       276
     mystery       0.33     