In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import Counter
from nltk.corpus import wordnet
import string

# **Data Preparation**

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [7]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


 **Functions to clean the data**

In [None]:
def is_ponctuation(word):
    return all(caractere in string.punctuation for caractere in word)

In [None]:
def is_multiple_white_spaces(token):
    return all(caracter.isspace() for caracter in token)

In [None]:
def cleaning_data(text):

  tokens= word_tokenize(text)

  #converting to lower case
  tokens = [token.lower() for token in tokens]

  #performing stemming
  ps = PorterStemmer()
  tokens = [ps.stem(token) for token in tokens]

  #removing ponctuation
  tokens = [token for token in tokens if not is_ponctuation(token)]

  #removing multiple white spaces
  tokens = [token if not is_multiple_white_spaces(token) else ' ' for token in tokens]

  stop_words = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stop_words ]

  # Reconstructing the text from tokens
  cleaned_text = ' '.join(tokens)

  return cleaned_text

**Reading the data**

In [None]:
df_train_data = pd.read_csv('train_data.txt', delimiter=':::' , header=None, engine='python')
print(df_train_data.head())
df_train_data.shape

   0                                   1           2  \
0  1       Oscar et la dame rose (2009)       drama    
1  2                       Cupid (1997)    thriller    
2  3   Young, Wild and Wonderful (1980)       adult    
3  4              The Secret Sin (1915)       drama    
4  5             The Unrecovered (2007)       drama    

                                                   3  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


(54214, 4)

**Extracting the text description and the labels**

In [None]:
Xt = df_train_data.iloc[:, 3]
Yt = df_train_data.iloc[:, 2]

**Cleaning the data**

In [None]:
Xt = [cleaning_data(text) for text in Xt]

**Mapping the labels**

In [None]:
unique_values = Yt.unique()
print(unique_values)

value_to_number = {value: idx for idx, value in enumerate(unique_values)}
print(value_to_number)

Yt_mapped = Yt.map(value_to_number).values.reshape(-1, 1)

[' drama ' ' thriller ' ' adult ' ' documentary ' ' comedy ' ' crime '
 ' reality-tv ' ' horror ' ' sport ' ' animation ' ' action ' ' fantasy '
 ' short ' ' sci-fi ' ' music ' ' adventure ' ' talk-show ' ' western '
 ' family ' ' mystery ' ' history ' ' news ' ' biography ' ' romance '
 ' game-show ' ' musical ' ' war ']
{' drama ': 0, ' thriller ': 1, ' adult ': 2, ' documentary ': 3, ' comedy ': 4, ' crime ': 5, ' reality-tv ': 6, ' horror ': 7, ' sport ': 8, ' animation ': 9, ' action ': 10, ' fantasy ': 11, ' short ': 12, ' sci-fi ': 13, ' music ': 14, ' adventure ': 15, ' talk-show ': 16, ' western ': 17, ' family ': 18, ' mystery ': 19, ' history ': 20, ' news ': 21, ' biography ': 22, ' romance ': 23, ' game-show ': 24, ' musical ': 25, ' war ': 26}


**Saving the intermediate result in csv files**

In [None]:
import numpy as np


Yt_mapped_df = pd.DataFrame(Yt_mapped)

Xt_df = pd.DataFrame(Xt)



Yt_mapped_df.to_csv('Yt_mapped.csv', index=False, header=False)
Xt_df.to_csv('Xt.csv', index=False, header=False)

# **Text representation using the TF-IDF technique**

**Reload the data to continue the preprocessing**

In [1]:
import pandas as pd

Xt_df = pd.read_csv('Xt.csv', header=None)
Xt = Xt_df.values
Xt = [' '.join(tokens) for tokens in Xt]


**Extracting the features**

In [2]:
from sklearn.feature_extraction.text import HashingVectorizer



n_features =10000
v = HashingVectorizer(n_features=n_features, alternate_sign=False)


transformed_Xt = v.transform(Xt)
transformed_Xt = transformed_Xt.toarray()

# **Building the model**

**Reload the data to start the processing**

In [None]:
transformed_Xt_df = pd.read_csv('transformed_Xt.csv', header=None)

transformed_Xt = transformed_Xt_df.values

In [3]:
Yt_mapped_df = pd.read_csv('Yt_mapped.csv', header=None)
Yt_mapped = Yt_mapped_df.values

In [4]:
from imblearn.over_sampling import SMOTE
import numpy as np

smote = SMOTE(random_state=42)
transformed_Xt, Yt_mapped = smote.fit_resample(transformed_Xt[:5000, :], Yt_mapped[:5000, :])


transformed_Xt = np.clip(transformed_Xt, a_min=0, a_max=None)

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(transformed_Xt, Yt_mapped, test_size=0.2, random_state=42)

In [9]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 2.0],
    'fit_prior': [True, False],
}

grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train.ravel())

best_model = grid_search.best_estimator_
print("Best Parameters: ", grid_search.best_params_)


Best Parameters:  {'alpha': 0.3, 'fit_prior': False}


In [5]:

nb_model = MultinomialNB(alpha=0.36, fit_prior=False)


nb_model.fit(X_train, y_train.ravel())


y_pred = nb_model.predict(X_val)


accuracy = accuracy_score(y_val, y_pred)

print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Validation Accuracy: 95.43%


**Evaluating the model**

In [6]:
from sklearn.metrics import classification_report

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.57      0.61       231
           1       0.88      0.99      0.93       246
           2       0.99      1.00      0.99       237
           3       0.79      0.78      0.79       250
           4       0.83      0.76      0.79       262
           5       1.00      1.00      1.00       272
           6       0.99      1.00      0.99       243
           7       0.92      1.00      0.95       245
           8       0.99      1.00      0.99       271
           9       0.99      1.00      1.00       241
          10       0.96      0.98      0.97       268
          11       1.00      1.00      1.00       259
          12       0.88      0.71      0.79       270
          13       0.97      1.00      0.99       233
          14       0.99      1.00      0.99       249
          15       0.98      1.00      0.99       248
          16       0.98      1.00      0.99       231
          17       0.98    