# Natural Language Project - Finding the Genre of Movie Plots - Group 11

## Project Setup
If you're running this project in **Google Colab**, make sure to execute the following commands to properly configure the environment.
(These steps are not required if you're running the project locally on your machine.)

In [None]:
!git clone https://github.com/rspecker/NLP.git
%cd NLP

In [None]:
# !pip install -r requirements.txt

In [None]:
! pip install sentence-transformers

In [3]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [None]:
# import project dependencies
import pandas as pd
from preprocessing.embeddings import create_sentence_embeddings
from preprocessing.preproc import create_preprocesssed_dataset
from preprocessing.tfidf import create_train_data_tfidf
from utils import create_train_test_sets
from modelling.information_ret import score

## Data Import

In [8]:
# Load the tab-separated training file; the column names are supplied
# explicitly through `names`
df = pd.read_csv(
    'train.txt',
    sep='\t',
    names=['title', 'from', 'genre', 'director', 'plot'],
)

## Data Pre-Processing

In [9]:
# Partition the data frame into train/test features and labels, using the
# 'genre' column as the target (test_size=0.2 — presumably a 20% hold-out;
# confirm against utils.create_train_test_sets)
x_train, x_test, y_train, y_test = create_train_test_sets(
    df,
    y_column='genre',
    test_size=0.2,
    random_state=0,
)

### TF-IDF
TF-IDF converts text into sparse vectors based on word importance across documents, without considering word order or context.

In [None]:
# create TF-IDF

### Feature-based Transfer Learning with Sentence Embeddings (Sentence-BERT)
Sentence-BERT generates dense, fixed-length embeddings that capture the semantic meaning of entire sentences, enabling more context-aware comparisons.

In [None]:
# create sentence embeddings for training data
x_train_sentence_embeddings = create_sentence_embeddings(
    model="all-MiniLM-L6-v2",
    sentences=x_train["plot"].to_list(),
)

In [11]:
# create sentence embeddings for testing data, using the same model name as
# for the training plots
x_test_sentence_embeddings = create_sentence_embeddings(
    model="all-MiniLM-L6-v2",
    sentences=x_test["plot"].to_list(),
)

## Training

### TF-IDF

### Feature-based Transfer Learning with Sentence Embeddings (Sentence-BERT)

In [None]:
# For every candidate model: run a 5-fold grid search on the training data,
# evaluate the best estimator on the test data, and save a text report plus a
# confusion-matrix image under results/.
# NOTE(review): `models`, `X_train` and `X_test` are not defined in the cells
# visible above (the split cell creates lowercase `x_train`/`x_test`) —
# confirm they are bound to the intended feature matrices before running.
import os

# Both the report and the figure are written into this directory; create it
# up front so the writes below do not fail on a fresh checkout.
os.makedirs('results', exist_ok=True)

for model_name, (model, param_grid) in models.items():
    # Perform GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Test the best model on the test set
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Calculate accuracy and classification report
    test_accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Create a text file for each model's results
    result_file_path = f'results/{model_name}_grid_search_results.txt'
    with open(result_file_path, 'w', encoding='utf-8') as result_file:
        result_file.write(f"Model: {model_name}\n")
        result_file.write(f"Best Parameters: {best_params}\n")
        result_file.write(f"Best Cross-Validation Score: {best_score:.4f}\n")
        result_file.write(f"Test Accuracy: {test_accuracy:.4f}\n")
        result_file.write(f"Classification Report:\n{classification_rep}\n")

    # Use one explicit label list for both the matrix and its axis labels.
    # Without `labels=`, confusion_matrix covers every label seen in y_test
    # OR y_pred, which can be more than np.unique(y_test) and then breaks
    # the plot with a label-count mismatch.
    class_labels = best_model.classes_

    # Create confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    # Plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
    disp.plot(cmap=plt.cm.Blues)

    # Save confusion matrix as an image
    plt.title(f'Confusion Matrix for {model_name}')
    plt.savefig(f'results/{model_name}_confusion_matrix.png')
    # Close the figure so figures do not accumulate across models
    plt.close()

In [None]:
# TF-IDF baseline: vectorise the plots, then tune, refit and evaluate a
# Multinomial Naive Bayes classifier followed by a support vector classifier.
# The vectorizer is fitted on the training plots only and reused unchanged
# for the test plots.
vectorizer = TfidfVectorizer(max_features=1000)
x_train_tfidf = vectorizer.fit_transform(x_train["plot"])
x_test_tfidf = vectorizer.transform(x_test["plot"])

# --- Multinomial Naive Bayes ---
model = MultinomialNB()
# NOTE(review): this assignment rebinds `score`, which shadows the `score`
# function imported from modelling.information_ret — confirm that is intended.
params, score = tune_hyperparameters(model, mnb_param_grid, x_train_tfidf, y_train)

final_model = MultinomialNB(alpha=params['alpha'],
                            fit_prior=params['fit_prior'])
final_model.fit(x_train_tfidf, y_train)

# Score the test set once and report it
print(f"Test set accuracy: {final_model.score(x_test_tfidf, y_test)}")

# --- Support Vector Classifier ---
model = SVC()
params, score = tune_hyperparameters(model, svc_param_grid, x_train_tfidf, y_train)

# The tuned parameters belong to SVC, so the final model must be an SVC too;
# MultinomialNB does not accept `C` or `kernel` and raises a TypeError.
final_model = SVC(C=params['C'],
                  kernel=params['kernel'],
                  # gamma=params['gamma'],
                  # class_weight=params['class_weight']
                  )
final_model.fit(x_train_tfidf, y_train)

# Score the test set once and report it
print(f"Test set accuracy: {final_model.score(x_test_tfidf, y_test)}")