<a href="https://colab.research.google.com/github/sauravkrpal/headline-generation-system/blob/main/LogicLoomTask.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rouge
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=d2bf84474a8606797cc45399bd5a73c441146a57841d507c34a7a6674a780a7c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


**Note:** Below is **basic** Python code that uses a simple machine learning pipeline for the LogicLooM 3.0 ML Contest problem statement. This example employs a bag-of-words representation for text and a basic regression model to approximate captions.

Participants, please understand that this serves as **just a foundational framework for beginners**. It highlights the process of training, evaluation, and generating predictions for submission.

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re

# Step 5: Evaluating the Model - Function definition moved to the top
def evaluate_metrics(predictions, references):
    """
    Calculates average BLEU and ROUGE-L scores for predictions and references.

    Args:
        predictions: Iterable of predicted caption strings.
        references: Iterable of reference caption strings, paired positionally
            with ``predictions`` (``zip`` stops at the shorter of the two).

    Returns:
        Tuple ``(avg_bleu, avg_rouge_l)``. Returns ``(0.0, 0.0)`` when there
        are no prediction/reference pairs instead of dividing by zero.
    """
    # Imported locally so this function does not depend on an edit to the
    # cell's import lines (which only bring in `sentence_bleu`).
    from nltk.translate.bleu_score import SmoothingFunction

    # Without smoothing, a single missing n-gram order makes the sentence BLEU
    # collapse to ~0 and NLTK emits a warning for every such sentence.
    smoothing = SmoothingFunction().method1
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    bleu_scores = []
    rouge_l_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        # BLEU Score (single reference per prediction)
        bleu_scores.append(
            sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)
        )

        # ROUGE-L Score: RougeScorer.score takes (target, prediction)
        rouge_l_scores.append(rouge_scorer_obj.score(ref, pred)['rougeL'].fmeasure)

    if not bleu_scores:
        return 0.0, 0.0

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_bleu, avg_rouge_l

# Step 1: Loading the datasets
train_file, val_file = 'LABELLED_TRAIN.csv', 'LABELLED_DEV.csv'
test_file = 'UNLABELLED_TEST.csv'  # To be used once the test set is released

# Labelled splits are mandatory: read train first, then dev
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_file, val_file))

# The unlabelled test split is optional; fall back to None when it is absent
try:
    test_data = pd.read_csv(test_file)
except FileNotFoundError:
    test_data = None
    print("Test file not found. Proceeding without test data.")

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Treats missing values (None / float NaN) as an empty string
    - Converts any other non-string value to its string form
    - Removes special characters (everything except letters, digits, whitespace)
    - Converts to lowercase

    Returns the cleaned string.
    """
    # Empty CSV cells are loaded as float NaN; passing that to re.sub would
    # raise TypeError, so map missing values to an empty string instead.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

# Clean the article and caption text of both labelled splits
# (all articles first, then all captions; train before dev)
for _column in ('News Article', 'Caption'):
    for _frame in (train_data, val_data):
        _frame[_column] = _frame[_column].apply(preprocess_text)

# The test split has no captions, so only its articles are cleaned
if test_data is not None:
    test_data['News Article'] = test_data['News Article'].apply(preprocess_text)

# Targets: captions encoded with their own TF-IDF vectorizer (dense arrays)
y_train, y_val = train_data['Caption'], val_data['Caption']
y_vectorizer = TfidfVectorizer(max_features=5000)
y_train_tfidf = y_vectorizer.fit_transform(y_train).toarray()
y_val_tfidf = y_vectorizer.transform(y_val).toarray()

# Step 3: Feature Extraction using TF-IDF
# Inputs: articles encoded with a separate vectorizer fitted on the training articles only
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['News Article'])
X_val = vectorizer.transform(val_data['News Article'])

if test_data is not None:
    X_test = vectorizer.transform(test_data['News Article'])

# Multi-output Ridge regression from article TF-IDF to caption TF-IDF
model = Ridge(alpha=1.0).fit(X_train, y_train_tfidf)

# Predicted caption TF-IDF vectors for the validation articles
val_predictions_tfidf = model.predict(X_val)

# Decoding predictions back to text (approximation)
# Using the feature names to decode predictions
def decode_tfidf_predictions(predictions, vectorizer, top_n=10):
    """Convert predicted TF-IDF values back to approximate text.

    Args:
        predictions: Iterable of 1-D weight vectors, one per document; each
            vector must support ``.argsort()`` (e.g. rows of a NumPy array).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``predictions``.
        top_n: Number of highest-weighted terms to keep per document
            (default 10, the value previously hard-coded).

    Returns:
        List of strings, one per prediction, with the selected terms joined by
        spaces in descending order of predicted weight.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    feature_names = vectorizer.get_feature_names_out()
    decoded_predictions = []
    for pred_vector in predictions:
        # argsort is ascending: reverse it first, then slice, so top_n == 0
        # yields no terms (a trailing [-0:] slice would keep every term).
        top_indices = pred_vector.argsort()[::-1][:top_n]
        decoded_predictions.append(" ".join(feature_names[i] for i in top_indices))
    return decoded_predictions

# Decode predictions
val_predictions_text = decode_tfidf_predictions(val_predictions_tfidf, y_vectorizer)

# Evaluate the model
val_bleu, val_rouge_l = evaluate_metrics(val_predictions_text, y_val)
print(f"Validation Set Performance - BLEU: {val_bleu:.4f}, ROUGE-L: {val_rouge_l:.4f}")


# Step 6: Predicting on Test Data (if available)
if test_data is not None:
    test_predictions_tfidf = model.predict(X_test)
    test_predictions_text = decode_tfidf_predictions(test_predictions_tfidf, y_vectorizer)
    test_data['Prediction'] = test_predictions_text

    # Save predictions to CSV.
    # The name was previously the plain string '{{265}}.csv'; outside an
    # f-string the doubled braces are kept literally, so the file on disk was
    # named "{{265}}.csv". Use the bare team ID, matching the final
    # submission cell of this notebook.
    output_file = '265.csv'
    # Requires the test CSV to provide an 'ID' column.
    test_data[['ID', 'Prediction']].to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

print("Good to goooo!")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Validation Set Performance - BLEU: 0.0008, ROUGE-L: 0.1417
Predictions saved to {{265}}.csv
Good to goooo!


In [None]:
!pip install --upgrade keras tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer


# Step 1: Loading the datasets
train_file, val_file = 'LABELLED_TRAIN.csv', 'LABELLED_DEV.csv'

# Read the labelled training split, then the labelled dev split
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_file, val_file))

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Treats missing values (None / float NaN) as an empty string
    - Converts any other non-string value to its string form
    - Removes special characters (everything except letters, digits, whitespace)
    - Converts to lowercase

    Returns the cleaned string.
    """
    # Empty CSV cells are loaded as float NaN; passing that to re.sub would
    # raise TypeError, so map missing values to an empty string instead.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

# Preprocess the articles and captions
train_data['News Article'] = train_data['News Article'].apply(preprocess_text)
val_data['News Article'] = val_data['News Article'].apply(preprocess_text)
train_data['Caption'] = train_data['Caption'].apply(preprocess_text)
val_data['Caption'] = val_data['Caption'].apply(preprocess_text)

# Tokenize the text data (vocabulary is fitted on the training articles only)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['News Article'])
X_train_seq = tokenizer.texts_to_sequences(train_data['News Article'])
X_val_seq = tokenizer.texts_to_sequences(val_data['News Article'])

# Padding sequences to ensure uniform input length.
# max_len is the longest training article; longer dev articles are truncated
# by pad_sequences. Could also be a fixed value.
max_len = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)

# Convert captions to TF-IDF representations
y_vectorizer = TfidfVectorizer(max_features=5000)  # New TF-IDF vectorizer for the captions
y_train = train_data['Caption']
y_val = val_data['Caption']

y_train_tfidf = y_vectorizer.fit_transform(y_train).toarray()
y_val_tfidf = y_vectorizer.transform(y_val).toarray()


# Building the LSTM model
model = Sequential()
# `input_length` is deprecated in Keras 3 (this cell upgrades keras), and the
# layer does not need it: the sequence length comes from the padded input.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
# Size the output from the actual target width: max_features is only an upper
# bound, so a caption vocabulary smaller than 5000 would otherwise mismatch.
model.add(Dense(y_train_tfidf.shape[1], activation='linear'))

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mean_squared_error'])

# Train the model; keep the History object instead of echoing its repr
history = model.fit(X_train_pad, y_train_tfidf, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val_tfidf))





Epoch 1/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 2s/step - loss: 2.0008e-04 - mean_squared_error: 2.0008e-04 - val_loss: 1.9869e-04 - val_mean_squared_error: 1.9869e-04
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 2s/step - loss: 1.9959e-04 - mean_squared_error: 1.9959e-04 - val_loss: 1.9871e-04 - val_mean_squared_error: 1.9871e-04
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 2s/step - loss: 1.9961e-04 - mean_squared_error: 1.9961e-04 - val_loss: 1.9868e-04 - val_mean_squared_error: 1.9868e-04
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 2s/step - loss: 1.9953e-04 - mean_squared_error: 1.9953e-04 - val_loss: 1.9859e-04 - val_mean_squared_error: 1.9859e-04
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 2s/step - loss: 1.9950e-04 - mean_squared_error: 1.9950e-04 - val_loss: 1.9862e-04 - val_mean_squared_error: 1.9862e-04


<keras.src.callbacks.history.History at 0x7e2dbec669b0>

In [None]:
!pip install --upgrade keras tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import re
import numpy as np

# Step 1: Loading the datasets
train_file = 'LABELLED_TRAIN.csv'
train_data = pd.read_csv(train_file)  # labelled training split

val_file = 'LABELLED_DEV.csv'
val_data = pd.read_csv(val_file)  # labelled dev split, used for validation

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Treats missing values (None / float NaN) as an empty string
    - Converts any other non-string value to its string form
    - Removes special characters (everything except letters, digits, whitespace)
    - Converts to lowercase

    Returns the cleaned string.
    """
    # Empty CSV cells are loaded as float NaN; passing that to re.sub would
    # raise TypeError, so map missing values to an empty string instead.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

# Preprocess the articles and captions
train_data['News Article'] = train_data['News Article'].apply(preprocess_text)
val_data['News Article'] = val_data['News Article'].apply(preprocess_text)
train_data['Caption'] = train_data['Caption'].apply(preprocess_text)  # preprocess captions too
val_data['Caption'] = val_data['Caption'].apply(preprocess_text)


# Tokenize the text data (vocabulary is fitted on the training articles only)
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data['News Article'])
X_train_seq = tokenizer.texts_to_sequences(train_data['News Article'])
X_val_seq = tokenizer.texts_to_sequences(val_data['News Article'])

# Padding sequences to ensure uniform input length (longest training article)
max_len = max(len(seq) for seq in X_train_seq)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_val_pad = pad_sequences(X_val_seq, maxlen=max_len)


# Create binary labels - 1 when the caption contains 'man', otherwise 0.
# NOTE(review): `in` on a string is a substring test, so captions containing
# e.g. 'woman' or 'many' are labelled 1 as well - confirm this is intended.
y_train = np.array([1 if 'man' in caption else 0 for caption in train_data['Caption']])
y_val = np.array([1 if 'man' in caption else 0 for caption in val_data['Caption']])


# Building the LSTM model
model = Sequential()
# `input_length` is deprecated in Keras 3 (this cell upgrades keras), and the
# layer does not need it: the sequence length comes from the padded input.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))  # Single probability for the binary label

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; keep the History object instead of echoing its repr
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_val_pad, y_val))

Epoch 1/5




[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 2s/step - accuracy: 0.9083 - loss: 0.3921 - val_accuracy: 0.9260 - val_loss: 0.2638
Epoch 2/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 2s/step - accuracy: 0.9296 - loss: 0.2543 - val_accuracy: 0.9260 - val_loss: 0.2714
Epoch 3/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 2s/step - accuracy: 0.9422 - loss: 0.2187 - val_accuracy: 0.9250 - val_loss: 0.2621
Epoch 4/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 2s/step - accuracy: 0.9495 - loss: 0.1472 - val_accuracy: 0.9100 - val_loss: 0.3056
Epoch 5/5
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 2s/step - accuracy: 0.9814 - loss: 0.0598 - val_accuracy: 0.9140 - val_loss: 0.4117


<keras.src.callbacks.history.History at 0x7e2db7f77130>

In [None]:
# Step 6: Evaluating the Model
from sklearn.metrics import classification_report
import numpy as np

# Loss and accuracy of the current `model` on the validation split
val_loss, val_accuracy = model.evaluate(X_val_pad, y_val)
print(f"Validation Loss: {val_loss:.4f}")
print(f"Validation Accuracy: {val_accuracy:.4f}")

# Predicted probabilities for the validation split
val_predictions_prob = model.predict(X_val_pad)

# Threshold at 0.5: strictly greater becomes class 1, everything else class 0
val_predictions_binary = (val_predictions_prob > 0.5).astype(int)

# Per-class precision, recall and F1-score
val_report = classification_report(y_val, val_predictions_binary)
print(val_report)

# Step 8: Saving the model
model.save('lstm_model.h5')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 418ms/step - accuracy: 0.9163 - loss: 0.4117
Validation Loss: 0.4117
Validation Accuracy: 0.9140
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 362ms/step




              precision    recall  f1-score   support

           0       0.93      0.98      0.95       926
           1       0.20      0.05      0.09        74

    accuracy                           0.91      1000
   macro avg       0.56      0.52      0.52      1000
weighted avg       0.87      0.91      0.89      1000



In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Adjusting the Learning Rate
new_lr = 0.0005  # Adjust learning rate
optimizer = Adam(learning_rate=new_lr)

# Step 2: Adding More Layers / Increasing Model Complexity
# Relies on Sequential / Embedding / LSTM / Dropout / Dense and on
# X_train_pad, y_train, X_val_pad, y_val from the earlier classifier cell.
model = Sequential()
# `input_length` is deprecated in Keras 3 and is not needed by the layer:
# the sequence length comes from the padded input.
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))  # Increased dropout rate to prevent overfitting
model.add(LSTM(128, return_sequences=True))  # Adding another LSTM layer
model.add(Dropout(0.3))  # Dropout after the second LSTM layer
model.add(LSTM(128))  # Final LSTM layer returns only its last output
model.add(Dense(1, activation='sigmoid'))  # Binary classification output layer

# Compile the model with the adjusted learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Step 3: Using Early Stopping
# Stop after 3 epochs without val_loss improvement and restore the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Step 4: Train the Model with Adjusted Parameters and Early Stopping
model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val), callbacks=[early_stopping])

# After training, evaluate the model on the validation set
val_loss, val_accuracy = model.evaluate(X_val_pad, y_val)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")


Epoch 1/10




[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 2s/step - accuracy: 0.9165 - loss: 0.3587 - val_accuracy: 0.9260 - val_loss: 0.2741
Epoch 2/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 2s/step - accuracy: 0.9360 - loss: 0.2338 - val_accuracy: 0.9260 - val_loss: 0.3058
Epoch 3/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 2s/step - accuracy: 0.9644 - loss: 0.1047 - val_accuracy: 0.9240 - val_loss: 0.3127
Epoch 4/10
[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 2s/step - accuracy: 0.9923 - loss: 0.0304 - val_accuracy: 0.9120 - val_loss: 0.4730
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 526ms/step - accuracy: 0.9221 - loss: 0.2863
Validation Loss: 0.2741192579269409
Validation Accuracy: 0.9259999990463257


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import re

# Step 5: Evaluating the Model - Function definition moved to the top
def evaluate_metrics(predictions, references):
    """
    Calculates average BLEU and ROUGE-L scores for predictions and references.

    Args:
        predictions: Iterable of predicted caption strings.
        references: Iterable of reference caption strings, paired positionally
            with ``predictions`` (``zip`` stops at the shorter of the two).

    Returns:
        Tuple ``(avg_bleu, avg_rouge_l)``. Returns ``(0.0, 0.0)`` when there
        are no prediction/reference pairs instead of dividing by zero.
    """
    # Imported locally so this function does not depend on an edit to the
    # cell's import lines (which only bring in `sentence_bleu`).
    from nltk.translate.bleu_score import SmoothingFunction

    # Without smoothing, a single missing n-gram order makes the sentence BLEU
    # collapse to ~0 and NLTK emits a warning for every such sentence.
    smoothing = SmoothingFunction().method1
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    bleu_scores = []
    rouge_l_scores = []

    for pred, ref in zip(predictions, references):
        pred_tokens = pred.split()
        ref_tokens = ref.split()

        # BLEU Score (single reference per prediction)
        bleu_scores.append(
            sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)
        )

        # ROUGE-L Score: RougeScorer.score takes (target, prediction)
        rouge_l_scores.append(rouge_scorer_obj.score(ref, pred)['rougeL'].fmeasure)

    if not bleu_scores:
        return 0.0, 0.0

    avg_bleu = sum(bleu_scores) / len(bleu_scores)
    avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores)

    return avg_bleu, avg_rouge_l

# Step 1: Loading the datasets
train_file, val_file = 'LABELLED_TRAIN.csv', 'LABELLED_DEV.csv'
test_file = 'UNLABELLED_TEST.csv'  # To be used once the test set is released

# Labelled splits are mandatory: read train first, then dev
train_data, val_data = (pd.read_csv(csv_path) for csv_path in (train_file, val_file))

# The unlabelled test split is optional; fall back to None when it is absent
try:
    test_data = pd.read_csv(test_file)
except FileNotFoundError:
    test_data = None
    print("Test file not found. Proceeding without test data.")

# Step 2: Preprocessing Function
def preprocess_text(text):
    """
    Preprocesses the input text:
    - Treats missing values (None / float NaN) as an empty string
    - Converts any other non-string value to its string form
    - Removes special characters (everything except letters, digits, whitespace)
    - Converts to lowercase

    Returns the cleaned string.
    """
    # Empty CSV cells are loaded as float NaN; passing that to re.sub would
    # raise TypeError, so map missing values to an empty string instead.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

# Clean the article and caption text of both labelled splits
# (all articles first, then all captions; train before dev)
for _column in ('News Article', 'Caption'):
    for _frame in (train_data, val_data):
        _frame[_column] = _frame[_column].apply(preprocess_text)

# The test split has no captions, so only its articles are cleaned
if test_data is not None:
    test_data['News Article'] = test_data['News Article'].apply(preprocess_text)

# Targets: captions encoded with their own TF-IDF vectorizer (dense arrays)
y_train, y_val = train_data['Caption'], val_data['Caption']
y_vectorizer = TfidfVectorizer(max_features=5000)
y_train_tfidf = y_vectorizer.fit_transform(y_train).toarray()
y_val_tfidf = y_vectorizer.transform(y_val).toarray()

# Step 3: Feature Extraction using TF-IDF
# Inputs: articles encoded with a separate vectorizer fitted on the training articles only
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data['News Article'])
X_val = vectorizer.transform(val_data['News Article'])

if test_data is not None:
    X_test = vectorizer.transform(test_data['News Article'])

# Multi-output Ridge regression from article TF-IDF to caption TF-IDF.
# Note: this rebinds `model`, replacing any model bound to that name earlier.
model = Ridge(alpha=1.0).fit(X_train, y_train_tfidf)

# Predicted caption TF-IDF vectors for the validation articles
val_predictions_tfidf = model.predict(X_val)

# Decoding predictions back to text (approximation)
# Using the feature names to decode predictions
def decode_tfidf_predictions(predictions, vectorizer, top_n=10):
    """Convert predicted TF-IDF values back to approximate text.

    Args:
        predictions: Iterable of 1-D weight vectors, one per document; each
            vector must support ``.argsort()`` (e.g. rows of a NumPy array).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``predictions``.
        top_n: Number of highest-weighted terms to keep per document
            (default 10, the value previously hard-coded).

    Returns:
        List of strings, one per prediction, with the selected terms joined by
        spaces in descending order of predicted weight.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    feature_names = vectorizer.get_feature_names_out()
    decoded_predictions = []
    for pred_vector in predictions:
        # argsort is ascending: reverse it first, then slice, so top_n == 0
        # yields no terms (a trailing [-0:] slice would keep every term).
        top_indices = pred_vector.argsort()[::-1][:top_n]
        decoded_predictions.append(" ".join(feature_names[i] for i in top_indices))
    return decoded_predictions

# Turn the predicted validation TF-IDF vectors back into approximate captions
val_predictions_text = decode_tfidf_predictions(val_predictions_tfidf, y_vectorizer)

# Score the decoded captions against the cleaned reference captions
val_bleu, val_rouge_l = evaluate_metrics(val_predictions_text, y_val)
print(f"Validation Set Performance - BLEU: {val_bleu:.4f}, ROUGE-L: {val_rouge_l:.4f}")


# Step 6: Predicting on Test Data (if available)
if test_data is not None:
    test_predictions_tfidf = model.predict(X_test)
    test_predictions_text = decode_tfidf_predictions(test_predictions_tfidf, y_vectorizer)
    test_data['Prediction'] = test_predictions_text

    # Submission frame: sequential IDs TEST_1, TEST_2, ... in row order.
    # NOTE(review): IDs are generated here rather than read from the test
    # file - confirm they match the IDs the contest expects.
    submission = pd.DataFrame({
        'ID': [f"TEST_{row_number}" for row_number in range(1, len(test_data) + 1)],
        'Prediction': test_predictions_text,
    })

    # Write the submission without the DataFrame index
    output_file = 'team_id.csv'  # Replace with your team ID in the file name
    submission.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

print("Good to goooo!")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Validation Set Performance - BLEU: 0.0008, ROUGE-L: 0.1417
Predictions saved to team_id.csv
Good to goooo!


In [None]:
# Step 6: Predicting on Test Data (if available)
# Uses `model`, `X_test`, `y_vectorizer` and `decode_tfidf_predictions`
# defined by the previous cell.
if test_data is not None:
    # Predict caption TF-IDF vectors and decode them into approximate text
    test_predictions_tfidf = model.predict(X_test)
    test_predictions_text = decode_tfidf_predictions(test_predictions_tfidf, y_vectorizer)

    # Keep the decoded captions alongside the test articles
    test_data['Prediction'] = test_predictions_text

    # Sequential IDs TEST_1, TEST_2, ... paired with the predictions in row order
    submission = pd.DataFrame({
        'ID': [f"TEST_{row_number}" for row_number in range(1, len(test_data) + 1)],
        'Prediction': test_predictions_text,
    })

    # File is named after the team ID; the DataFrame index is not written
    output_file = '265.csv'
    submission.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")


Predictions saved to team_id.csv
