In [1]:
# This will delete the entire folder and everything inside it.
!rm -r question-answer-bert-squad

rm: cannot remove 'question-answer-bert-squad': No such file or directory


In [2]:
# Cell 1: Configure Git and Clone Repository
import os
from kaggle_secrets import UserSecretsClient

# Get GitHub token from Kaggle Secrets
user_secrets = UserSecretsClient()
github_token = user_secrets.get_secret("github_token")

# Configure Git with your email and username
!git config --global user.email "soumyagrandhi145@gmail.com"
!git config --global user.name "soumya"

# Clone the repository using the token
repo_url = f"https://soumya-grandhi:{github_token}@github.com/soumya-grandhi/question-answer-bert-squad.git"
!git clone {repo_url}

# Change directory into the cloned repo
os.chdir("question-answer-bert-squad")
print("Changed working directory to:", os.getcwd())
!ls -a

Cloning into 'question-answer-bert-squad'...
remote: Enumerating objects: 13, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 13 (delta 1), reused 13 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (13/13), 992.12 KiB | 10.12 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Changed working directory to: /kaggle/working/question-answer-bert-squad
.  ..  .git  outputs  requirements.txt


In [3]:
# Cell 2: Create requirements.txt and install packages using standard Python
requirements_content = """
torch
transformers>=4.40
datasets
pandas
"""

with open("requirements.txt", "w") as f:
    f.write(requirements_content.strip())
    
print("Successfully created requirements.txt")
!cat requirements.txt

Successfully created requirements.txt
torch
transformers>=4.40
datasets
pandas

In [4]:
# Cell 3: Install from requirements.txt
!pip install -r requirements.txt

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->-r requirements.txt (line 1))
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->-r requirements.txt (line 1))
  Downl

In [5]:
# Cell 4: Create project directories
import os

project_dirs = ["notebooks", "src", "outputs"]
for dir_name in project_dirs:
    os.makedirs(dir_name, exist_ok=True)
    
print("Created directories:", [d for d in os.listdir() if os.path.isdir(d)])

Created directories: ['notebooks', 'outputs', 'src', '.git']


In [6]:
# Cell 5: Stage, commit, and push to GitHub
!git add .
!git commit -m "feat: initial project setup and requirements"
!git push origin main

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date


2. DATA LOADING AND INITIAL EXPLORATION

2.1 Load the Squad Dataset

In [7]:
# Cell 1: Load SQuAD dataset
import os
from datasets import load_dataset, Dataset

# The datasets library automatically downloads and caches the data
squad_dataset = load_dataset("squad_v2")

# Print the dataset structure
print(squad_dataset)

# Show a sample from the training split
print("\nSample from training split:")
print(squad_dataset["train"][0])

README.md: 0.00B [00:00, ?B/s]

squad_v2/train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

squad_v2/validation-00000-of-00001.parqu(…):   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

Sample from training split:
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard 

2.2 Basic Data Exploration

In [8]:
# Cell 2: Explore dataset statistics
train_data = squad_dataset["train"]
val_data = squad_dataset["validation"]

# Number of samples
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(val_data)}")

# Get the distribution of answer lengths
train_data_df = train_data.to_pandas()
answer_lengths = train_data_df['answers'].apply(lambda x: len(x['text'][0]) if x['text'] else 0)
print("\nAnswer length statistics:")
print(answer_lengths.describe())

# Check for unanswerable questions in SQuAD v2
unanswerable_questions = sum(1 for answers in train_data['answers'] if len(answers['text']) == 0)
print(f"\nNumber of unanswerable questions in training data: {unanswerable_questions}")
print(f"Percentage of unanswerable questions: {unanswerable_questions / len(train_data) * 100:.2f}%")

Number of training examples: 130319
Number of validation examples: 11873


  answer_lengths = train_data_df['answers'].apply(lambda x: len(x['text'][0]) if x['text'] else 0)



Answer length statistics:
count    130319.00000
mean         13.42376
std          20.01709
min           0.00000
25%           0.00000
50%           7.00000
75%          17.00000
max         239.00000
Name: answers, dtype: float64

Number of unanswerable questions in training data: 43498
Percentage of unanswerable questions: 33.38%


2.3 Save a Processed Sample

In [9]:
# Cell 3: Save a small sample of the dataset
# This is useful for rapid prototyping and debugging
small_train_data = train_data.select(range(10000))
small_val_data = val_data.select(range(1000))

# Save the sampled datasets to a local directory
small_train_data.save_to_disk("./outputs/small_train_data")
small_val_data.save_to_disk("./outputs/small_val_data")

print("Saved small training and validation subsets to the 'outputs' directory.")

Saving the dataset (0/1 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

Saved small training and validation subsets to the 'outputs' directory.


3. DATA PREPROCESSING FOR BERT

3.1 Load Tokenizer and Define Preprocessing Function

In [10]:
# Cell 1: Load the tokenizer
import os
from transformers import AutoTokenizer

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

print("Tokenizer loaded successfully!")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Tokenizer loaded successfully!


In [11]:
# Cell 2: Define the preprocessing function
from datasets import load_dataset
import numpy as np

# Load the full dataset (not the sample) for training
squad_dataset = load_dataset("squad_v2")

max_length = 384  # The maximum length of a feature (question and context)
doc_stride = 128   # The authorized overlap between two parts of the context

def prepare_train_features(examples):
    # Some questions have lots of whitespace, which we remove
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride.
    # This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit with the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second", # Truncate the context only
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features, we need a map from a feature to its original example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label all our examples.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We find the example to which the current feature belongs.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        # If no answers are given, the feature is unanswerable.
        if len(answers["text"]) == 0:
            tokenized_examples["start_positions"].append(0)
            tokenized_examples["end_positions"].append(0)
        else:
            # We get the start and end character positions of the answer in the original text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # We find the start and end token indices of the answer.
            sequence_ids = tokenized_examples.sequence_ids(i)
            
            # Find the start and end of the context
            context_start_token_index = 0
            while sequence_ids[context_start_token_index] != 1:
                context_start_token_index += 1
            
            context_end_token_index = len(sequence_ids) - 1
            while sequence_ids[context_end_token_index] != 1:
                context_end_token_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with 0)
            if not (offsets[context_start_token_index][0] <= start_char and offsets[context_end_token_index][1] >= end_char):
                tokenized_examples["start_positions"].append(0)
                tokenized_examples["end_positions"].append(0)
            else:
                # Otherwise, find the start and end token indices of the answer
                token_start_index = context_start_token_index
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)

                token_end_index = context_end_token_index
                while token_end_index >= 0 and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

print("Preprocessing function defined.")

Preprocessing function defined.


3.2 Apply the preprocessing to the Dataset

In [12]:
# Cell 3: Apply the function to the dataset
# This will take some time, especially for the training set
tokenized_squad = squad_dataset.map(
    prepare_train_features, 
    batched=True,
    remove_columns=squad_dataset["train"].column_names
)

print("\nDataset tokenization complete!")
print(tokenized_squad)

Map:   0%|          | 0/130319 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]


Dataset tokenization complete!
DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 131754
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 12134
    })
})


3.3 Save the tokenized Dataset

In [13]:
# Cell 4: Save the tokenized dataset to the outputs directory
tokenized_squad["train"].save_to_disk("./outputs/tokenized_squad_train")
tokenized_squad["validation"].save_to_disk("./outputs/tokenized_squad_validation")

print("Tokenized dataset saved to ./outputs/.")

Saving the dataset (0/1 shards):   0%|          | 0/131754 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/12134 [00:00<?, ? examples/s]

Tokenized dataset saved to ./outputs/.


4. Model Training

4.1 Load Dataset and Model

In [14]:
# Cell 4: Load Tokenized Dataset and Model
import os
import torch
from datasets import load_from_disk
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the tokenized dataset from the 'outputs' directory
tokenized_squad_train = load_from_disk("./outputs/tokenized_squad_train")
tokenized_squad_val = load_from_disk("./outputs/tokenized_squad_validation")

# Load the pre-trained BERT model for question answering
model_checkpoint = "bert-base-uncased"
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

print("Dataset and model loaded successfully!")

2025-09-30 15:39:51.608844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759246791.804528      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759246791.861315      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dataset and model loaded successfully!


4.2 Define Training Arguments and Trainer

In [15]:
# Cell 5: Define Training Arguments and Trainer
batch_size = 16
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}-finetuned-squad",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2, # Typically 2-3 epochs is sufficient for SQuAD
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
    save_strategy="epoch",
    save_total_limit=1,
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_squad_train,
    eval_dataset=tokenized_squad_val,
    tokenizer=tokenizer, # Important: use the same tokenizer as preprocessing
)

print("Trainer initialized.")

  trainer = Trainer(


Trainer initialized.


4.3 Train the Model

In [16]:
# Cell 6: Start training
trainer.train()

print("Training complete!")



Epoch,Training Loss,Validation Loss
1,1.1181,1.080566
2,0.8592,1.125302




Training complete!


4.4 Save the Final Model and Predictions

In [17]:
final_model_path = "./outputs/final_model"
trainer.save_model(final_model_path)
print(f"Final model saved to {final_model_path}")

Final model saved to ./outputs/final_model


5. Model Evaluation and Inference

5.1 Run Evaluation

In [21]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [23]:
# Cell 8: Run final evaluation (Corrected to use original dataset for text)
from evaluate import load
from transformers import pipeline

# The metrics are part of the HuggingFace 'evaluate' library
metric = load("squad_v2")

# Define the path to your saved model
final_model_path = "./outputs/final_model"

# Create a question-answering pipeline with our fine-tuned model
# Ensure the tokenizer is loaded from the start of the notebook
qa_pipeline = pipeline(
    "question-answering",
    model=final_model_path,
    tokenizer=tokenizer,
    device=0 # Use GPU
)

# Use the ORIGINAL SQuAD validation set for the question and context text
original_val_data = squad_dataset["validation"]

print("\nSample Predictions:")
num_samples = 10
for i in range(num_samples):
    question = original_val_data['question'][i]
    context = original_val_data['context'][i]
    
    # Get prediction from the QA pipeline
    result = qa_pipeline(
        question=question,
        context=context
    )
    
    print(f"Question: {question}")
    print(f"Ground Truth Answer: {original_val_data['answers'][i]['text'] if original_val_data['answers'][i]['text'] else 'Unanswerable'}")
    print(f"Predicted Answer: {result['answer']}")
    print("-" * 20)

Device set to use cuda:0



Sample Predictions:
Question: In what country is Normandy located?
Ground Truth Answer: ['France', 'France', 'France', 'France']
Predicted Answer: France
--------------------
Question: When were the Normans in Normandy?
Ground Truth Answer: ['10th and 11th centuries', 'in the 10th and 11th centuries', '10th and 11th centuries', '10th and 11th centuries']
Predicted Answer: 10th and 11th centuries
--------------------
Question: From which countries did the Norse originate?
Ground Truth Answer: ['Denmark, Iceland and Norway', 'Denmark, Iceland and Norway', 'Denmark, Iceland and Norway', 'Denmark, Iceland and Norway']
Predicted Answer: Denmark, Iceland and Norway
--------------------
Question: Who was the Norse leader?
Ground Truth Answer: ['Rollo', 'Rollo', 'Rollo', 'Rollo']
Predicted Answer: Rollo
--------------------
Question: What century did the Normans first gain their separate identity?
Ground Truth Answer: ['10th century', 'the first half of the 10th century', '10th', '10th']
Pred

5.2 Test with a Custom Example

In [24]:
# Cell 9: Custom inference
custom_context = """
The Amazon River is the largest river by discharge volume of water in the world, and it is a popular destination for tourists. It is located in South America and flows through Brazil, Peru, and Colombia. The river is home to a wide variety of wildlife, including the Amazon river dolphin and many species of fish.
"""
custom_question = "Which continent is the Amazon River in?"

# Use the pipeline to get the answer
result = qa_pipeline(question=custom_question, context=custom_context)

print("Custom Question:", custom_question)
print("Context:", custom_context)
print("\nPredicted Answer:", result['answer'])
print("Confidence Score:", round(result['score'], 4))

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Custom Question: Which continent is the Amazon River in?
Context: 
The Amazon River is the largest river by discharge volume of water in the world, and it is a popular destination for tourists. It is located in South America and flows through Brazil, Peru, and Colombia. The river is home to a wide variety of wildlife, including the Amazon river dolphin and many species of fish.


Predicted Answer: South America
Confidence Score: 0.6236


In [26]:
# Cell 10 (FINAL CLEANED PUSH): Commit artifacts and push
import os

# Ensure we are in the correct directory (optional check)
try:
    os.chdir("question-answer-bert-squad")
except FileNotFoundError:
    pass

# Stage all changes (This stages the trained model and any other artifact/scripts)
!git add .

# Commit with a descriptive message
!git commit -m "feat: FINAL COMMIT - uploaded trained model and evaluation artifacts."

# Push to GitHub
!git push origin main

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 26, done.
Counting objects: 100% (26/26), done.
Delta compression using up to 4 threads
Compressing objects: 100% (22/22), done.
^Citing objects:  25% (6/24)


In [28]:
# Final command to send your model and artifacts to GitHub
!git push origin main

Enumerating objects: 26, done.
Counting objects: 100% (26/26), done.
Delta compression using up to 4 threads
Compressing objects: 100% (22/22), done.
Writing objects: 100% (24/24), 1.11 GiB | 23.84 MiB/s, done.
Total 24 (delta 3), reused 1 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (3/3), done.[K
remote: [1;31merror[m: Trace: f0ffd41266500fd338cdfd8899e048c0316ba1fd93b16e9117915167bee3d36d[K
remote: [1;31merror[m: See https://gh.io/lfs for more information.[K
remote: [1;31merror[m: File bert-base-uncased-finetuned-squad/checkpoint-8236/model.safetensors is 415.42 MB; this exceeds GitHub's file size limit of 100.00 MB[K
remote: [1;31merror[m: File outputs/tokenized_squad_train/data-00000-of-00001.arrow is 293.08 MB; this exceeds GitHub's file size limit of 100.00 MB[K
remote: [1;31merror[m: File bert-base-uncased-finetuned-squad/checkpoint-8236/optimizer.pt is 830.95 MB; this exceeds GitHub's file size limit of 100.00 MB[K
remote: [1;31merror[m: GH001: Lar

In [32]:
# Cell 11 (DEFINITIVE FIX): Create .gitignore using standard Python
import os

gitignore_content = """
# Ignore HuggingFace training checkpoints
bert-base-uncased-finetuned-squad/
# Ignore large tokenized datasets
outputs/tokenized_squad_train/
outputs/tokenized_squad_validation/
# Ignore the final saved model in outputs
outputs/final_model/
"""

with open(".gitignore", "w") as f:
    f.write(gitignore_content.strip())

print("Successfully created .gitignore file.")
!cat .gitignore # Verify content

Successfully created .gitignore file.
# Ignore HuggingFace training checkpoints
bert-base-uncased-finetuned-squad/
# Ignore large tokenized datasets
outputs/tokenized_squad_train/
outputs/tokenized_squad_validation/
# Ignore the final saved model in outputs
outputs/final_model/

In [33]:
# Cell 12: Remove large files from Git history and push
# This removes the large files from Git's index without deleting them locally
!git rm -r --cached bert-base-uncased-finetuned-squad/
!git rm -r --cached outputs/tokenized_squad_train/
!git rm -r --cached outputs/tokenized_squad_validation/
!git rm -r --cached outputs/final_model/

# Commit the cleanup and the .gitignore file
!git add .gitignore
!git commit -m "fix: exclude large model and data artifacts from git tracking"

# Final push (This should resolve the large file error and succeed)
!git push origin main

rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/config.json'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/model.safetensors'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/optimizer.pt'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/rng_state.pth'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/scheduler.pt'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/special_tokens_map.json'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/tokenizer.json'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/tokenizer_config.json'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/trainer_state.json'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/training_args.bin'
rm 'bert-base-uncased-finetuned-squad/checkpoint-8236/vocab.txt'
rm 'outputs/tokenized_squad_train/data-00000-of-00001.arrow'
rm 'outputs/tokenized_squad_train/dataset_info.json'
rm 'outputs/tokenized_squad_train/state.json'
rm 'outputs/tokenized_squad_validation/data-00000-of-0000

In [34]:
# Cell 13: Squash last two commits into one clean commit
# This rewrites history to remove the large files completely
!git reset --soft HEAD~2
!git commit -m "feat: complete project code and documentation (EXCLUDING large artifacts)"

[main d7bc50a] feat: complete project code and documentation (EXCLUDING large artifacts)
 1 file changed, 7 insertions(+)
 create mode 100644 .gitignore


In [35]:
# Cell 14: Force push to overwrite remote history with the clean commit
!git push origin main --force

Enumerating objects: 4, done.
Counting objects: 100% (4/4), done.
Delta compression using up to 4 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 513 bytes | 513.00 KiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/soumya-grandhi/question-answer-bert-squad.git
   8a19a0b..d7bc50a  main -> main
