<a href="https://colab.research.google.com/github/realYashJoshi/PS4-Epochers50/blob/main/training_and_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**EPOCHERS 50 PS4**

TRAINING FILE

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score
import joblib
import re


data = pd.read_csv('training_data.csv')


train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
#Basic text Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

#Count vectorizer or Bag of words method used for feauture extraction
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

X_train = vectorizer.fit_transform(train_data['Question'] + '[SEP] ' + train_data['Paragraph'])
y_train = train_data['Answer_possible']
#Here we used a XGB classifier algorithm which works exceptionally well on Tabular data
classifier = xgb.XGBClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train, y_train)


X_val = vectorizer.transform(val_data['Question'] + '[SEP] ' + val_data['Paragraph'])
y_val = val_data['Answer_possible']

actual_yes_count = np.sum(y_val == 1)
actual_no_count = np.sum(y_val == 0)

print(f"Actual 'Yes' Count: {actual_yes_count}")
print(f"Actual 'No' Count: {actual_no_count}")


val_predictions = classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.2f}")


num_predicted_yes = np.sum(val_predictions == 1)
num_predicted_no = np.sum(val_predictions == 0)

print(f"Predicted 'Yes' Count: {num_predicted_yes}")
print(f"Predicted 'No' Count: {num_predicted_no}")
#Saving and Exporting the model
joblib.dump(classifier, 'xgboost_classifier.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

Actual 'Yes' Count: 6371
Actual 'No' Count: 2801
Validation Accuracy: 0.70
Predicted 'Yes' Count: 8350
Predicted 'No' Count: 822


['vectorizer.pkl']

INFERENCE FILE

In [11]:
import pandas as pd
import xgboost as xgb
import joblib
import re
from transformers import pipeline, DistilBertTokenizer, TFDistilBertForQuestionAnswering
import tensorflow as tf

#  Here we load the trained XGBoost model and CountVectorizer
classifier = joblib.load('xgboost_classifier.pkl')
vectorizer = joblib.load('vectorizer.pkl')

def preprocess_text(text): #For text Preprocessing
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

# Load the training data
training_data = pd.read_csv('training_data.csv')

# Select random 500 entries for testing as test data wasnt specified in problem statement
test_data = training_data.sample(n=500, random_state=42)

# Initialize a DistilBERT tokenizer and model for question answering, this is used for Transer-Learning process which uses SQUAD Model.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')

# Initializing a question answering pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Initializing counters
count_predictions_yes = 0
count_predictions_no = 0

# Initializing lists to store the results
questions = []
paragraphs = []
answer_starts = []
answer_texts = []

# Iterating through the testing data
for index, row in test_data.iterrows():
    question = row['Question']
    paragraph = row['Paragraph']

    # Preprocessing the text
    question = preprocess_text(question)
    paragraph = preprocess_text(paragraph)

    # Vectorizing the input using the CountVectorizer
    input_text = question + '[SEP] ' + paragraph
    X_test = vectorizer.transform([input_text])

    # Making predictions using the XGBoost model
    prediction = classifier.predict(X_test)[0]

    if prediction == 1:
        count_predictions_yes += 1
        # If the prediction is 'Yes' (1),  we use the question answering pipeline
        answer = qa_pipeline(question=question, context=paragraph)

        # Extract the answer text and answer_start
        answer_text = answer['answer']
        answer_start = paragraph.find(answer_text) if answer_text else None
    else:
        count_predictions_no += 1
        # If the prediction is 'No' (0),  we leave the answer fields empty
        answer_text = None
        answer_start = None

    questions.append(question)
    paragraphs.append(paragraph)
    answer_starts.append(answer_start)
    answer_texts.append(answer_text)

# To Create DataFrame with the results
results_df = pd.DataFrame({
    'Question': questions,
    'Paragraph': paragraphs,
    'Answer_start': answer_starts,
    'Answer_text': answer_texts
})

# Saving the results to a CSV file
results_df.to_csv('predictions.csv', index=False)

# Printing  the counts of predictions '1' and '0'
print(f"Number of Predictions '1': {count_predictions_yes}")
print(f"Number of Predictions '0': {count_predictions_no}")


All PyTorch model weights were used when initializing TFDistilBertForQuestionAnswering.

All the weights of TFDistilBertForQuestionAnswering were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForQuestionAnswering for predictions without further training.


Number of Predictions '1': 461
Number of Predictions '0': 39
