In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import log_loss
import xgboost as xgb
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sentence_transformers import SentenceTransformer


  from tqdm.autonotebook import tqdm, trange


# 1. Preprocess Data

In [2]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

#Since mac sucks use a subset
train_data = train_data.sample(1000, random_state=42)
test_data = test_data.sample(1000, random_state=42)

# Fill missing values with empty strings
train_data['question1'].fillna('', inplace=True)
train_data['question2'].fillna('', inplace=True)
test_data['question1'].fillna('', inplace=True)
test_data['question2'].fillna('', inplace=True)

# Function to preprocess text
def preprocess_text(text):
    text = re.sub(r'\W', ' ', str(text))  # Remove non-word characters
    text = text.lower()  # Convert to lowercase
    return text.strip()  # Remove leading/trailing whitespace

# Apply preprocessing
train_data['question1'] = train_data['question1'].apply(preprocess_text)
train_data['question2'] = train_data['question2'].apply(preprocess_text)
test_data['question1'] = test_data['question1'].apply(preprocess_text)
test_data['question2'] = test_data['question2'].apply(preprocess_text)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['question1'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['question2'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting va

Steps:
- Removing non-word characters using regular expressions.
- Converting text to lowercase for uniformity.
- Splitting the text into tokens (words).

# 2. **Sentence** Embeddings

In [3]:

# This class loads the SBERT model, which is pre-trained to generate embeddings that capture the semantic meaning of sentences.
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def encode_sentence(sentence, model):
    if sentence == '':
        print("Empty sentence encountered, returning zero vector.")
        return np.zeros(384) 
    return model.encode(sentence)
# If the sentence is not empty, it uses the SBERT model to generate a 384-dimensional embedding that captures the sentence's semantic meaning.


train_data['q1_sbert'] = train_data['question1'].apply(lambda x: encode_sentence(x, sbert_model))
train_data['q2_sbert'] = train_data['question2'].apply(lambda x: encode_sentence(x, sbert_model))
test_data['q1_sbert'] = test_data['question1'].apply(lambda x: encode_sentence(x, sbert_model))
test_data['q2_sbert'] = test_data['question2'].apply(lambda x: encode_sentence(x, sbert_model))

# Calculate cosine similarity between SBERT embeddings
train_data['sbert_sim'] = train_data.apply(lambda row: cosine_similarity([row['q1_sbert']], [row['q2_sbert']])[0][0], axis=1)
test_data['sbert_sim'] = test_data.apply(lambda row: cosine_similarity([row['q1_sbert']], [row['q2_sbert']])[0][0], axis=1)





**Why Calculate Cosine Similarity?**

Measure Semantic Similarity: Cosine similarity quantifies how close two vectors are in the high-dimensional space. For sentence embeddings, this translates to how similar the meanings of two sentences are. A higher cosine similarity indicates that the sentences are more similar in meaning.

Feature for Classification: In the context of the Quora question pairs problem, the cosine similarity between the embeddings of two questions can be used as a feature to help a machine learning model determine whether the questions are duplicates.

- Output Shape: When cosine_similarity is called with these inputs, the result is a 1x1 matrix, where the only entry represents the cosine similarity between the single vector in the first list and the single vector in the second list.
- Extracting the Scalar Value: To get this single similarity value, you need to extract the first (and only) element from this 1x1 matrix. This is achieved with [0][0].

# 3. Additional Features

In [4]:
# Combine all questions for TF-IDF vectorization
all_questions = pd.concat([train_data['question1'], train_data['question2'], test_data['question1'], test_data['question2']], axis=0)

# TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000)
tfidf.fit(all_questions)

train_q1_tfidf = tfidf.transform(train_data['question1'])
train_q2_tfidf = tfidf.transform(train_data['question2'])
test_q1_tfidf = tfidf.transform(test_data['question1'])
test_q2_tfidf = tfidf.transform(test_data['question2'])

# Create feature vectors by taking the absolute difference of TF-IDF vectors
train_features_tfidf = abs(train_q1_tfidf - train_q2_tfidf)
test_features_tfidf = abs(test_q1_tfidf - test_q2_tfidf)

# Calculate number of common words and normalize by total unique words

def common_words(q1, q2):
    return len(set(q1.split()) & set(q2.split()))

train_data['common_words'] = train_data.apply(lambda row: common_words(row['question1'], row['question2']), axis=1)
test_data['common_words'] = test_data.apply(lambda row: common_words(row['question1'], row['question2']), axis=1)

**Common Words:** \
Purpose: Measures the number of words that are shared between the two questions.\
Meaning: A higher number of common words indicates that the questions might be asking about the same topic.

**Common Word Ratio:** \
Purpose: Normalizes the common word count by the total number of unique words in both questions.\
Meaning: This ratio gives a normalized measure of word overlap, accounting for the length of the questions.

# 4.1 Modeling

In [5]:
# Combine features
train_features = pd.DataFrame()
train_features['sbert_sim'] = train_data['sbert_sim']
train_features['common_words'] = train_data['common_words']
# train_features['common_word_ratio'] = train_data['common_word_ratio']

test_features = pd.DataFrame()
test_features['sbert_sim'] = test_data['sbert_sim']
test_features['common_words'] = test_data['common_words']
# test_features['common_word_ratio'] = test_data['common_word_ratio']

# Add TF-IDF features
tfidf_train_features_df = pd.DataFrame(train_features_tfidf.toarray())
tfidf_test_features_df = pd.DataFrame(test_features_tfidf.toarray())

train_features = pd.concat([train_features.reset_index(drop=True), tfidf_train_features_df.reset_index(drop=True)], axis=1)
test_features = pd.concat([test_features.reset_index(drop=True), tfidf_test_features_df.reset_index(drop=True)], axis=1)

# Prepare feature matrix and target variable
X = train_features
y = train_data['is_duplicate']

# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


# 4.2 XGBoost

In [6]:
# Train XGBoost model
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train)

# Validate the model
val_predictions_xgb = xgb_model.predict_proba(X_val)[:, 1]
print(f'Validation Log Loss (XGBoost): {log_loss(y_val, val_predictions_xgb)}')


Validation Log Loss (XGBoost): 0.4156219731899407


# 4.3 NN

In [9]:
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import BatchNormalization

# Define the improved neural network model
nn_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(), 
    Dropout(0.4), 
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])
"""
Explanation of neural network structure:
- Dense Layers: Each Dense layer is a fully connected layer where every neuron is connected to every neuron in the next layer.
- Activation Function: relu (Rectified Linear Unit) is used for hidden layers because it helps with non-linearity and is computationally efficient.
- Batch Normalization: Added after each hidden layer to normalize the inputs to each layer.
- Dropout: Set to 40% to ignore 40% of the neurons during training, which helps prevent overfitting.
- Output Layer: Uses sigmoid activation for binary classification, producing a probability between 0 and 1.

"""

# Compile the model
nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Add callbacks for better training
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.001)
early_stop = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True, monitor='val_loss', mode='min')

"""
- ReduceLROnPlateau: Reduces the learning rate by a factor of 0.2 if the validation loss does not improve for 3 epochs, with a minimum learning rate of 0.001.
- EarlyStopping: Stops training if the validation loss does not improve for 5 epochs.
- ModelCheckpoint: Saves the model with the best validation loss during training.
"""

# Train the neural network model
nn_model.fit(X_train, y_train, epochs=50, batch_size=256, validation_data=(X_val, y_val),
             callbacks=[reduce_lr, early_stop, model_checkpoint],verbose = 0)

# Load the best model
nn_model.load_weights('best_model.keras')

# Validate the model
val_predictions_nn = nn_model.predict(X_val).flatten()
print(f'Validation Log Loss (Neural Network): {log_loss(y_val, val_predictions_nn)}')


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Validation Log Loss (Neural Network): 0.6259429905183549


What are Callbacks?

Callbacks are functions that can be applied at different stages of the training process. They are used to customize the behavior of the training loop.

Why use Callbacks?

- Reduce Learning Rate on Plateau: The ReduceLROnPlateau callback reduces the learning rate when the validation loss stops improving. This helps the model converge better by taking smaller steps during training.
- Early Stopping: The EarlyStopping callback stops training when the validation loss stops improving. This prevents the model from overfitting by stopping training at the optimal point.
- Model Checkpointing: The ModelCheckpoint callback saves the best model during training based on validation loss. This ensures that you keep the best-performing model.

Epochs and Batch Size:
- Exploration Phase: Initially, we set 50 epochs to allow the model enough time to learn from the data. However, we use the EarlyStopping callback to prevent overfitting by stopping the training early if the validation loss does not improve.
- Early Stopping: This means the model will likely not train for all 50 epochs. Instead, it will stop as soon as the validation loss stops improving, making the number of epochs more flexible and dependent on the data and the model's learning behavior.

What is Batch Size?

The batch size is the number of training samples used to compute the gradient update during training. It defines how many samples to use before updating the model's internal parameters.


# 5. Prediction

In [8]:
# Make predictions using XGBoost model
test_predictions_xgb = xgb_model.predict_proba(test_features)[:, 1]

# Make predictions using Neural Network model
test_predictions_nn = nn_model.predict(test_features).flatten()

# Average predictions
test_predictions = (test_predictions_xgb + test_predictions_nn) / 2

# Prepare submission file
submission = pd.DataFrame({'test_id': test_data['test_id'], 'is_duplicate': test_predictions})
submission.to_csv('submission.csv', index=False)
submission = pd.DataFrame({'test_id': test_data['test_id'], 'is_duplicate': test_predictions_xgb})
submission.to_csv('submission_xgb.csv', index=False)
submission = pd.DataFrame({'test_id': test_data['test_id'], 'is_duplicate': test_predictions_nn})
submission.to_csv('submission_nn.csv', index=False)


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
