In [9]:
import pandas as pd
def load_msr_paraphrase(path, expected_columns=5):
    """Read one tab-separated MSR Paraphrase file into a list of rows.

    The first line is treated as a header and skipped. Every following line
    is stripped, split on tabs, and kept only if it yields exactly
    ``expected_columns`` fields; other lines are reported and dropped.

    Args:
        path: Location of the ``.txt`` file to read.
        expected_columns: Number of tab-separated fields a valid row must have.

    Returns:
        A list of rows, each row being a list of ``expected_columns`` strings.
    """
    rows = []
    # Pin the encoding so decoding does not depend on the platform's locale
    # default (assumes the corpus files are UTF-8 - TODO confirm).
    with open(path, 'r', encoding='utf-8') as file:
        # Skip the header line; the default keeps an empty file from raising StopIteration
        next(file, None)
        for line in file:
            split_line = line.strip().split('\t')
            if len(split_line) == expected_columns:
                rows.append(split_line)
            else:
                print(f"Skipping line due to incorrect number of columns: {line}")
    return rows


# Column layout shared by the train and test files
columns = ["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"]

data = load_msr_paraphrase('msr_paraphrase_train.txt')
df = pd.DataFrame(data, columns=columns)

data = load_msr_paraphrase('msr_paraphrase_test.txt')
df_test = pd.DataFrame(data, columns=columns)



In [4]:
# Show the first five parsed training pairs as a quick sanity check
df.head(n=5)

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,702876,702977,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi..."
1,0,2108705,2108831,Yucaipa owned Dominick's before selling the ch...,Yucaipa bought Dominick's in 1995 for $693 mil...
2,1,1330381,1330521,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an..."
3,0,3344667,3344648,"Around 0335 GMT, Tab shares were up 19 cents, ...","Tab shares jumped 20 cents, or 4.6%, to set a ..."
4,1,1236820,1236712,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...


In [5]:
# Show the first five parsed test pairs as a quick sanity check
df_test.head(n=5)

Unnamed: 0,Quality,#1 ID,#2 ID,#1 String,#2 String
0,1,1089874,1089925,"PCCW's chief operating officer, Mike Butcher, ...",Current Chief Operating Officer Mike Butcher a...
1,1,3019446,3019327,The world's two largest automakers said their ...,Domestic sales at both GM and No. 2 Ford Motor...
2,1,1945605,1945824,According to the federal Centers for Disease C...,The Centers for Disease Control and Prevention...
3,0,1430402,1430329,A tropical storm rapidly developed in the Gulf...,A tropical storm rapidly developed in the Gulf...
4,0,3354381,3354396,The company didn't detail the costs of the rep...,But company officials expect the costs of the ...


# 2. Preprocessing

In [10]:
import re

# Function to clean text data
def clean_text(text):
    """Normalise a sentence for token-level comparison.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is trimmed. Digits and underscores
    are kept because they count as word characters.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The normalised sentence.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Derive a normalised copy of each sentence; the raw columns are left untouched
df['#1 String Cleaned'] = df['#1 String'].map(clean_text)
df['#2 String Cleaned'] = df['#2 String'].map(clean_text)


In [11]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
from collections import Counter
# Make sure the stop-word corpus is available locally before reading it
nltk.download('stopwords')

# Keep the English stop words in a set so membership tests are cheap
stop_words = {word for word in stopwords.words('english')}

# Function to remove stop words from a text
def remove_stop_words(text):
    """Drop every whitespace-separated token whose lowercase form is a stop word.

    Args:
        text: Sentence to filter (a ``str``).

    Returns:
        The remaining tokens, in their original order, joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Build stop-word-free variants of the cleaned sentences
df['#1 String Cleaned No Stop'] = df['#1 String Cleaned'].map(remove_stop_words)
df['#2 String Cleaned No Stop'] = df['#2 String Cleaned'].map(remove_stop_words)



[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources requested for lemmatization
for _resource in ('wordnet', 'omw-1.4'):
    nltk.download(_resource)

# One shared lemmatizer instance, reused for every sentence
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    No part-of-speech tag is passed, so the lemmatizer's default applies to
    every token.

    Args:
        text: Sentence to lemmatize (a ``str``).

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return ' '.join(lemmas)

# Lemmatize the cleaned sentences (stop words are still present in these columns)
df['#1 String Lemmatized'] = df['#1 String Cleaned'].map(lemmatize_text)
df['#2 String Lemmatized'] = df['#2 String Cleaned'].map(lemmatize_text)

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


# 3. Feature Engineering

In [13]:
# Absolute difference between the token counts of the two lemmatized sentences
df['Length Difference'] = (
    df['#1 String Lemmatized'].str.split().apply(len)
    .sub(df['#2 String Lemmatized'].str.split().apply(len))
    .abs()
)

# Common words ratio
def common_words_ratio(row):
    """Share of distinct lemmatized words that both sentences have in common.

    Computed as ``|A & B| / |A | B|`` over the whitespace-separated tokens of
    the two ``String Lemmatized`` fields.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'#1 String Lemmatized'`` and ``'#2 String Lemmatized'``.

    Returns:
        A float in ``[0, 1]``. When neither sentence contains any token the
        ratio is undefined, so ``0.0`` is returned instead of dividing by zero.
    """
    set1 = set(row['#1 String Lemmatized'].split())
    set2 = set(row['#2 String Lemmatized'].split())
    union = set1 | set2
    if not union:
        # Both sentences are empty after cleaning: avoid ZeroDivisionError
        return 0.0
    return len(set1 & set2) / len(union)

# Evaluate the ratio once per sentence pair (row-wise apply)
df['Common Words Ratio'] = df.apply(common_words_ratio, axis='columns')


Sentence Length Difference:

Why: Differences in sentence length can provide a simple yet effective signal of similarity. Paraphrases tend to have similar lengths.


Common Words Ratio:

Why: This feature measures the overlap of words between the two sentences, normalized by the total number of unique words. This can be a strong indicator of similarity.



In [14]:
# Function to calculate word overlap
def word_overlap(row):
    """Count the distinct stop-word-free tokens that occur in both sentences.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'#1 String Cleaned No Stop'`` and ``'#2 String Cleaned No Stop'``.

    Returns:
        The number of distinct shared tokens, as an ``int``.
    """
    first_tokens = set(row['#1 String Cleaned No Stop'].split())
    second_tokens = set(row['#2 String Cleaned No Stop'].split())
    return len(first_tokens & second_tokens)

# Function to calculate Jaccard similarity
def jaccard_similarity(row):
    """Jaccard similarity of the two sentences' stop-word-free token sets.

    Computed as ``|A & B| / |A | B|`` over the whitespace-separated tokens of
    the two ``String Cleaned No Stop`` fields.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            ``'#1 String Cleaned No Stop'`` and ``'#2 String Cleaned No Stop'``.

    Returns:
        A float in ``[0, 1]``. If both token sets are empty (for example when
        each sentence consisted only of stop words) the ratio is undefined, so
        ``0.0`` is returned instead of dividing by zero.
    """
    set1 = set(row['#1 String Cleaned No Stop'].split())
    set2 = set(row['#2 String Cleaned No Stop'].split())
    union = set1.union(set2)
    if not union:
        # Nothing left in either sentence: avoid ZeroDivisionError
        return 0.0
    intersection = set1.intersection(set2)
    return len(intersection) / len(union)

# Compute both lexical-overlap features once per sentence pair (row-wise apply)
df['Word Overlap'] = df.apply(word_overlap, axis='columns')
df['Jaccard Similarity'] = df.apply(jaccard_similarity, axis='columns')


**Word Overlap**\
The word_overlap function counts the distinct words (after stop-word removal) that the two sentences in each row have in common. It gives an idea of how much vocabulary the sentences share, which can be a strong indicator of similarity.

**Jaccard Similarity**
- Jaccard similarity measures the similarity between two sets. It is defined as the size of the intersection divided by the size of the union of the sets.
- Jaccard similarity treats the text as sets of words and measures the overlap relative to the total unique words. This is suitable for paraphrases as it focuses on the presence or absence of words, ignoring their order.
- "Jaccard similarity helps us quantify the overlap between the words used in two pieces of text by comparing the sets of unique words. This is crucial for identifying how much of the vocabulary is shared between the paraphrases, providing a clear measure of lexical similarity."


In [15]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np


# Load pre-trained SBERT model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Encode each side of every pair; row i of both matrices belongs to pair i
sentences1 = df['#1 String Cleaned'].tolist()
sentences2 = df['#2 String Cleaned'].tolist()
embeddings1 = sbert_model.encode(sentences1)
embeddings2 = sbert_model.encode(sentences2)

# Row-wise cosine similarity in one vectorised pass instead of one
# scikit-learn call per pair.
# Assumes encode() returned 2-D arrays of shape (n_pairs, dim) - TODO confirm.
_emb1 = np.asarray(embeddings1)
_emb2 = np.asarray(embeddings2)
_dot_products = np.sum(_emb1 * _emb2, axis=1)
_norm_products = np.linalg.norm(_emb1, axis=1) * np.linalg.norm(_emb2, axis=1)
# A zero-norm embedding has no direction; report similarity 0 rather than NaN
_pair_cosine = np.divide(
    _dot_products,
    _norm_products,
    out=np.zeros_like(_dot_products),
    where=_norm_products != 0,
)
# Kept as a list, one similarity per pair, as before
embedding_sim = list(_pair_cosine)

# Add Embedding Similarity as a feature
df['Embedding Similarity'] = embedding_sim






2024-06-27 14:11:10.235402: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-27 14:11:10.416871: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-27 14:11:11.013055: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/nvidia/lib:/usr/local/nvidia/lib64
2024-06-27 14:11:11.013143: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinf

- Cosine similarity is well-suited for high-dimensional data
- When using word embeddings like SBERT, cosine similarity captures semantic similarities, even if the words themselves are different.

"Cosine similarity helps us understand how similar two pieces of text are in terms of their overall meaning and context by comparing the direction (not the magnitude) of their word-frequency vectors or embeddings. This is particularly useful for identifying semantically similar paraphrases that may use different words but convey the same meaning."

# 4. Modeling


In [16]:
from sklearn.model_selection import train_test_split

# Feature matrix layout per pair:
#   [embedding of sentence 1 | embedding of sentence 2 | hand-crafted features]
X_embeddings = np.concatenate((embeddings1, embeddings2), axis=1)
additional_features = df[['Word Overlap', 'Jaccard Similarity','Embedding Similarity','Common Words Ratio','Length Difference']].to_numpy()
X_combined = np.concatenate((X_embeddings, additional_features), axis=1)

# Labels are parsed from text, so cast them to integers
y = df['Quality'].astype(int)

# Hold out 20% of the training file for evaluation.
# NOTE(review): the split is not stratified, and df_test is not used for
# evaluation in the cells shown here - confirm that both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)


## XGBoost

In [20]:
# !pip install xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Candidate hyper-parameters: 3 x 3 x 3 x 2 = 54 combinations, each scored with 5-fold CV
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    subsample=[0.8, 1.0],
)

xgb_model = XGBClassifier(random_state=42)
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)

# fit() returns the search object itself, so the winning estimator can be read off directly
best_xgb_model = grid_search.fit(X_train, y_train).best_estimator_


In [21]:
# Score the tuned XGBoost model on the held-out split.
# `grid_search` must still be the XGBoost search here: the Random Forest cell
# further down rebinds the same name.
y_pred_xgb = best_xgb_model.predict(X_test)
report_xgb = classification_report(y_test, y_pred_xgb)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(
    "XGBoost Model Results:",
    f"Best parameters: {grid_search.best_params_}",
    f"Accuracy: {accuracy_xgb:.2f}",
    "\nClassification Report:\n",
    report_xgb,
    sep="\n",
)

XGBoost Model Results:
Best parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
Accuracy: 0.78

Classification Report:

              precision    recall  f1-score   support

           0       0.66      0.52      0.58       240
           1       0.82      0.89      0.85       576

    accuracy                           0.78       816
   macro avg       0.74      0.70      0.71       816
weighted avg       0.77      0.78      0.77       816



## RF

In [22]:
import os
import warnings
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Set environment variable to suppress tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.ensemble._forest")

# Define the parameter grid: 3 x 1 x 6 x 3 x 3 = 162 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_features': ['sqrt'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42)

# Initialize Grid Search with cross-validation.
# NOTE: this rebinds `grid_search`, which until now referred to the XGBoost search.
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=0, scoring='accuracy')

# Perform the grid search; a failure is reported here and handled below
try:
    grid_search.fit(X_train, y_train)
except Exception as e:
    print(f"An error occurred during Grid Search: {e}")

# `best_estimator_` only exists once fit() has completed, so test for its
# presence: reading it on an unfitted search raises AttributeError and the
# failure branch below would never run.
if hasattr(grid_search, 'best_estimator_'):
    # Get the best hyperparameters
    best_params = grid_search.best_params_
    print("Best parameters found: ", best_params)

    # GridSearchCV refits the best configuration on all of X_train by default,
    # so the estimator is ready to use without another fit() call.
    best_rf_model = grid_search.best_estimator_

    # Predict and evaluate on the held-out split
    y_pred = best_rf_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Persist the tuned model next to the notebook
    joblib.dump(best_rf_model, 'best_rf_model.pkl')

    print(f"Accuracy: {accuracy:.2f}")
    print("\nClassification Report:\n")
    print(report)
else:
    print("Grid Search did not complete successfully.")


Best parameters found:  {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy: 0.76

Classification Report:

              precision    recall  f1-score   support

           0       0.74      0.26      0.39       240
           1       0.76      0.96      0.85       576

    accuracy                           0.76       816
   macro avg       0.75      0.61      0.62       816
weighted avg       0.75      0.76      0.71       816



**Explanation of output**
- Precision: The ratio of correctly predicted positive observations to the total predicted positives. It indicates how many of the predicted positive instances are actually positive.
- Recall: The ratio of correctly predicted positive observations to all observations in the actual class. It indicates how many of the actual positive instances are correctly predicted.
- F1-score: The harmonic mean of Precision and Recall. It takes both false positives and false negatives into account. It is especially useful when you have an uneven class distribution.


The model performs significantly better at identifying true paraphrases (class 1) than non-paraphrases (class 0).\
The recall for non-paraphrases is quite low (0.26), indicating the model misses many actual non-paraphrase pairs and labels them as paraphrases.\
The overall accuracy is 76%, meaning the model correctly classifies the pairs 76% of the time (the 0.71 in the report is the weighted-average F1-score).\
The weighted averages suggest that the model performs reasonably well when considering the class distribution, but improvements are needed for better recall on the non-paraphrase class.