In [5]:
import os
import numpy as np
import pandas as pd
import nltk
import string
from nltk.stem.porter import PorterStemmer
import re
nltk.download('punkt')
nltk.download('stopwords')
stemmer = PorterStemmer()
stopwords = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# load data
df = pd.read_csv('/content/train.csv.zip')
df.dropna(how="any").reset_index(drop=True)

# train validate test split 70:20:10
from sklearn.model_selection import train_test_split
X_train_q1, X_test_q1, X_train_q2, X_test_q2, y_train, y_test = train_test_split(df['question1'], df['question2'], df['is_duplicate'], test_size=0.3, random_state=42, stratify=df['is_duplicate'])
X_val_q1, X_test_q1, X_val_q2, X_test_q2, y_val, y_test = train_test_split(X_test_q1, X_test_q2, y_test, test_size=(1/3), random_state=42, stratify=y_test)


In [7]:
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)
# ratio of duplicate questions in train, validation and test set
print("Ratio of duplicate questions in the splits")
print("Train set: ", y_train.sum()/len(y_train))
print("Validation set: ", y_val.sum()/len(y_val))
print("Test set: ", y_test.sum()/len(y_test))

Ratio of duplicate questions in the splits
Train set:  0.36919749967314835
Validation set:  0.36920279997031835
Test set:  0.369190432610255


In [8]:
INPUT = './input/'
TRAIN_LINEAR_PATH = INPUT + 'train_linear.csv.zip'
TEST_LINEAR_PATH = INPUT + 'test_linear.csv.zip'
VAL_LINEAR_PATH = INPUT + 'val_linear.csv.zip'

In [9]:
train_linear = pd.DataFrame({'question1': X_train_q1, 'question2': X_train_q2, 'is_duplicate': y_train})
val_linear = pd.DataFrame({'question1': X_val_q1, 'question2': X_val_q2, 'is_duplicate': y_val})
test_linear = pd.DataFrame({'question1': X_test_q1, 'question2': X_test_q2, 'is_duplicate': y_test})
allQuestions = pd.concat((train_linear['question1'], train_linear['question2'])).reset_index(drop=True).astype(str)

In [10]:
# dump split files
os.makedirs(INPUT, exist_ok=True)
train_linear.to_csv(TRAIN_LINEAR_PATH, index=False, compression='zip')
val_linear.to_csv(VAL_LINEAR_PATH, index=False, compression='zip')
test_linear.to_csv(TEST_LINEAR_PATH, index=False, compression='zip')

In [11]:
train_linear = pd.read_csv(TRAIN_LINEAR_PATH)
val_linear = pd.read_csv(VAL_LINEAR_PATH)
test_linear = pd.read_csv(TEST_LINEAR_PATH)
X_train_q1, X_train_q2, y_train = train_linear['question1'].astype('U').values, train_linear['question2'].astype('U').values, train_linear['is_duplicate'].values
X_val_q1, X_val_q2, y_val = val_linear['question1'].astype('U').values, val_linear['question2'].astype('U').values, val_linear['is_duplicate'].values
X_test_q1, X_test_q2, y_test = test_linear['question1'].astype('U').values, test_linear['question2'].astype('U').values, test_linear['is_duplicate'].values

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack as sparse_hstack, vstack as sparse_vstack, save_npz, load_npz

In [13]:
stemmer = PorterStemmer()

def tokenize(text: str) -> list[str]:
    tokens = nltk.word_tokenize(re.sub(r'[^\x00-\x7F]+',' ', text))
    tokens = [stemmer.stem(w) for w in tokens if stemmer.stem(w) not in stopwords]
    return tokens


In [14]:
N_GRAMS_PATH = './n_gram_features/'
UNIGRAM_PATH = N_GRAMS_PATH + 'unigrams_linear/'
BIGRAM_PATH = N_GRAMS_PATH + 'bigrams_linear/'
TRIGRAM_PATH = N_GRAMS_PATH + 'trigrams_linear/'
os.makedirs(N_GRAMS_PATH, exist_ok=True)
os.makedirs(UNIGRAM_PATH, exist_ok=True)
os.makedirs(BIGRAM_PATH, exist_ok=True)
os.makedirs(TRIGRAM_PATH, exist_ok=True)

### Creating Unigram features

In [15]:
unigramVectorizer = CountVectorizer(
                        analyzer='word',
                        ngram_range=(1,1),
                        lowercase=True,
                        tokenizer=tokenize
                    )

unigramVectorizer.fit(allQuestions)
q1_train = unigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = unigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_unigram = sparse_hstack([q1_train, q2_train])
q1_val = unigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = unigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_unigram = sparse_hstack([q1_val, q2_val])
q1_test = unigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = unigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_unigram = sparse_hstack([q1_test, q2_test])

save_npz(UNIGRAM_PATH + "train.npz", X_train_unigram)
save_npz(UNIGRAM_PATH + "val.npz", X_val_unigram)
save_npz(UNIGRAM_PATH + "test.npz", X_test_unigram)



### Creating Bigram features

In [16]:
bigramVectorizer = CountVectorizer(
                        analyzer='word',
                        ngram_range=(1,2),
                        lowercase=True,
                        tokenizer=tokenize
                    )

bigramVectorizer.fit(allQuestions)
q1_train = bigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = bigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_bigram = sparse_hstack([q1_train, q2_train])
q1_val = bigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = bigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_bigram = sparse_hstack([q1_val, q2_val])
q1_test = bigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = bigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_bigram = sparse_hstack([q1_test, q2_test])

save_npz(BIGRAM_PATH + "train.npz", X_train_bigram)
save_npz(BIGRAM_PATH + "val.npz", X_val_bigram)
save_npz(BIGRAM_PATH + "test.npz", X_test_bigram)



### Creating Trigram features

In [17]:
trigramVectorizer = CountVectorizer(
                        analyzer='word',
                        ngram_range=(1,3),
                        lowercase=True,
                        tokenizer=tokenize
                    )

trigramVectorizer.fit(allQuestions)
q1_train = trigramVectorizer.transform(train_linear['question1'].astype(str))
q2_train = trigramVectorizer.transform(train_linear['question2'].astype(str))
X_train_trigram = sparse_hstack([q1_train, q2_train])
q1_val = trigramVectorizer.transform(val_linear['question1'].astype(str))
q2_val = trigramVectorizer.transform(val_linear['question2'].astype(str))
X_val_trigram = sparse_hstack([q1_val, q2_val])
q1_test = trigramVectorizer.transform(test_linear['question1'].astype(str))
q2_test = trigramVectorizer.transform(test_linear['question2'].astype(str))
X_test_trigram = sparse_hstack([q1_test, q2_test])

save_npz(TRIGRAM_PATH + "train.npz", X_train_trigram)
save_npz(TRIGRAM_PATH + "val.npz", X_val_trigram)
save_npz(TRIGRAM_PATH + "test.npz", X_test_trigram)



## Logistic Regression

### Unigrams

In [18]:
# X_train_unigram = load_npz(UNIGRAM_PATH + "train.npz")
# X_test_unigram = load_npz(UNIGRAM_PATH + "test.npz")
unigramLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            penalty='l2',
                            alpha=0.00001,
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
unigramLogisticRegressor.fit(X_train_unigram, y_train)
y_pred_unigram_logistic = unigramLogisticRegressor.predict(X_test_unigram)
print("Unigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_unigram_logistic))
print("Unigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_unigram_logistic))

Unigram Logistic Regression Accuracy:  0.7418437260382399
Unigram Logistic Regression F1 Score:  0.6310840903467534


### Bigrams

In [19]:
# X_train_bigram = load_npz(BIGRAM_PATH + "train.npz")
# X_test_bigram = load_npz(BIGRAM_PATH + "test.npz")
bigramLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            penalty='l2',
                            alpha=0.00001,
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
bigramLogisticRegressor.fit(X_train_bigram, y_train)
y_pred_bigram_logistic = bigramLogisticRegressor.predict(X_test_bigram)
print("bigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_bigram_logistic))
print("bigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_bigram_logistic))

bigram Logistic Regression Accuracy:  0.7962106408765984
bigram Logistic Regression F1 Score:  0.7066405554566495


### Trigrams

In [20]:
# X_train_trigram = load_npz(TRIGRAM_PATH + "train.npz")
# X_test_trigram = load_npz(TRIGRAM_PATH + "test.npz")
trigramLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            penalty='l2',
                            alpha=0.00001,
                            max_iter=1000,
                            n_iter_no_change=20,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
trigramLogisticRegressor.fit(X_train_trigram, y_train)
y_pred_trigram_logistic = trigramLogisticRegressor.predict(X_test_trigram)
print("trigram Logistic Regression Accuracy: ", accuracy_score(y_test, y_pred_trigram_logistic))
print("trigram Logistic Regression F1 Score: ", f1_score(y_test, y_pred_trigram_logistic))

trigram Logistic Regression Accuracy:  0.8114224937544832
trigram Logistic Regression F1 Score:  0.7147133662625356


### Trigrams Tuned

Applying GridSearchCV on Trigrams model to get the best set of parameters

In [21]:
trigramLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            penalty='l2',
                            max_iter=1000,
                            learning_rate='optimal',
                            n_jobs=-1,
                            random_state=42)
parameters = dict({
                'alpha':[0.01, 0.001, 0.0001, 0.00001, 0.000001],
                'n_iter_no_change': [5, 10, 15, 20]
            })
cv_stratified_splitter = StratifiedKFold(n_splits=5)
grid_search = GridSearchCV(trigramLogisticRegressor,
                            parameters,
                            cv=cv_stratified_splitter,
                            scoring=['accuracy', 'f1'],
                            n_jobs=-1,
                            refit='f1')
grid_search.fit(sparse_vstack([X_train_trigram, X_val_trigram]), np.concatenate((y_train, y_val)))


In [22]:
gridSearchCVResults = pd.DataFrame.from_dict(grid_search.cv_results_).reset_index()
gridSearchCVResults

Unnamed: 0,index,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_n_iter_no_change,params,split0_test_accuracy,split1_test_accuracy,...,std_test_accuracy,rank_test_accuracy,split0_test_f1,split1_test_f1,split2_test_f1,split3_test_f1,split4_test_f1,mean_test_f1,std_test_f1,rank_test_f1
0,0,3.879385,0.459306,0.134134,0.033087,0.01,5,"{'alpha': 0.01, 'n_iter_no_change': 5}",0.692139,0.691969,...,0.000606,17,0.352822,0.351276,0.352741,0.354524,0.356614,0.353595,0.001826,17
1,1,5.855093,0.848572,0.126336,0.04483,0.01,10,"{'alpha': 0.01, 'n_iter_no_change': 10}",0.69119,0.691969,...,0.000497,18,0.347076,0.351426,0.348466,0.349686,0.349993,0.349329,0.001469,18
2,2,7.902053,0.795376,0.127562,0.051808,0.01,15,"{'alpha': 0.01, 'n_iter_no_change': 15}",0.691135,0.691461,...,0.000455,19,0.346618,0.348641,0.347652,0.350843,0.35041,0.348833,0.001604,19
3,3,10.410401,0.636237,0.12332,0.03066,0.01,20,"{'alpha': 0.01, 'n_iter_no_change': 20}",0.690902,0.690843,...,0.000534,20,0.3458,0.345646,0.346039,0.347637,0.349849,0.346994,0.001595,20
4,4,3.613782,0.221458,0.133971,0.0431,0.001,5,"{'alpha': 0.001, 'n_iter_no_change': 5}",0.741415,0.742799,...,0.001531,14,0.546314,0.550731,0.547281,0.547302,0.555928,0.549511,0.003542,14
5,5,6.691494,0.927262,0.108208,0.009756,0.001,10,"{'alpha': 0.001, 'n_iter_no_change': 10}",0.742473,0.743624,...,0.001604,13,0.552838,0.556409,0.547063,0.549417,0.549641,0.551074,0.003239,13
6,6,8.657529,0.968323,0.113325,0.019544,0.001,15,"{'alpha': 0.001, 'n_iter_no_change': 15}",0.741058,0.743115,...,0.001495,15,0.545117,0.551423,0.546319,0.545826,0.550921,0.547921,0.002687,15
7,7,10.568434,0.749467,0.139175,0.047242,0.001,20,"{'alpha': 0.001, 'n_iter_no_change': 20}",0.741236,0.741769,...,0.00146,16,0.546055,0.544458,0.538981,0.542346,0.547395,0.543847,0.002959,16
8,8,6.176074,1.060793,0.10791,0.010345,0.0001,5,"{'alpha': 0.0001, 'n_iter_no_change': 5}",0.780647,0.782856,...,0.001322,9,0.652063,0.650661,0.650521,0.648484,0.657971,0.65194,0.003224,10
9,9,7.81503,1.026506,0.105224,0.006787,0.0001,10,"{'alpha': 0.0001, 'n_iter_no_change': 10}",0.780564,0.782705,...,0.001479,12,0.650684,0.653535,0.643106,0.640263,0.642371,0.645992,0.005161,12


<u>Best Parameters</u>

In [23]:
print("Best Parameters: ", grid_search.best_params_)

Best Parameters:  {'alpha': 1e-06, 'n_iter_no_change': 20}


In [24]:
bestAlpha = grid_search.best_params_['alpha']
bestNIterNoChange = grid_search.best_params_['n_iter_no_change']
trigramTunedLogisticRegressor = SGDClassifier(
                            loss='log_loss',
                            alpha=bestAlpha,
                            penalty='l2',
                            max_iter=1000,
                            learning_rate='optimal',
                            n_iter_no_change=bestNIterNoChange,
                            n_jobs=-1,
                            random_state=42)
trigramTunedLogisticRegressor.fit(X_train_trigram, y_train)
y_pred_trigram_tuned_logistic = trigramTunedLogisticRegressor.predict(X_test_trigram)
print("trigram Tuned Logistic Regression Accuracy: ", 100*accuracy_score(y_test, y_pred_trigram_tuned_logistic))
print("trigram Tuned Logistic Regression F1 Score: ", 100*f1_score(y_test, y_pred_trigram_tuned_logistic))


trigram Tuned Logistic Regression Accuracy:  80.23448514680057
trigram Tuned Logistic Regression F1 Score:  72.33895254249022
