In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found under the Kaggle input mount.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show full cell contents (no column-width truncation) and up to 100 rows per frame.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 100)

In [None]:
!pip install -U sentence-transformers


In [None]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses, evaluation, util
from torch.utils.data import DataLoader
import gc

import scipy
from sklearn.metrics import accuracy_score


In [None]:
# Load a SentenceTransformer from a local directory in the Kaggle input mount.
model = SentenceTransformer('/kaggle/input/model-weights-sbert-trained-on-these-data/model_mnli/model_mnli/')

# Alternative checkpoint directory (disabled):
#model = SentenceTransformer('/kaggle/input/distilbertbasenlistsbmeantokens/distilbert-base-nli-stsb-mean-tokens/')



In [None]:
model.encode("hi").shape

In [None]:
# Load the question-pairs training file; pandas reads the zipped CSV directly.
train = pd.read_csv("/kaggle/input/quora-question-pairs/train.csv.zip")
print(train.shape)
# Disabled experiment: remap label 0 to -1.
#train['is_duplicate'] = train['is_duplicate'].replace(0,-1)

train.head(30)


In [None]:
# Experiment configuration.
num_train_examples = 121600  # leading rows of `train` turned into training pairs
num_test_examples = 3200  # trailing rows of `train` held out for evaluation
distance_metric = "cosine_distance"  # NOTE(review): not referenced by any visible cell
num_epochs = 2  # passed to model.fit(epochs=...)
batch_size = 32  # batch size of the training DataLoader

In [None]:
#help(model.fit)

In [None]:
train['question1'][0]

In [None]:
# Build (question1, question2, label) training pairs from the first
# num_train_examples rows. The columns are zipped once instead of doing a
# chained Series lookup per row, which is far slower on a frame this size.
train_head = train.iloc[:num_train_examples]
train_samples = [
    # str() keeps missing questions (loaded as float NaN) usable as text.
    InputExample(texts=[str(question1), str(question2)], label=int(is_duplicate))
    for question1, question2, is_duplicate in zip(
        train_head['question1'], train_head['question2'], train_head['is_duplicate']
    )
]

train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

# Online contrastive loss over the labelled pairs, with margin 0.5.
train_loss = losses.OnlineContrastiveLoss(model=model, margin=0.5)

In [None]:
# Hold out the last num_test_examples rows of the training file for evaluation.
test_samples = num_test_examples
eval_rows = train.iloc[-test_samples:]

# Cast the questions to str exactly as the training pairs do, so a missing
# question (loaded as float NaN) reaches the evaluators as text.
sentences1 = eval_rows['question1'].astype(str).tolist()
sentences2 = eval_rows['question2'].astype(str).tolist()
scores = eval_rows['is_duplicate'].astype('int').tolist()

evaluator1 = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
evaluator2 = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)

# ... Your other code to load training data



In [None]:
#model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=50, evaluator=evaluator2)

In [None]:
from sklearn.metrics import confusion_matrix
def thr_to_accuracy(thr, Y_test, predictions):
    """Return the negated accuracy obtained by thresholding ``predictions`` at ``thr``.

    The value is negated so that a minimizer such as ``scipy.optimize.fmin``
    searches for the accuracy-maximizing threshold.

    Args:
        thr: Decision threshold; a score strictly above it is predicted as 1.
        Y_test: Ground-truth binary labels.
        predictions: Scores to threshold, one per label.

    Returns:
        The accuracy of the thresholded predictions, multiplied by -1.
    """
    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int`` is equivalent.
    predicted_labels = (np.asarray(predictions) > thr).astype(int)
    return -accuracy_score(Y_test, predicted_labels)


In [None]:
# COSINE SIM before training

# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
# Compute cosine similarities (full sentences1 x sentences2 matrix)
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Only the diagonal pairs sentences1[i] with sentences2[i]. Extract it once and
# move it to host memory: np.diag() on the tensor fails when it lives on a GPU.
pair_sims_before = cosine_scores.diagonal().detach().cpu().numpy()

print(pd.DataFrame({"labels": scores, 'cosine_sim': pair_sims_before.tolist()}).groupby("labels").agg({"cosine_sim":["count","mean"]}))

# NOTE(review): the threshold is tuned on the same rows it is then evaluated on.
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, pair_sims_before), x0=0.5)
print(best_thr)

# Thresholded predictions, computed once and reused for the confusion matrix.
pred_before = (pair_sims_before > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true = scores, y_pred = pred_before))

In [None]:
# Drop the raw DataFrame and force a garbage-collection pass before training;
# the sample and evaluation lists built in earlier cells are separate objects.
del train
gc.collect()
#np.diag(cosine_scores)

In [None]:
# Fine-tune on the contrastive objective for num_epochs epochs; evaluator2
# (the binary-classification evaluator) is passed as the evaluator.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs= num_epochs, warmup_steps=100, evaluator=evaluator2)

In [None]:
# COSINE SIM After training
# Compute embeddings for both lists
embeddings1 = model.encode(sentences1, convert_to_tensor=True)
embeddings2 = model.encode(sentences2, convert_to_tensor=True)
# Compute cosine similarities (full matrix; later cells read it under this name)
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# Only the diagonal pairs sentences1[i] with sentences2[i]. Extract it once and
# move it to host memory: np.diag() on the tensor fails when it lives on a GPU.
pair_sims_after = cosine_scores.diagonal().detach().cpu().numpy()

# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print(pd.DataFrame({"labels": scores, 'cosine_sim': pair_sims_after.tolist()}).groupby("labels").agg({"cosine_sim":["count","mean"]}))

# NOTE(review): the threshold is tuned on the same rows it is then evaluated on.
best_thr = scipy.optimize.fmin(thr_to_accuracy, args=(scores, pair_sims_after), x0=0.5)
print(best_thr)

# Thresholded predictions, computed once and reused for the confusion matrix.
pred_aft = (pair_sims_after > best_thr).astype("int").tolist()

print("\n Confusion matrix")
print(confusion_matrix(y_true = scores, y_pred = pred_aft))

In [None]:
from sklearn import metrics

In [None]:
scores

In [None]:
pred_aft

In [None]:
# `eps` was deprecated in scikit-learn 1.3 and removed in 1.5, so it is no longer passed.
# NOTE(review): pred_aft holds hard 0/1 labels rather than probabilities, so
# every misclassified row contributes the maximum (clipped) penalty.
print("log_loss:", metrics.log_loss(y_true = scores, y_pred = pred_aft))

In [None]:
# y_pred = tf.maximum(pred_aft, 1e-15)
# # Compute the log loss
# log_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1)
# print(log_loss )

In [None]:
# pred_aft already holds the thresholded post-training predictions, so reuse it
# instead of re-deriving them from the similarity matrix.
print("classification_report:", metrics.classification_report(y_true = scores, y_pred = pred_aft))

In [None]:
from sklearn.metrics import log_loss

In [None]:
# Score the post-training pairwise cosine similarity as if it were P(duplicate).
# Cosine similarity can be negative, so it is clipped into [0, 1] first; the
# diagonal is moved to host memory because the matrix may live on a GPU.
# `eps` is no longer passed: it was removed from log_loss in scikit-learn 1.5.
pair_probabilities = np.clip(cosine_scores.diagonal().detach().cpu().numpy(), 0.0, 1.0)
print("Log loss on Test Data using cosine similarity as probability", log_loss(scores, pair_probabilities))

In [None]:
# `clf` is never defined in this notebook, so the binary label set is given
# explicitly. `eps` is no longer passed: it was removed in scikit-learn 1.5.
print("The test log loss is:",log_loss(scores, pred_aft, labels=[0, 1]))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_cm(y_true, y_pred, figsize=(10,10)):
    """Draw a confusion-matrix heatmap annotated with row percentages and counts.

    Diagonal cells show the row percentage followed by ``count/row_total``,
    other non-empty cells show the row percentage followed by the count, and
    off-diagonal cells with a zero count are left blank.

    Args:
        y_true: Ground-truth labels; their unique values define both axes.
        y_pred: Predicted labels, aligned with ``y_true``.
        figsize: Size of the matplotlib figure.
    """
    # The unique labels define the matrix order and the axis tick labels.
    labels = np.unique(y_true)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    # Object dtype holds annotations of any length; a fixed-width string array
    # would silently truncate long "percent / count" labels.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                # cm_sum has shape (nrows, 1): take the scalar, because formatting
                # a one-element array with %d is deprecated in recent NumPy.
                s = cm_sum[i, 0]
                annot[i, j] = '%.1f%%\n%d/%d' % (p, c, s)
            elif c == 0:
                annot[i, j] = ''
            else:
                annot[i, j] = '%.1f%%\n%d' % (p, c)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)
    
# pred_aft already holds the thresholded post-training predictions.
plot_cm(scores, y_pred = pred_aft)

In [None]:
embeddings1.shape

In [None]:
embeddings2.shape

In [None]:
## Error Analysis

# One row per evaluation pair: both questions, the true label, and the
# thresholded predictions from before and after fine-tuning.
pred_df = pd.DataFrame({  "sentences1":sentences1,
                          "sentences2":sentences2,
                          "y_true":scores,
                          "y_pred_before":pred_before,
                          "y_pred_after":pred_aft})

In [None]:
# Pairs misclassified before fine-tuning but classified correctly afterwards
# (first 50, transposed so each pair reads as a column).
print(pred_df[(pred_df.y_true != pred_df.y_pred_before) & 
              (pred_df.y_true == pred_df.y_pred_after) ].reset_index(drop=True).head(50).T)

In [None]:
# Pairs misclassified both before and after fine-tuning
# (first 50, transposed so each pair reads as a column).
print(pred_df[(pred_df.y_true != pred_df.y_pred_before) & 
              (pred_df.y_true != pred_df.y_pred_after) ].reset_index(drop=True).head(50).T)

In [None]:
q1="which is the best mobile phone under 20000rs"
q2="which mobile is the best between 10000rs and 20000rs"

In [None]:
# Wrap each question in its own one-element list.
q1_p = [q1]
q2_p = [q2]

In [None]:
embeddings_1 = model.encode(q1_p, convert_to_tensor=True)
embeddings_2 = model.encode(q2_p, convert_to_tensor=True)

In [None]:
cosine_scores = util.pytorch_cos_sim(embeddings_1, embeddings_2)

In [None]:
print(cosine_scores)

In [None]:
# NOTE(review): predict() is only defined in a later cell of this notebook, so on
# a fresh kernel (Restart & Run All) this cell and the next two raise NameError.
q1="How to toast a bread"
q2="What is the procedure to make a bread toast"
print(predict(q1,q2))

In [None]:
q1="which is the best mobile phone under 20000rs"
q2="which is the best phone between 10000rs and 20000rs"
print(predict(q1,q2))

In [None]:
q1="where is the india gate located"
q2="where is taj mahal located"
print(predict(q1,q2))

In [None]:
embeddings_1.shape

In [None]:
embeddings_2.shape

In [None]:
model.save('../working')

In [None]:
os.listdir('../working')

In [None]:
import numpy as np 
import pandas as pd 
import nltk
import matplotlib.pyplot as plt

In [None]:
train=pd.read_csv('/kaggle/input/train2/train (2).csv', nrows=1000)

In [None]:
train.head(10)

In [None]:
nltk.download('stopwords')

In [None]:
#Converting every character to lower case
docs=train['question_text'].str.lower()
print(docs.head())
print('\n')

#Remove non-alphabets
# str.replace returns a new Series, so the result must be assigned back;
# regex=True is required because pandas >= 2.0 treats the pattern literally by default.
docs=docs.str.replace('[^a-z ]','',regex=True)
print(docs.head())
print('\n')

#Remove commonly used words
from nltk.corpus import stopwords
# Note: this rebinds the name `stopwords` from the corpus reader to a plain list of words.
stopwords=nltk.corpus.stopwords.words('english')
stemmer=nltk.stem.PorterStemmer()
print(stopwords)
print('\n')

def clean_sentence(doc):
    """Drop stop words from ``doc`` and stem the remaining words.

    Args:
        doc: Text to clean; it is split on single spaces.

    Returns:
        The stemmed words that are not in ``stopwords``, joined with single spaces.
    """
    words=doc.split(' ')
    words_clean=[stemmer.stem(word) for word in words if word not in stopwords]
    # The debug print that used to follow the return was unreachable and has been removed.
    return ' '.join(words_clean)
    
docs=docs.apply(clean_sentence)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned questions for validation, with a fixed seed.
train_x, validate_x, train_y, validate_y = train_test_split(
    docs, train['target'], test_size=0.2, random_state=1
)

# Learn the vocabulary from the training split only, then build both
# document-term matrices with that same vocabulary.
dtm_vectorizer = CountVectorizer()
dtm_train = dtm_vectorizer.fit_transform(train_x)
dtm_validate = dtm_vectorizer.transform(validate_x)

In [None]:
print(train_x)

In [None]:
# Dense view of the training document-term matrix, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
df_dtm_train = pd.DataFrame(dtm_train.toarray(),columns=dtm_vectorizer.get_feature_names_out(),index=train_x.index)
df_dtm_train

In [None]:
df_dtm_train.sum().sort_values(ascending=False).head(20).plot.bar()

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,f1_score

# Fit multinomial Naive Bayes on the training document-term matrix.
model2 = MultinomialNB()
model2.fit(dtm_train, train_y)

# Despite its name, train_y_pred holds the predictions for the validation split.
train_y_pred = model2.predict(dtm_validate)

print(accuracy_score(validate_y, train_y_pred))
print(f1_score(validate_y, train_y_pred))

In [None]:
# Classify one hand-written question with the bag-of-words model.
a=["How do I marry an American woman for a Green Card? How much do they charge?"]
ta = dtm_vectorizer.transform(a)
t=model2.predict(ta)
# predict returns one label for the single question passed in.
print("Insincere question" if t[0]==1 else "Sincere question")

In [None]:
# Classify one hand-written question with the bag-of-words model.
a=["if I'm creating an app using multiple programming languages, can I use the same IDE?"]
ta = dtm_vectorizer.transform(a)
t=model2.predict(ta)
# predict returns one label for the single question passed in.
print("Insincere question" if t[0]==1 else "Sincere question")

In [None]:
def predict(q1, q2, threshold=0.81):
    """Report whether each question is insincere and whether the two are similar.

    Each question is classified with the bag-of-words Naive Bayes model
    (``dtm_vectorizer`` + ``model2``) and the verdict is printed. Both
    questions are then embedded with ``model`` and their cosine similarity is
    compared against ``threshold``.

    Args:
        q1: First question text.
        q2: Second question text.
        threshold: Cosine similarity above which the questions count as
            similar. Defaults to 0.81, the value previously hard-coded here.

    Returns:
        ``"Similar Questions"`` if the similarity exceeds ``threshold``,
        otherwise ``"Different Questions"``.
    """
    # NOTE(review): the vectorizer was fitted on lower-cased, stop-word-filtered,
    # stemmed text, but the raw questions are transformed here. Confirm whether
    # the same cleaning should be applied before classification.
    for position, question in enumerate((q1, q2), start=1):
        label = model2.predict(dtm_vectorizer.transform([question]))[0]
        verdict = "insincere" if label == 1 else "sincere"
        print("Question %d is %s" % (position, verdict))

    embeddings_1 = model.encode([q1], convert_to_tensor=True)
    embeddings_2 = model.encode([q2], convert_to_tensor=True)
    # One sentence per side gives a single-element result; .item() reads it as a Python float.
    similarity = util.pytorch_cos_sim(embeddings_1, embeddings_2).item()
    if similarity > threshold:
        return "Similar Questions"
    return "Different Questions"

In [None]:
q1="which is the cheapest flight to chennai "
q2="which is the cheapest flight to delhi"
ans=predict(q1,q2)
print(ans)

In [None]:
q1="How do I marry an American woman for a Green Card? How much do they charge?"
q2="What are the different ways to get a green card in america?"
ans=predict(q1,q2)
print(ans)

In [None]:
q1="which is the best mobile phone under 20000rs"
q2="which is the best phone between 10000rs and 20000rs"
print(predict(q1,q2))

In [None]:
q1="where is the india gate located"
q2="where is taj mahal located"
print(predict(q1,q2))

In [None]:
q1="How to toast a bread"
q2="What is the procedure to make a bread toast"
print(predict(q1,q2))