# QUORA QUESTION PAIR SIMILARITY

## Finding similarity using methods like   Logistic Regression , xgboost

In [4]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from sklearn import linear_model
from sklearn.metrics import log_loss
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Reading the training file

In [5]:
df=pd.read_csv("train.csv")
df.shape

(404290, 6)

In [6]:
df=df.dropna()
df['is_duplicate'].value_counts() 

0    255024
1    149263
Name: is_duplicate, dtype: int64

## Preprocess data

In [7]:
replace_dict={"what's" : "what is" , "\'s" : " " , "\'ve" : " have " , "can't" : "can not " , "n't" : " not " , "i'm" : "i am " , "\'re" : " are " , "\'d" : " would " , "\'ll" : " will " , "\'scuse" : " excuse", '\'' : " ", ',' : " ", '&' : " ", '#' : " ", '%' : " ", "*" : " ", "(" : " ", ")" : " ", "!" : " "}
def clean_text(text):
    try:
        text = text.lower()
        for key,values in replace_dict.items():
            text=text.replace(key,values)
        text = re.sub('\W', ' ', text)
        text = re.sub('\s+', ' ', text)
        text = text.strip(' ')
    except Exception as e:
        print(text)
    return text

In [8]:
df['question1']=df['question1'].map(lambda x : clean_text(x))
df['question2']=df['question2'].map(lambda x : clean_text(x))

# Vectorize the data

In [9]:
tfidf_vectorizer = TfidfVectorizer()

In [10]:
tfidf_vectorizer.fit(pd.concat((df['question1'],df['question2'])).unique())

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
import scipy
trainq1_trans = tfidf_vectorizer.transform(df['question1'].values)
trainq2_trans = tfidf_vectorizer.transform(df['question2'].values)
labels = df['is_duplicate'].values
X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))
y = labels
X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)

### Model using LogisticRegression

In [16]:
model = linear_model.LogisticRegression(n_jobs=4,solver = 'sag',max_iter = 1000)        
model.fit(X_train,y_train)
print('predicting ...')
y_pred = model.predict(X_valid)
loss = log_loss(y_valid,y_pred)
print('log_loss= {}'.format(loss))
accuracy = accuracy_score(y_valid,y_pred)
print('accuracy= {}'.format(accuracy))
auc = roc_auc_score(y_valid,y_pred)
print('auc  = {}'.format(auc))

predicting ...
log_loss= 8.30060910565358
accuracy= 0.7596746992467114
auc  = 0.7214081188494489


### Model using xboost

In [17]:
dtrain = xgb.DMatrix(X_train,label = y_train)
dvalid = xgb.DMatrix(X_valid,label = y_valid)
param = {'max_depth':50, 'eta':0.3, 'silent':1, 'objective':'binary:logistic','subsample':0.8,'gamma':0 }
param['nthread'] = 7   
param['eval_metric'] = 'auc'
num_round = 100
evallist  = [(dvalid,'eval'),(dtrain,'train')]
bst = xgb.train(param, dtrain, num_round, evallist)
y_pred = bst.predict(dvalid)
loss = log_loss(y_valid,y_pred)
print('log_loss= {}'.format(loss))

[0]	eval-auc:0.75712	train-auc:0.872221
[1]	eval-auc:0.784094	train-auc:0.90611
[2]	eval-auc:0.800946	train-auc:0.928553
[3]	eval-auc:0.813536	train-auc:0.945294
[4]	eval-auc:0.820934	train-auc:0.953972
[5]	eval-auc:0.825845	train-auc:0.961826
[6]	eval-auc:0.829639	train-auc:0.965265
[7]	eval-auc:0.833585	train-auc:0.97054
[8]	eval-auc:0.837491	train-auc:0.974494
[9]	eval-auc:0.839403	train-auc:0.97567
[10]	eval-auc:0.840988	train-auc:0.976811
[11]	eval-auc:0.842983	train-auc:0.97811
[12]	eval-auc:0.844122	train-auc:0.978942
[13]	eval-auc:0.845806	train-auc:0.979962
[14]	eval-auc:0.846923	train-auc:0.980442
[15]	eval-auc:0.848045	train-auc:0.98122
[16]	eval-auc:0.848928	train-auc:0.981803
[17]	eval-auc:0.849865	train-auc:0.982388
[18]	eval-auc:0.850661	train-auc:0.982764
[19]	eval-auc:0.851406	train-auc:0.983325
[20]	eval-auc:0.852749	train-auc:0.984482
[21]	eval-auc:0.853486	train-auc:0.984987
[22]	eval-auc:0.854138	train-auc:0.985263
[23]	eval-auc:0.854641	train-auc:0.985543
[24]	eva