In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# List the files in the competition's input directory (decoding the bytes that `ls` returns).
print(str(check_output(["ls", "../input/quora-question-pairs"]), "utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
%matplotlib inline

In [None]:
# Read the training pairs, then separate the target column from the remaining columns.
data = pd.read_csv('../input/quora-question-pairs/train.csv')
label_data = data['is_duplicate']
train_data = data.drop(labels='is_duplicate', axis=1)

In [None]:
# English stop words, kept as a set for the `word not in stops` checks used further down.
stops = {*stopwords.words("english")}

In [None]:
# Punctuation marks of interest, one list entry per character.
full_stop = list(',.?!;')

In [None]:
# Preview the first ten labels.
label_data.head(n=10)

In [None]:
def sent_to_word(row):
    """Split a sentence on single spaces and return its lower-cased tokens.

    Parameters
    ----------
    row : str
        One question's text. Any other value (for example a float NaN) is
        converted with ``str()`` first instead of raising on ``.split``.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order. Empty strings, which
        ``split(' ')`` produces for leading, trailing or repeated spaces,
        are dropped.
    """
    # The original comprehension ended in a dangling `if` (a SyntaxError);
    # filtering on the token's truthiness discards the empty pieces.
    return [token.lower() for token in str(row).split(' ') if token]
def sent_len(row):
    """Return how many tokens ``sent_to_word`` yields for ``row``."""
    tokens = sent_to_word(row)
    return len(tokens)
def overlap_words(row):
    """Return the non-stop-words that occur in both questions of a row.

    Parameters
    ----------
    row : object with ``word_list1`` and ``word_list2`` attributes
        Each attribute is an iterable of tokens (here, a DataFrame row).

    Returns
    -------
    list of str
        Tokens present in both lists and absent from ``stops``. The order is
        whatever iterating the intermediate set yields.
    """
    shared = set(row.word_list1) & set(row.word_list2)
    # Use the notebook-level `stops` set instead of rebuilding it from
    # stopwords.words("english") on every call: this function is applied once
    # per row, and the stop-word list is the same for every row.
    return [word for word in shared if word not in stops]

In [None]:
def make_input(train_data):
    """Add the engineered question-pair feature columns to ``train_data``.

    The frame is modified in place and is also returned.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Must have ``question1`` and ``question2`` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame with these columns added: ``sent_diff`` (token count
        of question1 minus that of question2), ``word_list1`` / ``word_list2``
        (token lists), ``overlap`` (result of ``overlap_words``),
        ``len_overlap``, ``len_over_ratio`` (overlap size divided by the
        combined token count) and ``words`` (both token lists concatenated).
    """
    # Coerce BOTH questions to str before tokenising. Previously question1 was
    # not coerced when building word_list1, so a non-string value (e.g. NaN)
    # reached sent_to_word as-is.
    tokens1 = train_data.question1.astype(str).apply(sent_to_word)
    tokens2 = train_data.question2.astype(str).apply(sent_to_word)
    # Token counts are computed once and reused instead of re-tokenising.
    count1 = tokens1.apply(len)
    count2 = tokens2.apply(len)

    train_data['sent_diff'] = count1 - count2
    train_data['word_list1'] = tokens1
    train_data['word_list2'] = tokens2
    train_data['overlap'] = train_data.apply(overlap_words, axis=1)
    train_data['len_overlap'] = train_data['overlap'].apply(len)
    # If both questions tokenise to nothing the ratio would be 0/0 (NaN);
    # mask the zero denominator and report 0.0 for those rows instead.
    total = count1 + count2
    ratio = train_data['len_overlap'] / total.where(total > 0)
    train_data['len_over_ratio'] = ratio.fillna(0.0)
    train_data['words'] = tokens1 + tokens2
    return train_data

    

In [None]:
# make_input adds the feature columns to the frame it is given and returns that same frame.
train_data = make_input(train_data)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class Make_input(BaseEstimator, TransformerMixin):
    """Transformer that reduces the engineered frame to three numeric features.

    ``fit`` counts how often each non-stop-word occurs in the ``words``
    column. ``transform`` returns the ``sent_diff`` and ``len_over_ratio``
    columns plus ``weight``, the sum of ``1 / count`` over each row's
    ``overlap`` words.

    Parameters
    ----------
    words_pool : dict or None, default=None
        Mapping of word -> occurrence count. ``None`` means "start from an
        empty mapping", which ``fit`` creates. A dict passed in is used as-is
        and ``fit`` adds its counts to it.
    """

    def __init__(self, words_pool=None):
        # A `{}` default is a single dict shared by every instance, so counts
        # from one fitted transformer leaked into the next. The argument is
        # stored unmodified so get_params()/clone() see the constructor value.
        self.words_pool = words_pool

    def fit(self, train_data, y=None):
        """Accumulate non-stop-word counts from ``train_data['words']``.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Needs a ``words`` column holding an iterable of tokens per row.
            The frame is not modified.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        Make_input
            ``self``, with ``words_pool`` updated.
        """
        if self.words_pool is None:
            self.words_pool = {}
        counts = self.words_pool
        # Iterate the column itself rather than .loc[i] for i in range(n),
        # which only works when the index is exactly 0..n-1.
        for words in train_data['words']:
            for word in words:
                if word not in stops:
                    counts[word] = counts.get(word, 0) + 1
        return self

    def transform(self, valid_data):
        """Return the three model features for ``valid_data``.

        Parameters
        ----------
        valid_data : pandas.DataFrame
            Needs ``sent_diff``, ``len_over_ratio`` and ``overlap`` columns.
            The frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Columns ``sent_diff``, ``len_over_ratio`` and ``weight``, in that
            order, on the same index as ``valid_data``.
        """
        counts = self.words_pool or {}

        def weight(words):
            # Words never counted during fit contribute nothing. They used to
            # be inserted into words_pool here, which altered the fitted state
            # and made a row's weight depend on which rows came before it.
            return sum(1 / counts[word] for word in words if word in counts)

        # Build a fresh frame with .loc-style column selection (.ix is no
        # longer available in pandas) instead of writing 'weight' into the
        # caller's frame.
        features = valid_data[['sent_diff', 'len_over_ratio']].copy()
        features['weight'] = valid_data['overlap'].apply(weight)
        return features

In [None]:
from sklearn.metrics import log_loss


In [None]:
# The first 35,001 rows train the model; the remainder is held out.
# Positional .iloc is used for features and labels alike so the two always
# line up (the frame previously used label-based .loc while the labels were
# sliced by position). The feature slices are copied so that later column
# assignments on them act on independent frames rather than on slices of
# train_data.
X_train, X_test = train_data.iloc[:35001].copy(), train_data.iloc[35001:].copy()
y_train, y_test = label_data.iloc[:35001], label_data.iloc[35001:]

In [None]:
# Learn the word counts from the training rows, then featurise both splits with them.
prework = Make_input()
X_train = prework.fit(X_train).transform(X_train)
X_test = prework.transform(X_test)

In [None]:
# Standardise both splits with statistics fitted on the training split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Support-vector classifier with probability estimates switched on (predict_proba is called on it below).
svm_classifier = svm.SVC(probability=True, C=1)
svm_classifier.fit(X=X_train, y=y_train)


In [None]:
# Log loss of the predicted class probabilities on the held-out rows.
pred = svm_classifier.predict_proba(X=X_test)
metric_1 = log_loss(y_test, pred)
print(metric_1)

In [None]:
# Peek at the first five training labels.
y_train.head(5)

In [None]:
# X_train was overwritten with the scaler's output a few cells above, so it is
# no longer the engineered DataFrame whose 'words' / 'overlap' columns
# Make_input reads. Fit the pipeline on the engineered training rows of
# train_data instead (labels 0..35000, the same rows used for the split).
clf = make_pipeline(Make_input(), StandardScaler())
X_train = clf.fit_transform(train_data.loc[:35000, :])

In [None]:
# Small smoke test. Fit on rows 0..20 (21 rows, matching the 21 labels passed
# to the classifier below) and keep rows 21..40 as a validation slice.
# `vali` used to be commented out although the following cells read it, and
# `trai` was built from every row while only 21 labels were supplied.
estimator = Make_input()
trai = estimator.fit_transform(train_data.loc[:20, :])
vali = estimator.transform(train_data.loc[21:40, :])

In [None]:
# Fit the scaler on the small training slice and apply it to both slices.
scaler = StandardScaler().fit(trai)
trai, vali = scaler.transform(trai), scaler.transform(vali)
print(trai, vali)

In [None]:
# Train on the small slice with its first 21 labels, then show the validation probabilities.
svm_classifier = svm.SVC(probability=True, C=1)
svm_classifier.fit(X=trai, y=label_data[:21])
pred = svm_classifier.predict_proba(X=vali)
print(pred)

In [None]:
# Cross-validated log loss on the first 21 rows.
# The previous call scored the bare Make_input transformer (which only defines
# fit/transform) and passed no labels, so there was nothing to compute a log
# loss from. The scored estimator is now scaler + SVC, and the matching labels
# are passed as y.
# NOTE(review): the word counts are fitted once on all 21 rows before the
# folds are drawn, so each validation fold has contributed to them.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
cross_val_score(
    make_pipeline(StandardScaler(), svm.SVC(C=1, probability=True)),
    Make_input().fit_transform(train_data.loc[:20, :]),
    label_data.loc[:20],
    scoring='neg_log_loss',
    cv=cv,
)

In [None]:
# Shape of the first 19 labels.
label_data.iloc[:19].shape

In [None]:
# Number of shared words per row (the same column make_input computes).
train_data['len_overlap'] = train_data['overlap'].map(len)

In [None]:
# Shared-word count divided by the combined token count of both questions.
train_data['len_over_ratio'] = train_data['len_overlap'].div(
    train_data['word_list1'].map(len) + train_data['word_list2'].map(len)
)

In [None]:
# Concatenate the two token lists of each row into one list.
train_data['words'] = train_data['word_list1'].add(train_data['word_list2'])

In [None]:
train_data['words'] = train_data['word_list1'] + train_data['word_list2']

# Count every non-stop-word across all rows. A word's first sighting now
# counts as 1; it used to be stored as 0, which left every count one too low.
words_pool = {}
for words in train_data['words']:
    for word in words:
        if word not in stops:
            words_pool[word] = words_pool.get(word, 0) + 1


def weight(row):
    """Return the sum of ``1 / count`` over the words in ``row``.

    ``row`` is one entry of the ``overlap`` column; counts come from the
    notebook-level ``words_pool``.
    """
    return sum(1 / words_pool[word] for word in row)


train_data['weight'] = train_data['overlap'].apply(weight)
# .loc replaces the .ix indexer, which current pandas no longer provides.
# The labels are taken from label_data because 'is_duplicate' was dropped
# from train_data right after the CSV was loaded.
model_input = train_data.loc[:, ['sent_diff', 'len_over_ratio', 'weight']]
model_label = label_data

In [None]:
# Inverse-frequency weight of each row's shared words, computed inline from
# words_pool so this cell does not rely on a notebook-level `weight` helper.
train_data['weight'] = train_data['overlap'].apply(
    lambda words: sum(1 / words_pool[word] for word in words)
)

In [None]:
# .loc replaces the .ix indexer, which current pandas no longer provides.
# The labels are taken from label_data because 'is_duplicate' was dropped
# from train_data right after the CSV was loaded.
model_input = train_data.loc[:, ['sent_diff', 'len_over_ratio', 'weight']]
model_label = label_data

In [None]:
# Standardise the three model features.
input_scaler = StandardScaler().fit(model_input)
stand_input = input_scaler.transform(model_input)