In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from difflib import SequenceMatcher


from nltk.corpus import stopwords
# English stopword list from NLTK, held as a set so that the
# `word not in stops` membership tests in the feature functions are cheap.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to have been
# downloaded beforehand -- confirm for the target environment.
stops = set(stopwords.words("english"))

In [None]:
# Number of rows read from each CSV. Kept small for fast iteration;
# set to None to load the complete files.
N_ROWS = 1000

## Training set
train_df = pd.read_csv('../input/train.csv', nrows=N_ROWS)

## Test set
test_df = pd.read_csv('../input/test.csv', nrows=N_ROWS)

train_df.head()

In [None]:
def word_match_share(row):
    """Return the share of non-stopword tokens that the two questions have in common.

    Both questions are lower-cased and split on whitespace; tokens found in
    the module-level ``stops`` set are discarded and duplicates collapse.
    The result is ``2 * |common| / (|q1 tokens| + |q2 tokens|)``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'``.

    Returns
    -------
    int or float
        ``0`` when either question has no non-stopword token, otherwise the
        ratio described above.
    """
    first = {tok for tok in str(row['question1']).lower().split() if tok not in stops}
    second = {tok for tok in str(row['question2']).lower().split() if tok not in stops}

    # The computer-generated chaff includes a few questions that are nothing but stopwords
    if not first or not second:
        return 0

    # Each common token is counted once per question, hence the factor of two.
    overlap = first & second
    return 2 * len(overlap) / (len(first) + len(second))

In [None]:
from collections import Counter
# If a word appears only once, we ignore it completely (likely a typo)
# Epsilon defines a smoothing constant, which makes the effect of extremely rare words smaller

def get_weight(count, eps=10000, min_count=2):
    """Turn a corpus frequency into an inverse-frequency weight.

    Parameters
    ----------
    count : int
        Number of occurrences of the word.
    eps : number, optional
        Smoothing constant added to ``count`` in the denominator.
    min_count : int, optional
        Words occurring fewer times than this get weight ``0``.

    Returns
    -------
    int or float
        ``0`` if ``count < min_count``, else ``1 / (count + eps)``.
    """
    return 0 if count < min_count else 1 / (count + eps)

# Smoothing constant for the word weights computed below.
eps = 5000

# Every training question (both columns) as one string Series.
train_qs = pd.Series(train_df['question1'].tolist() + train_df['question2'].tolist()).astype(str)

# Corpus-wide token frequencies: lower-cased, whitespace-tokenised.
words = (" ".join(train_qs)).lower().split()
counts = Counter(words)

# Hand `eps` to get_weight explicitly; without the keyword the function's own
# default is used and the constant defined above has no effect.
weights = {word: get_weight(count, eps=eps) for word, count in counts.items()}

In [None]:
def tfidf_word_match_share(row):
    """Weighted variant of the word-match share of a question pair.

    Tokens are lower-cased, whitespace-split and filtered against the
    module-level ``stops`` set. Each remaining unique token contributes its
    entry from the module-level ``weights`` dict (``0`` when absent). The
    result is the summed weight of tokens present in both questions (counted
    once per question) divided by the summed weight of all tokens of both
    questions.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'question1'`` and ``'question2'``.

    Returns
    -------
    int or numpy.float64
        ``0`` when either question has no non-stopword token or when the
        total weight is zero; otherwise the weighted ratio.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    q1words = dict.fromkeys(
        w for w in str(row['question1']).lower().split() if w not in stops)
    q2words = dict.fromkeys(
        w for w in str(row['question2']).lower().split() if w not in stops)

    if len(q1words) == 0 or len(q2words) == 0:
        # The computer-generated chaff includes a few questions that are nothing but stopwords
        return 0

    shared_weights = ([weights.get(w, 0) for w in q1words if w in q2words]
                      + [weights.get(w, 0) for w in q2words if w in q1words])
    total_weights = ([weights.get(w, 0) for w in q1words]
                     + [weights.get(w, 0) for w in q2words])

    total = np.sum(total_weights)
    if total == 0:
        # Every token has weight 0 (rare or unseen words): dividing would
        # give 0/0 = NaN plus a RuntimeWarning, so report "no match" instead.
        return 0

    return np.sum(shared_weights) / total

In [None]:
def stop_ratio(question, stop_words=None):
    """Return the fraction of a question's unique words that are stopwords.

    Parameters
    ----------
    question : str or iterable of str
        A raw question string (lower-cased and split on whitespace here) or
        an already tokenised collection of words (used as given).
    stop_words : collection of str, optional
        Stopwords to test against. Defaults to the module-level ``stops``.

    Returns
    -------
    int or float
        ``0`` for a question without any word, otherwise
        ``len(unique words that are stopwords) / len(unique words)``.
    """
    if stop_words is None:
        stop_words = stops

    # A bare string must be tokenised first: set("some text") would yield a
    # set of *characters*, and only one-letter stopwords could ever match.
    if isinstance(question, str):
        tokens = question.lower().split()
    else:
        tokens = question

    unique_tokens = set(tokens)
    if not unique_tokens:
        return 0
    return len(unique_tokens.intersection(stop_words)) / len(unique_tokens)

In [None]:
def uniq1_ratio(row):
    """Share of the pair's distinct characters that occur in question 1.

    Both questions are lower-cased and spaces are removed before the sets of
    distinct characters are built.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"question1"`` and ``"question2"``; both
        values must be strings.

    Returns
    -------
    int or float
        ``0`` when neither question contains a non-space character,
        otherwise ``len(chars of q1) / len(chars of q1 or q2)``.
    """
    uniq_1 = set(row["question1"].lower().replace(" ", ""))
    uniq_2 = set(row["question2"].lower().replace(" ", ""))
    all_chars = uniq_1 | uniq_2
    if not all_chars:
        # Both questions empty/blank: avoid dividing by zero.
        return 0
    return len(uniq_1) / len(all_chars)

def uniq2_ratio(row):
    """Share of the pair's distinct characters that occur in question 2.

    Both questions are lower-cased and spaces are removed before the sets of
    distinct characters are built.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``"question1"`` and ``"question2"``; both
        values must be strings.

    Returns
    -------
    int or float
        ``0`` when neither question contains a non-space character,
        otherwise ``len(chars of q2) / len(chars of q1 or q2)``.
    """
    uniq_1 = set(row["question1"].lower().replace(" ", ""))
    uniq_2 = set(row["question2"].lower().replace(" ", ""))
    all_chars = uniq_1 | uniq_2
    if not all_chars:
        # Both questions empty/blank: avoid dividing by zero.
        return 0
    return len(uniq_2) / len(all_chars)

In [None]:
def create_features(df):
    """Add hand-crafted question-pair features to ``df`` as new columns.

    For each of ``question1`` / ``question2`` a ``q1_*`` / ``q2_*`` column is
    created per feature, plus a pairwise column: ``diff_*`` (absolute
    difference) for numeric features and ``q1q2_*`` (equality) for boolean
    ones. The two word-share features are appended last.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``question1`` and ``question2`` columns. Missing values
        are replaced by ``""`` and every value is converted to ``str``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, which is modified in place (the question
        columns are overwritten and the feature columns are added).
    """
    questions = (("q1", "question1"), ("q2", "question2"))

    # Assign the cleaned column back rather than calling fillna(inplace=True)
    # on a column selection, which is not guaranteed to reach the parent frame.
    for _, col in questions:
        df[col] = df[col].fillna("").apply(str)

    print("len")
    df["q1_len"] = df["question1"].apply(len)
    df["q2_len"] = df["question2"].apply(len)
    df["diff_len"] = abs(df["q1_len"] - df["q2_len"])

    print("len word")
    df["q1_len_word"] = df["question1"].apply(lambda x: len(x.split()))
    df["q2_len_word"] = df["question2"].apply(lambda x: len(x.split()))
    df["diff_len_word"] = abs(df["q1_len_word"] - df["q2_len_word"])

    print("avg len word")
    # NOTE(review): questions without any word give a zero denominator here,
    # so these columns can hold NaN/inf -- handle before feeding a model.
    df['q1_avg_len_word'] = df['q1_len'] / df['q1_len_word']
    df['q2_avg_len_word'] = df['q2_len'] / df['q2_len_word']
    df['diff_avg_len_word'] = abs(df['q1_avg_len_word'] - df['q2_avg_len_word'])

    print("unique char")
    for prefix, col in questions:
        df[prefix + "_n_uniquechar"] = df[col].apply(lambda x: len(set(x.replace(" ", ""))))
    df["diff_n_uniquechar"] = abs(df["q1_n_uniquechar"] - df["q2_n_uniquechar"])

    print("W word")
    w_words = ("how", "who", "why", "what", "where", "which")
    for prefix, col in questions:
        lowered = df[col].apply(lambda x: x.lower())
        for w in w_words:
            # Substring test, so e.g. "whom" also counts as containing "who".
            # `w=w` binds the current word at lambda creation time.
            df[prefix + "_" + w] = lowered.apply(lambda x, w=w: w in x)
    for w in w_words:
        df["q1q2_" + w] = df["q1_" + w] == df["q2_" + w]

    print("stop ratio")
    for prefix, col in questions:
        df[prefix + "_stop_ratio"] = df[col].apply(stop_ratio)
    df["diff_stop_ratio"] = abs(df["q1_stop_ratio"] - df["q2_stop_ratio"])

    print("math")
    for prefix, col in questions:
        df[prefix + "_math"] = df[col].apply(lambda x: '[math]' in x)
    df["q1q2_math"] = df["q1_math"] == df["q2_math"]

    print("nqmark")
    for prefix, col in questions:
        df[prefix + "_nqmark"] = df[col].apply(lambda x: x.count('?'))
    df["diff_nqmark"] = abs(df["q1_nqmark"] - df["q2_nqmark"])

    print("nperiod")
    for prefix, col in questions:
        df[prefix + "_nperiod"] = df[col].apply(lambda x: x.count('.'))
    df["diff_nperiod"] = abs(df["q1_nperiod"] - df["q2_nperiod"])

    print("capitalfirst")
    for prefix, col in questions:
        df[prefix + "_capitalfirst"] = df[col].apply(lambda x: x[0].isupper() if len(x) > 0 else False)
    df["q1q2_capitalfirst"] = df["q1_capitalfirst"] == df["q2_capitalfirst"]

    print("has capital")
    for prefix, col in questions:
        df[prefix + "_has_capital"] = df[col].apply(lambda x: any(c.isupper() for c in x))
    df["q1q2_has_capital"] = df["q1_has_capital"] == df["q2_has_capital"]

    print("n capitals")
    for prefix, col in questions:
        df[prefix + "_n_capitals"] = df[col].apply(lambda x: sum(1 for c in x if c.isupper()))
    df["diff_n_capitals"] = abs(df["q1_n_capitals"] - df["q2_n_capitals"])

    print("is identical")
    df["is_identical"] = (df["question1"].apply(lambda x: x.lower()) == df["question2"].apply(lambda x: x.lower()))

    print("unique ratio")
    df["q1_unique_ratio"] = df.apply(uniq1_ratio, axis=1)
    df["q2_unique_ratio"] = df.apply(uniq2_ratio, axis=1)

    #df["similarity_prob"] = df.apply(lambda row: SequenceMatcher(None, row["question1"],row["question2"]).ratio(),axis=1)

    print("text prop")
    # Names of the str predicate methods evaluated on each whole question.
    text_props = ("isalnum", "isalpha", "isdecimal", "isdigit",
                  "islower", "isnumeric", "isspace", "isupper")
    for prefix, col in questions:
        for prop in text_props:
            df[prefix + "_" + prop] = df[col].apply(lambda x, prop=prop: getattr(x, prop)())
    for prop in text_props:
        df["q1q2_" + prop] = df["q1_" + prop] == df["q2_" + prop]

    print("word share")
    df["word_match_share"] = df.apply(word_match_share, axis=1)
    df["tfidf_word_match_share"] = df.apply(tfidf_word_match_share, axis=1)

    return df

In [None]:
# The histogram needs the engineered column; build the features first when
# they are missing so the cell also works on a freshly restarted kernel.
if "tfidf_word_match_share" not in train_df.columns:
    create_features(train_df)

plt.figure()
for dup_flag, dup_label in ((0, "not duplicate"), (1, "duplicate")):
    shares = train_df.loc[train_df["is_duplicate"] == dup_flag, "tfidf_word_match_share"]
    # `density=True` replaces the `normed=True` keyword removed from matplotlib;
    # undefined (NaN) shares are dropped before binning.
    plt.hist(shares.dropna(), bins=100, range=(0, 1), alpha=0.5, density=True, label=dup_label)
plt.xlabel("tfidf_word_match_share")
plt.ylabel("density")
plt.legend()
plt.show()

In [None]:
# bag of letter sequences (chars): binary presence of character 1- and 2-grams
# NOTE(review): an integer `min_df` is an absolute document count, so with the
# small row sample loaded above only very common n-grams survive -- confirm
# this threshold is intended for the sample size in use.
BagOfWordsExtractor = CountVectorizer(max_df=0.999, min_df=1000, max_features=300,
                                      analyzer='char', ngram_range=(1,2),
                                      binary=True, lowercase=True)

BagOfWordsExtractor.fit(train_qs.unique())

# Plain column selection instead of the `.ix` indexer, which no longer exists
# in pandas. Missing questions become "" so that transform() only sees strings.
trainQuestion1_BOW_rep = BagOfWordsExtractor.transform(train_df['question1'].fillna("").astype(str))
trainQuestion2_BOW_rep = BagOfWordsExtractor.transform(train_df['question2'].fillna("").astype(str))

In [None]:
# Combine the two per-question bag-of-characters matrices into one feature
# matrix by averaging them element-wise (sum, cast to int, divide by 2).
X = (trainQuestion1_BOW_rep + trainQuestion2_BOW_rep).astype(int)/2.
# Sanity check: densified first row and the overall matrix shape.
print(X[0].todense())
print(X.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Merge, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Small feed-forward classifier on the averaged bag-of-characters features.
model = Sequential()
# The hidden layer needs a non-linearity: Dense defaults to a linear
# activation, and two stacked linear layers are equivalent to a single one.
model.add(Dense(X.shape[1], input_dim=X.shape[1], activation='relu'))
model.add(Dropout(0.25))
# Single sigmoid unit -> probability that the pair is a duplicate.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# NOTE(review): batch_size=1 means one weight update per sample, which is slow
# and noisy -- confirm this is intentional.
history = model.fit(X.toarray(), train_df.is_duplicate.values,
                    batch_size=1,
                    epochs=20,
                    validation_split=0.2)
