In [3]:
import numpy as np
import pandas as pd
import networkx as nx
from matplotlib import pyplot as plt
import warnings
import traceback
TRAIN_PATH = "./input/train.csv"
TEST_PATH = "./input/test.csv"
train = pd.read_csv(TRAIN_PATH, encoding="ISO-8859-1")
test = pd.read_csv(TEST_PATH, encoding="ISO-8859-1")

In [4]:
g = nx.Graph()
g.add_nodes_from(train.question1)
g.add_nodes_from(train.question2)
g.add_nodes_from(test.question1)
g.add_nodes_from(test.question2)
edges = list(train[['question1', 'question2']].to_records(index=False)) + list(test[['question1', 'question2']].to_records(index=False))
g.add_edges_from(edges)
cc = nx.find_cliques(g)
len_node = {}
for each in cc:
    len_clique = len(each)
    for items in each:
        try:
            len_node[items].append(len_clique)
        except:
            len_node[items] = []
            len_node[items].append(len_clique)

In [5]:
def get_largest(row,key):
    word = row[key]
    nodes = len_node[word]
    return max(nodes)

In [6]:
def get_smallest(row,key):
    word = row[key]
    nodes = len_node[word]
    return min(nodes)

In [7]:
def get_avg(row,key):
    word = row[key]
    nodes = len_node[word]
    total = 0
    for items in nodes:
        total += items
    avg = total/len(nodes)
    return avg

In [8]:
train['q1_max'] = train.apply(get_largest,key='question1', axis=1, raw=True)
train['q2_max'] = train.apply(get_largest,key='question2', axis=1, raw=True)
train['q1_min'] = train.apply(get_smallest,key='question1', axis=1, raw=True)
train['q2_min'] = train.apply(get_smallest,key='question2', axis=1, raw=True)
train['q1_avg'] = train.apply(get_avg,key='question1', axis=1, raw=True)
train['q2_avg'] = train.apply(get_avg,key='question2', axis=1, raw=True)

In [9]:
test['q1_max'] = test.apply(get_largest,key='question1', axis=1, raw=True)
test['q2_max'] = test.apply(get_largest,key='question2', axis=1, raw=True)
test['q1_min'] = test.apply(get_smallest,key='question1', axis=1, raw=True)
test['q2_min'] = test.apply(get_smallest,key='question2', axis=1, raw=True)
test['q1_avg'] = test.apply(get_avg,key='question1', axis=1, raw=True)
test['q2_avg'] = test.apply(get_avg,key='question2', axis=1, raw=True)

In [10]:
g = nx.Graph()
g.add_nodes_from(train.question1)
g.add_nodes_from(train.question2)
g.add_nodes_from(test.question1)
g.add_nodes_from(test.question2)
edges = list(train[['question1', 'question2']].to_records(index=False)) + list(test[['question1', 'question2']].to_records(index=False))
g.add_edges_from(edges)
cc = nx.find_cliques(g)

In [11]:
all_cliques = {}

In [12]:
for each in cc:
    obj = tuple(each)
    for items in each:
        key = items
        try:
            all_cliques[key].add(obj)
        except:
            all_cliques[key] = set()
            all_cliques[key].add(obj)

In [13]:
def get_simultaneously_exist(row):
    q1 = row['question1']
    q2 = row['question2']
    cliques = all_cliques[q1]
    largest = 0
    for each in cliques:
        if q2 in each and len(each) > largest:
            largest = len(each)
    return largest

In [14]:
train['largest_simultaneously_exist'] = train.apply(get_simultaneously_exist, axis=1, raw=True)

In [15]:
test['largest_simultaneously_exist'] = test.apply(get_simultaneously_exist, axis=1, raw=True)

In [16]:
g = nx.Graph()
g.add_nodes_from(train.question1)
g.add_nodes_from(train.question2)
g.add_nodes_from(test.question1)
g.add_nodes_from(test.question2)
edges = list(train[['question1', 'question2']].to_records(index=False)) + list(test[['question1', 'question2']].to_records(index=False))
g.add_edges_from(edges)
cc = nx.find_cliques(g)
result = {}
for each in cc:
    for items in each:
        key = items
        value = hash(tuple(each))
        try:
            result[key].add(value)
        except:
            result[key] = set()
            result[key].add(value)

In [17]:
def get_question1_cliques(row):
    try:
        len_q1 = len(result[row['question1']])
    except:
        len_q1 = 0
    return len_q1

In [18]:
def get_question2_cliques(row):
    try:
        len_q2 = len(result[row['question2']])
    except:
        len_q2 = 0
    return len_q2

In [19]:
train['question1_cliques'] = train.apply(get_question1_cliques, axis=1, raw=True)
train['question2_cliques'] = train.apply(get_question2_cliques, axis=1, raw=True)

In [20]:
test['question1_cliques'] = test.apply(get_question1_cliques, axis=1, raw=True)
test['question2_cliques'] = test.apply(get_question2_cliques, axis=1, raw=True)

In [21]:
def get_larger_cliques_len(row):
    if row['q1_max'] > row['q2_max']:
        return row['q1_max']
    else:
        return row['q2_max']

In [22]:
def get_smaller_cliques_len(row):
    if row['q1_max'] > row['q2_max']:
        return row['q2_max']
    else:
        return row['q1_max']

In [23]:
train['larger_cliques_len'] = train.apply(get_larger_cliques_len, axis=1, raw=True)

In [24]:
train['smaller_cliques_len'] = train.apply(get_smaller_cliques_len, axis=1, raw=True)

In [25]:
test['larger_cliques_len'] = test.apply(get_larger_cliques_len, axis=1, raw=True)
test['smaller_cliques_len'] = test.apply(get_smaller_cliques_len, axis=1, raw=True)

In [28]:
train = train.iloc[:, 6:]

In [29]:
index = train.axes[1]

In [30]:
index

Index(['q1_max', 'q2_max', 'q1_min', 'q2_min', 'q1_avg', 'q2_avg',
       'largest_simultaneously_exist', 'question1_cliques',
       'question2_cliques', 'larger_cliques_len', 'smaller_cliques_len'],
      dtype='object')

In [34]:
result = pd.read_csv('./input/train.csv', encoding="ISO-8859-1")

In [35]:
from sklearn.metrics import roc_auc_score

In [36]:
for each in index:
    print(each)
    print(roc_auc_score(result['is_duplicate'],train[each]))

q1_max
0.789100218234
q2_max
0.781428188643
q1_min
0.716940029561
q2_min
0.717117626094
q1_avg
0.795252693248
q2_avg
0.792872918352
largest_simultaneously_exist
0.801928619629
question1_cliques
0.577371348117
question2_cliques
0.597784421232
larger_cliques_len
0.771350499259
smaller_cliques_len
0.799955187305


In [39]:
test = test.iloc[:, 3:]

In [40]:
train.to_csv('./input/train_cliques.csv', index=False)

In [41]:
test.to_csv('./input/test_cliques.csv', index=False)