In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature combiner

Take all the feature datasets we've created and combine them into a single feature table each for train and test.

## Load datasets

In [2]:
# All input CSVs are read from a "Datasets" directory one level above the
# current working directory.
dataset_folder = os.path.join(os.pardir, "Datasets")

# File-name templates for every feature set to combine. The "{}" placeholder
# is filled with "train" or "test" when the files are loaded; the order here
# is the order in which the feature sets are merged.
feature_files = [
    "FEATURE_entity_counts_{}.csv",
    "magic_feature_{}.csv",
    "FEATURE_topic_sharing_{}.csv",
    "question_type_features_{}.csv",
    "feature_tools_trans_{}.csv",
    "FEATURE_tfidf_{}.csv",
    "graph_features_{}.csv",
    "nlp_features_{}.csv",
    "non_nlp_features_{}.csv",
    "FEATURE_distance_{}.csv",
]

In [3]:
# Base train frame, indexed by its 'id' column.
X_train = pd.read_csv(
    os.path.join(dataset_folder, 'train.csv'),
    index_col='id',
)

# Base test frame, indexed by 'test_id'. Only the first 2,345,796 rows of the
# file are read (the same cap is applied to every test feature file later on).
X_test = pd.read_csv(
    os.path.join(dataset_folder, 'test.csv'),
    index_col='test_id',
    nrows=2345796,
)

# Remember the pre-merge shapes so the merge step can assert that joining
# feature sets never changes the number of rows.
train_shape, test_shape = X_train.shape, X_test.shape

  mask |= (ar1 == a)


In [4]:
def _load_feature_pair(folder, train_name, test_name, test_nrows=2345796):
    """Read the train and test CSVs of one feature set.

    Args:
        folder: Directory containing both CSV files.
        train_name: File name of the train feature CSV; it is indexed by its
            'id' column.
        test_name: File name of the test feature CSV; it is indexed by
            'test_id', falling back to 'id' if that read raises ValueError.
        test_nrows: Maximum number of rows read from the test CSV (defaults
            to the same cap used when loading test.csv).

    Returns:
        Tuple ``(train_features, test_features)`` of DataFrames.

    Raises:
        FileNotFoundError: If either CSV does not exist.
        AssertionError: If either frame has duplicate index values.
    """
    train_features = pd.read_csv(os.path.join(folder, train_name), index_col='id')

    test_path = os.path.join(folder, test_name)
    try:
        test_features = pd.read_csv(test_path, index_col='test_id', nrows=test_nrows)
    except ValueError:
        # Presumably some test feature files name their index column 'id'
        # rather than 'test_id'; retry with that name.
        test_features = pd.read_csv(test_path, index_col='id', nrows=test_nrows)

    # A unique index guarantees the left merges below cannot duplicate rows.
    assert train_features.index.nunique() == train_features.shape[0], \
        'Index not unique in feature set {}'.format(train_name)
    assert test_features.index.nunique() == test_features.shape[0], \
        'Index not unique in feature set {}'.format(test_name)

    return train_features, test_features


def _report_column_clashes(base, extra, source_name):
    """Print a warning for columns of `extra` that already exist in `base`.

    DataFrame.merge does not fail on such clashes; it renames both sides with
    '_x'/'_y' suffixes, which is easy to miss in a wide frame.
    """
    clashes = base.columns.intersection(extra.columns)
    if len(clashes) > 0:
        print('Warning: columns {} from {} already exist and will be suffixed _x/_y.'
              .format(list(clashes), source_name))


# NOTE: this cell rebinds X_train / X_test and pops the label column, so it is
# not re-runnable on its own -- re-run the cell that loads train.csv/test.csv
# first.
for feature_set in feature_files:
    train_feature_set = feature_set.format('train')
    test_feature_set = feature_set.format('test')

    try:
        X_train_feature, X_test_feature = _load_feature_pair(
            dataset_folder, train_feature_set, test_feature_set)
    except FileNotFoundError:
        # A feature set is only usable when both its train and test files exist.
        print('Can\'t add feature {} as train and test sets don\'t both exist.'.format(feature_set))
        continue

    _report_column_clashes(X_train, X_train_feature, train_feature_set)
    _report_column_clashes(X_test, X_test_feature, test_feature_set)

    # Left merge on the index keeps every base row; ids missing from a feature
    # file end up with NaN in that file's columns.
    X_train = X_train.merge(X_train_feature, left_index=True, right_index=True, how='left')
    X_test = X_test.merge(X_test_feature, left_index=True, right_index=True, how='left')

    assert X_train.shape[0] == train_shape[0], \
        'Rows added to train set by {}'.format(train_feature_set)
    assert X_test.shape[0] == test_shape[0], \
        'Rows added to test set by {}'.format(test_feature_set)

# Split the label off the train frame; X_train no longer contains it afterwards.
y_train = X_train.pop('is_duplicate')

Can't add feature FEATURE_entity_counts_{}.csv as train and test sets don't both exist.


  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


Can't add feature FEATURE_tfidf_{}.csv as train and test sets don't both exist.
Can't add feature nlp_features_{}.csv as train and test sets don't both exist.


In [5]:
# Preview only the first rows instead of rendering the whole merged frame,
# in line with the .head() previews used in the later inspection cells.
X_train.head(10)

Unnamed: 0_level_0,qid1,qid2,question1,question2,q1_hash,q2_hash,q1_freq,q2_freq,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,...,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2),min_kcore,max_kcore,common_neighbor_count,common_neighbor_ratio,min_freq,max_freq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,290457,1,2,0.000045,0.000045,...,14,12,66,57,0,0,0,0.000000,1,2
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,1,16739,8,3,0.000046,0.000046,...,8,13,51,88,0,0,0,0.000000,3,8
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,2,290458,2,1,0.000060,0.001250,...,14,10,73,59,0,0,0,0.000000,1,2
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,3,290459,1,1,0.000125,0.010125,...,11,9,50,65,0,0,0,0.000000,1,1
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,4,290460,3,1,0.000063,0.000063,...,13,7,76,39,0,2,0,0.000000,1,3
5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",5,290461,1,1,0.000035,0.000035,...,16,16,86,90,0,0,0,0.000000,1,1
6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,6,290462,1,1,0.000208,0.000208,...,4,11,19,62,2,2,0,0.000000,1,1
7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,7,290463,1,1,0.005833,0.000278,...,7,9,30,41,2,7,0,0.000000,1,1
8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",8,273573,2,3,0.000278,0.000278,...,8,8,31,37,0,0,1,0.500000,2,3
9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,9,290464,1,1,0.000100,0.000100,...,9,9,60,49,0,10,0,0.000000,1,1


In [6]:
# Columns listed as metadata for each split; they are removed so that only
# the merged feature columns remain.
train_metadata = ['qid1', 'qid2', 'question1', 'question2']
test_metadata = ['question1', 'question2']

# Reassign rather than mutate in place: the resulting frames are the same,
# but no existing DataFrame object is modified behind the reader's back.
X_train = X_train.drop(columns=train_metadata)
X_test = X_test.drop(columns=test_metadata)

In [7]:
# Dimensions of the train feature frame, followed by its first five rows.
print(X_train.shape)
X_train.head(n=5)

(404290, 48)


Unnamed: 0_level_0,q1_hash,q2_hash,q1_freq,q2_freq,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,PROB_BOTH_SHARE_TOPIC_2,PROB_BOTH_SHARE_TOPIC_3,PROB_BOTH_SHARE_TOPIC_4,PROB_BOTH_SHARE_TOPIC_5,...,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2),min_kcore,max_kcore,common_neighbor_count,common_neighbor_ratio,min_freq,max_freq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,290457,1,2,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,...,14,12,66,57,0,0,0,0.0,1,2
1,1,16739,8,3,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,...,8,13,51,88,0,0,0,0.0,3,8
2,2,290458,2,1,6e-05,0.00125,0.07625,6e-05,6e-05,6e-05,...,14,10,73,59,0,0,0,0.0,1,2
3,3,290459,1,1,0.000125,0.010125,0.000125,0.000125,0.000125,0.007625,...,11,9,50,65,0,0,0,0.0,1,1
4,4,290460,3,1,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,...,13,7,76,39,0,2,0,0.0,1,3


In [8]:
# Dimensions of the test feature frame, followed by its first five rows.
print(X_test.shape)
X_test.head(n=5)

(2345796, 48)


Unnamed: 0_level_0,q1_hash,q2_hash,q1_freq,q2_freq,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,PROB_BOTH_SHARE_TOPIC_2,PROB_BOTH_SHARE_TOPIC_3,PROB_BOTH_SHARE_TOPIC_4,PROB_BOTH_SHARE_TOPIC_5,...,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2),min_kcore,max_kcore,common_neighbor_count,common_neighbor_ratio,min_freq,max_freq
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,537361,2680914,1,1,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,...,11,14,57,68,0,0,0,0.0,1,1
1,537362,1258711,2,2,8.3e-05,0.00175,8.3e-05,0.03675,8.3e-05,8.3e-05,...,14,7,66,43,0,0,0,0.0,2,2
2,537363,2680915,1,1,0.002625,0.000125,0.000125,0.000125,0.000125,0.000125,...,14,6,60,29,0,0,0,0.0,1,1
3,537364,2680916,1,1,0.000417,0.000417,0.000417,0.00875,0.000417,0.000417,...,4,3,27,17,0,0,0,0.0,1,1
4,537365,2680917,1,1,0.1225,0.000278,0.000278,0.000278,0.000278,0.000278,...,4,6,32,30,0,0,0,0.0,1,1


In [9]:
# Length of the label series popped from X_train ('is_duplicate'); it should
# match the number of rows in X_train.
y_train.shape

(404290,)

## Save final feature sets

In [10]:
# Write both combined feature frames, index included, to CSV files in the
# current working directory.
for frame, filename in ((X_train, 'x_train.csv'), (X_test, 'x_test.csv')):
    frame.to_csv(filename)

In [11]:
# Write the labels without their index, so rows in y_train.csv line up with
# x_train.csv by position only.
# NOTE(review): the default for `header` in Series.to_csv has differed between
# pandas versions -- confirm whether downstream readers expect a header row.
y_train.to_csv('y_train.csv', index=False)