In [29]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature combiner

Take all the feature datasets we've created and combine them into a single feature matrix each for train and test.

## Load datasets

In [30]:
# Folder (one level above the notebook) that the CSV inputs are read from.
dataset_folder = os.path.join(os.pardir, 'Datasets')

# Names of the feature sets to combine. Each one expands to a file-name template
# of the form 'FEATURE_<name>_{}.csv'; the remaining '{}' placeholder is filled
# in later with 'train' or 'test'.
feature_names = (
    'distances',
    'lstm_pred',
    'entity_counts',
    'magic_feature',
    'topic_sharing',
    'question_type_features',
    'feature_tools_trans',
)
feature_files = ['FEATURE_' + feature_name + '_{}.csv' for feature_name in feature_names]

In [31]:
# Load the base train/test tables, each indexed by its own row-id column.
train_path = os.path.join(dataset_folder, 'train.csv')
test_path = os.path.join(dataset_folder, 'test.csv')

X_train = pd.read_csv(train_path, index_col='id')
# Only the first 2,345,796 data rows of test.csv are read.
X_test = pd.read_csv(test_path, index_col='test_id', nrows=2345796)

# Keep the starting shapes so the merge loop can check that no rows were added.
train_shape, test_shape = X_train.shape, X_test.shape

  mask |= (ar1 == a)


In [32]:
# Attach every available feature set to the base train/test frames, aligning rows
# on the index, then split the target column off the training frame.
n_test_rows = 2345796  # row cap for test-side CSVs; the same cap is used when loading test.csv

for feature_set in feature_files:
    train_feature_set = feature_set.format('train')
    test_feature_set = feature_set.format('test')
    train_feature_path = os.path.join(dataset_folder, train_feature_set)
    test_feature_path = os.path.join(dataset_folder, test_feature_set)

    # Keep the try block to the file reads only: a feature set is skipped (not an
    # error) unless both its train and test files exist.
    try:
        X_train_feature = pd.read_csv(train_feature_path, index_col='id')
        # Test-side files name their id column either 'test_id' or 'id'. Peek at
        # the header to choose, rather than catching ValueError and re-reading the
        # whole file, which would also mask unrelated parse errors.
        test_header = pd.read_csv(test_feature_path, nrows=0).columns
        test_index_col = 'test_id' if 'test_id' in test_header else 'id'
        X_test_feature = pd.read_csv(test_feature_path, index_col=test_index_col,
                                     nrows=n_test_rows)
    except FileNotFoundError:
        print('Can\'t add feature {} as train and test sets don\'t both exist.'.format(feature_set))
        continue

    # A duplicated id in a feature file would multiply rows in the left merge below.
    assert X_train_feature.index.nunique() == X_train_feature.shape[0], \
        'Index not unique in feature set {}'.format(train_feature_set)
    assert X_test_feature.index.nunique() == X_test_feature.shape[0], \
        'Index not unique in feature set {}'.format(test_feature_set)

    # merge() silently renames clashing columns with _x/_y suffixes, which would
    # corrupt the feature matrix (e.g. when two feature sets share a column name
    # or this cell is run twice). Fail loudly before either frame is modified.
    for base_frame, feature_frame, source in ((X_train, X_train_feature, train_feature_set),
                                              (X_test, X_test_feature, test_feature_set)):
        clashing = base_frame.columns.intersection(feature_frame.columns)
        assert len(clashing) == 0, \
            'Columns {} from {} already exist in the combined frame'.format(list(clashing), source)

    # Left merge on the index keeps every base row; ids missing from the feature
    # file end up as NaN in the new columns.
    X_train = X_train.merge(X_train_feature, left_index=True, right_index=True, how='left')
    X_test = X_test.merge(X_test_feature, left_index=True, right_index=True, how='left')

    assert X_train.shape[0] == train_shape[0], 'Rows added'
    assert X_test.shape[0] == test_shape[0], 'Rows added'

# Remove the target column from the training features and keep it separately.
y_train = X_train.pop('is_duplicate')

Can't add feature FEATURE_entity_counts_{}.csv as train and test sets don't both exist.


  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [34]:
# Drop the metadata columns listed below from each combined frame.
train_metadata = ['qid1', 'qid2', 'question1', 'question2']
test_metadata = ['question1', 'question2']

# Rebind the names instead of using inplace=True: the result is the same, but the
# frames produced by earlier cells are not mutated behind the reader's back.
# A missing column still raises KeyError, so a typo cannot pass silently.
X_train = X_train.drop(columns=train_metadata)
X_test = X_test.drop(columns=test_metadata)

In [35]:
# Sanity check on the combined training features: print (rows, columns),
# then display the first rows as the cell's output.
print(X_train.shape)
X_train.head()

(404290, 42)


Unnamed: 0_level_0,q1_hash,q2_hash,q1_freq,q2_freq,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,PROB_BOTH_SHARE_TOPIC_2,PROB_BOTH_SHARE_TOPIC_3,PROB_BOTH_SHARE_TOPIC_4,PROB_BOTH_SHARE_TOPIC_5,...,what_q2,when_q2,where_q2,why_q2,how_q2,difference_between_q2,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,290457,1,2,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,...,True,False,False,False,False,False,14,12,66,57
1,1,16739,8,3,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,4.6e-05,...,True,False,False,False,False,False,8,13,51,88
2,2,290458,2,1,6e-05,0.00125,0.07625,6e-05,6e-05,6e-05,...,False,False,False,False,True,False,14,10,73,59
3,3,290459,1,1,0.000125,0.010125,0.000125,0.000125,0.000125,0.007625,...,False,True,False,False,False,False,11,9,50,65
4,4,290460,3,1,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,6.3e-05,...,False,False,False,False,False,False,13,7,76,39


In [36]:
# Sanity check on the combined test features: print (rows, columns),
# then display the first rows as the cell's output.
print(X_test.shape)
X_test.head()

(2345796, 42)


Unnamed: 0_level_0,q1_hash,q2_hash,q1_freq,q2_freq,PROB_BOTH_SHARE_TOPIC_0,PROB_BOTH_SHARE_TOPIC_1,PROB_BOTH_SHARE_TOPIC_2,PROB_BOTH_SHARE_TOPIC_3,PROB_BOTH_SHARE_TOPIC_4,PROB_BOTH_SHARE_TOPIC_5,...,what_q2,when_q2,where_q2,why_q2,how_q2,difference_between_q2,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2)
test_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,537361,2680914,1,1,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,4.5e-05,...,False,False,False,True,False,False,11,14,57,68
1,537362,1258711,2,2,8.3e-05,0.00175,8.3e-05,0.03675,8.3e-05,8.3e-05,...,False,False,False,False,True,False,14,7,66,43
2,537363,2680915,1,1,0.002625,0.000125,0.000125,0.000125,0.000125,0.000125,...,True,False,False,False,False,False,14,6,60,29
3,537364,2680916,1,1,0.000417,0.000417,0.000417,0.00875,0.000417,0.000417,...,True,False,False,False,False,False,4,3,27,17
4,537365,2680917,1,1,0.1225,0.000278,0.000278,0.000278,0.000278,0.000278,...,False,False,False,False,True,False,4,6,32,30


In [37]:
# Shape of the 'is_duplicate' target column that was popped from X_train.
y_train.shape

(404290,)

## Save final feature sets

In [41]:
# Write each combined feature matrix to a CSV file, using a path relative to the
# current working directory.
for frame, filename in ((X_train, 'x_train.csv'), (X_test, 'x_test.csv')):
    frame.to_csv(filename)

In [42]:
# Save the target column. index=False drops the id, so y_train.csv lines up with
# x_train.csv by row position only.
# NOTE(review): `header` is left at the pandas default for Series.to_csv, which may
# differ between pandas versions - confirm what downstream readers expect.
y_train.to_csv('y_train.csv', index=False)