In [20]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Feature combiner

Take all the datasets we've created and combine into single vectors for train and test

## Load datasets

In [21]:
dataset_folder = os.path.join(os.pardir, 'Datasets')
feature_files = [
    'FEATURE_entity_counts_{}.csv',
    'FEATURE_magic_feature_{}.csv',
    'FEATURE_topic_sharing_{}.csv',
    'FEATURE_question_type_features_{}.csv',
    'FEATURE_feature_tools_trans_{}.csv',
]

In [22]:
X_train = pd.read_csv(os.path.join(dataset_folder, 'train.csv'), index_col='id')
X_test = pd.read_csv(os.path.join(dataset_folder, 'test.csv'), nrows=2345796, index_col='test_id')

train_shape = X_train.shape
test_shape = X_test.shape

  mask |= (ar1 == a)


In [27]:
for feature_set in feature_files:
    train_feature_set = feature_set.format('train')
    test_feature_set = feature_set.format('test')
    
    try:
        X_train_feature = pd.read_csv(os.path.join(dataset_folder, train_feature_set), index_col='id')
        try:
            X_test_feature = pd.read_csv(os.path.join(dataset_folder, test_feature_set), 
                                         index_col='test_id', nrows=2345796)
        except ValueError as e:
            X_test_feature = pd.read_csv(os.path.join(dataset_folder, test_feature_set), 
                                         index_col='id', nrows=2345796)           
        
        assert X_train_feature.index.nunique() == X_train_feature.shape[0], \
            'Index not unique in feature set {}'.format(train_feature_set)
        assert X_test_feature.index.nunique() == X_test_feature.shape[0], \
            'Index not unique in feature set {}'.format(test_feature_set)
        
    except FileNotFoundError as e:
        print('Can\'t add feature {} as train and test sets don\'t both exist.'.format(feature_set))
        continue
        
    X_train = X_train.merge(X_train_feature, left_index=True, right_index=True, how='left')
    X_test = X_test.merge(X_test_feature, left_index=True, right_index=True, how='left')
        
    assert X_train.shape[0] == train_shape[0], 'Rows added'
    assert X_test.shape[0] == test_shape[0], 'Rows added'
    
y_train = X_train.pop('is_duplicate')

Can't add feature FEATURE_entity_counts_{}.csv as train and test sets don't both exist.


  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


KeyError: 'is_duplicate'

In [28]:
X_train

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate_x,Unnamed: 0,q1_hash,q2_hash,q1_freq,q2_freq,...,what_q2,when_q2,where_q2,why_q2,how_q2,difference_between_q2,NUM_WORDS(question1),NUM_WORDS(question2),NUM_CHARACTERS(question1),NUM_CHARACTERS(question2)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0,0,0,290457,1,2,...,True,False,False,False,False,False,14,12,66,57
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0,1,1,16739,8,3,...,True,False,False,False,False,False,8,13,51,88
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0,2,2,290458,2,1,...,False,False,False,False,True,False,14,10,73,59
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0,3,3,290459,1,1,...,False,True,False,False,False,False,11,9,50,65
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0,4,4,290460,3,1,...,False,False,False,False,False,False,13,7,76,39
5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1,5,5,290461,1,1,...,True,False,False,False,False,False,16,16,86,90
6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0,6,6,290462,1,1,...,True,False,False,False,False,False,4,11,19,62
7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1,7,7,290463,1,1,...,True,False,False,False,False,False,7,9,30,41
8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0,8,8,273573,2,3,...,False,True,False,False,False,False,8,8,31,37
9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0,9,9,290464,1,1,...,False,False,False,False,True,False,9,9,60,49


In [None]:
train_metadata = ['qid1', 'qid2', 'question1', 'question2']
X_train.drop(columns=train_metadata, inplace=True)
test_metadata = ['question1', 'question2']
X_test.drop(columns=test_metadata, inplace=True)

In [None]:
print(X_train.shape)
X_train.head()

In [None]:
print(X_test.shape)
X_test.head()

In [None]:
y_train.shape

## Save final feature sets

In [None]:
X_train.to_csv('x_train.csv')
X_test.to_csv('x_test.csv')

In [None]:
y_train.to_csv('y_train.csv', index=False)