### **LIBRARIES**

In [1]:
import os 

### **DATA PATHS, GLOBAL VARIABLES**

In [18]:
# Root folder that every dataset path below is resolved against.
DATA_DIR = "data/"

# Each assignment below binds the (train, validate, test) paths of one dataset.

# Full Model 1 datasets: the input that the experiment datasets are cut from.
FILE_MODEL1_TRAIN, FILE_MODEL1_VALIDATE, FILE_MODEL1_TEST = (
    os.path.join(DATA_DIR, "final/model1_train.txt"),
    os.path.join(DATA_DIR, "final/model1_validate.txt"),
    os.path.join(DATA_DIR, "final/model1_test.txt"),
)

# BASELINE -- field: WORDS only
FILE_BASE_TRAIN, FILE_BASE_VALIDATE, FILE_BASE_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/base_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/base_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/base_test.txt"),
)

# COMBINATIONS OF FIELDS (every combination includes the baseline, i.e. feature 1)

# Fields: WORDS + POS TAGS only
FILE_POS_TRAIN, FILE_POS_VALIDATE, FILE_POS_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/pos_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/pos_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/pos_test.txt"),
)

# Fields: WORDS + POS BIGRAMS only
FILE_BIGRAM_TRAIN, FILE_BIGRAM_VALIDATE, FILE_BIGRAM_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/bigram_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/bigram_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/bigram_test.txt"),
)

# Fields: WORDS + POS TRIGRAMS only
FILE_TRIGRAM_TRAIN, FILE_TRIGRAM_VALIDATE, FILE_TRIGRAM_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/trigram_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/trigram_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/trigram_test.txt"),
)

# Fields: WORDS + PRODUCTIONS only
FILE_PROD_TRAIN, FILE_PROD_VALIDATE, FILE_PROD_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/prod_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/prod_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/prod_test.txt"),
)

# Fields: WORDS + POS BIGRAMS + POS TRIGRAMS only
FILE_BIGRAM_TRIGRAM_TRAIN, FILE_BIGRAM_TRIGRAM_VALIDATE, FILE_BIGRAM_TRIGRAM_TEST = (
    os.path.join(DATA_DIR, "experiment_model1/bigram_trigram_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/bigram_trigram_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/bigram_trigram_test.txt"),
)

# Fields: WORDS + POS + POS BIGRAMS + POS TRIGRAMS only
(
    FILE_POS_BIGRAM_TRIGRAM_TRAIN,
    FILE_POS_BIGRAM_TRIGRAM_VALIDATE,
    FILE_POS_BIGRAM_TRIGRAM_TEST,
) = (
    os.path.join(DATA_DIR, "experiment_model1/pos_bigram_trigram_train.txt"),
    os.path.join(DATA_DIR, "experiment_model1/pos_bigram_trigram_validate.txt"),
    os.path.join(DATA_DIR, "experiment_model1/pos_bigram_trigram_test.txt"),
)

In [19]:
def get_data_pos(data_type):
    """
    For Model 1, define the file paths for the dataset with parts of speech features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to (WORDS + POS TAGS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_POS_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_POS_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_POS_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [20]:
def get_data_bigram(data_type):
    """
    For Model 1, define the file paths for the dataset with POS bigram features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to (WORDS + POS BIGRAMS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_BIGRAM_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_BIGRAM_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_BIGRAM_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [21]:
def get_data_trigram(data_type):
    """
    For Model 1, define the file paths for the dataset with POS trigram features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to (WORDS + POS TRIGRAMS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_TRIGRAM_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_TRIGRAM_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_TRIGRAM_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [22]:
def get_data_prod(data_type):
    """
    For Model 1, define the file paths for the dataset with production features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to (WORDS + PRODUCTIONS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_PROD_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_PROD_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_PROD_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [23]:
def get_data_bigram_trigram(data_type):
    """
    For Model 1, define the file paths for the dataset with POS bigram and POS trigram features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to
            (WORDS + POS BIGRAMS + POS TRIGRAMS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_BIGRAM_TRIGRAM_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_BIGRAM_TRIGRAM_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_BIGRAM_TRIGRAM_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [24]:
def get_data_pos_bigram_trigram(data_type):
    """
    For Model 1, define the file paths for the dataset with POS tag, POS bigram
    and POS trigram features
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to
            (WORDS + POS + POS BIGRAMS + POS TRIGRAMS dataset)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_POS_BIGRAM_TRIGRAM_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_POS_BIGRAM_TRIGRAM_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_POS_BIGRAM_TRIGRAM_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [25]:
def get_data_base(data_type):
    """
    For Model 1, define the file paths for the dataset with features for baseline model
    Parameters:
        data_type: One of the following strings:
            train, validate, test
    Return:
        FILE_READ: path of file to be read (full Model 1 dataset)
        FILE_WRITE: path of file to write to (baseline dataset, WORDS only)
    Raises:
        ValueError: if data_type is not one of the strings listed above
    """
    if data_type == 'train':
        return FILE_MODEL1_TRAIN, FILE_BASE_TRAIN

    if data_type == 'validate':
        return FILE_MODEL1_VALIDATE, FILE_BASE_VALIDATE

    if data_type == 'test':
        return FILE_MODEL1_TEST, FILE_BASE_TEST

    # An unknown value used to fall through every branch and fail at the
    # return statement with an UnboundLocalError; name the bad argument instead.
    raise ValueError(
        "data_type must be 'train', 'validate' or 'test', got {!r}".format(data_type)
    )

In [26]:
def experiment_dataset(FILE_READ, FILE_WRITE, model):
    """
    Copy a Model 1 dataset, keeping only the token columns that belong to
    the requested experiment.

    Parameters:
        FILE_READ: file path of dataset to be read
        FILE_WRITE: file path of dataset to write to
        model: which feature combination to keep; one of the strings
            base, pos, bi, tri, bitri, pos_bitri, prod

    Raises:
        ValueError: if model is not one of the strings listed above

    A Model 1 record (whitespace-separated tokens):
    index 0: 0
    index 1: qid:9007
    index 2: 1:8.108303

    index 3:   2:1.7645977       pos
    index 4:   3:1.3150331       bi
    index 5:   4:0.0             tri
    index 6:   5:4.730212       prod

    index 7: #
    index 8: docid:436

    Tokens 0, 1, 2, 7 and 8 are copied for every model; tokens 3-6 are copied
    only when the model includes them. Tokens past index 8 are dropped.
    Each output record is the kept tokens joined by single spaces and
    terminated by a newline. Blank input lines are skipped.
    """
    # Optional/shared token positions kept for each experiment (token 0 and
    # token 8 are handled separately below because they are always copied).
    columns_by_model = {
        'base': (1, 2, 7),
        'pos': (1, 2, 3, 7),
        'bi': (1, 2, 4, 7),
        'tri': (1, 2, 5, 7),
        'bitri': (1, 2, 4, 5, 7),
        'pos_bitri': (1, 2, 3, 4, 5, 7),
        'prod': (1, 2, 6, 7),
    }
    if model not in columns_by_model:
        # Fail before touching any file instead of hitting an unbound
        # index list half-way through the records.
        raise ValueError(
            "model must be one of {}, got {!r}".format(
                sorted(columns_by_model), model
            )
        )
    keep = columns_by_model[model]

    # The input is read completely before the output file is opened, so the
    # call stays safe even if both paths point at the same file.
    output_lines = []
    with open(FILE_READ, 'r') as fread:
        for record in fread:
            tokens = record.split()
            if not tokens:
                # A blank line carries no record; skipping it keeps the
                # output aligned (it used to shift every later record).
                continue

            kept = [tokens[0]]
            kept.extend(tokens[k] for k in keep if k < len(tokens))
            if len(tokens) > 8:
                kept.append(tokens[8])

            # Always terminate the record, even when it is shorter than the
            # nine tokens described above, so records never run together.
            output_lines.append(' '.join(kept) + '\n')

    with open(FILE_WRITE, 'w') as fwrite:
        fwrite.writelines(output_lines)

### **RUN!**

In [11]:
# Build the WORDS + POS TAGS datasets for the train, validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_pos, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'pos')

In [12]:
# Build the WORDS + POS BIGRAMS datasets for the train, validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_bigram, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'bi')

In [13]:
# Build the WORDS + POS TRIGRAMS datasets for the train, validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_trigram, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'tri')

In [14]:
# Build the WORDS + PRODUCTIONS datasets for the train, validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_prod, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'prod')

In [15]:
# Build the WORDS + POS BIGRAMS + POS TRIGRAMS datasets for the train,
# validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_bigram_trigram, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'bitri')

In [16]:
# Build the WORDS + POS + POS BIGRAMS + POS TRIGRAMS datasets for the train,
# validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_pos_bigram_trigram, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'pos_bitri')

In [27]:
# Build the baseline (WORDS only) datasets for the train, validate and test splits.
for FILE_READ, FILE_WRITE in map(get_data_base, ('train', 'validate', 'test')):
    experiment_dataset(FILE_READ, FILE_WRITE, 'base')