# Naive Bayes Third Round
26/11/2016  
ironbar

I'm going to use the newly split dataset to train a Naive Bayes model. I will be using a class for the dataset, which will later help me to use a unified class for the model. 

## Load the dataset

In [1]:
#Imports
import numpy as np
from sklearn.naive_bayes import BernoulliNB
import time
import gzip
import warnings
warnings.filterwarnings("ignore")

from dataset import SantanderDataset
from average_precision import mapk
from genetic_search import genetic_search

In [2]:
# Root folder of the competition data (machine-specific absolute path).
# It is also used below as the base folder for the logs and the submissions.
dataset_root = '/mnt/F25663CB56638EE3/Kaggle/Santander Product Recommendation/'
# Module-level dataset instance, the functions below read it as a global
dataset = SantanderDataset(dataset_root)

It took 3 seconds to load the dataset
It took 4 seconds to load the dataset
It took 6 seconds to load the dataset
It took 2 seconds to load the dataset
1375686 1375686
561234 561234


I have been testing the class and it seems to be working fine. 
When loaded the dataset is using only 500MB of RAM.

## Testing with Naive Bayes

In [3]:
def train_bnb_model(msg):
    """
    Fits a Bernoulli naive bayes model with the training data described by msg
    
    msg: dict with the configuration. A copy of it with 'train' set to True
        is forwarded to dataset.get_data, the original dict is not modified.
        The keys used in this notebook are:
        month: int or list with the number of the month we want
            the data to be taken of. If missing, 'train_month' is used
        input_columns: a list with the name of the columns we are going
            to use in the task
        use_product: bool, if true adds the product columns of the month before
        use_change: bool, if true adds the change columns of the month before
    
    Returns the fitted model
    """
    request = msg.copy()
    if 'month' not in request:
        request['month'] = request['train_month']
    request['train'] = True
    #Only the first two elements returned by the dataset are needed for training
    input_data, output_data = dataset.get_data(request)[0:2]
    #Fit the model, the labels are declared as the integers from 0 to 23
    model = BernoulliNB(alpha=1e-2)
    model.partial_fit(input_data, output_data, classes=range(24))
    return model

In [4]:
def create_prediction(bnb, msg):
    """
    Ranks the products for every sample of the evaluation data described
    by msg and keeps the 7 best ones
    
    bnb: trained model, only its predict_proba method is used
    msg: dict with the configuration. A copy of it with 'train' set to False
        is forwarded to dataset.get_data, the original dict is not modified.
        The keys used in this notebook are:
        month: int or list with the number of the month we want
            the data to be taken of. If missing, 'eval_month' is used
        input_columns: a list with the name of the columns we are going
            to use in the task
        use_product: bool, if true adds the product columns of the month before
        use_change: bool, if true adds the change columns of the month before
    
    Returns the predictions (7 product indexes per sample, best first) and
    the output data given by the dataset
    """
    request = msg.copy()
    if 'month' not in request:
        request['month'] = request['eval_month']
    request['train'] = False
    input_data, output_data, previous_products = dataset.get_data(request)
    #Set to zero the probability of the entries where previous_products is not 0
    #(presumably the products the customer already had, verify in the dataset class)
    probabilities = bnb.predict_proba(input_data)
    allowed = np.equal(previous_products, 0)
    masked_probabilities = allowed * probabilities
    #argsort is ascending, so reverse the columns and keep the first 7
    ranking = np.argsort(masked_probabilities, axis=1)[:, ::-1]
    return ranking[:, 0:7], output_data

In [53]:
def naive_bayes_workflow(msg):
    """
    Implements all the steps of training and evaluating a naive bayes classifier
    Returns the scores (a list with one score per evaluation month) and
    the trained model
    
    msg: dict with the configuration, it is not modified by this function
    train_month: int or list with the number of the month we want
        the data to be taken of for training 
    eval_month: int or list with the number of the month we want
        the data to be taken of for testing. A single value is handled
        as a list with one element
    input_columns: a list with the name of the columns we are going to use
        in the task
    use_product: bool, if true adds the product columns of the month before
    use_change: bool, if true adds the change columns of the month before
    """
    eval_months = msg['eval_month']
    if not isinstance(eval_months, list):
        eval_months = [eval_months]
    #Work on a copy so the dict of the caller keeps its original eval_month
    config = msg.copy()
    config['eval_month'] = eval_months
    #Train the model
    bnb = train_bnb_model(config)
    scores = []
    for month in eval_months:
        #Each evaluation month gets its own copy with the month fixed
        month_config = config.copy()
        month_config['month'] = month
        #Create prediction
        predictions, output_data = create_prediction(bnb, month_config)
        #Get the score
        scores.append(mapk(output_data, predictions))
    
    return scores, bnb

In [19]:
#Baseline experiment: train and evaluate on month 5 without extra input
#columns, only use_product is enabled.
#Prints the list of scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': 5,
       'eval_month': 5,
      'input_columns': [],
      'use_product': True,
      'use_change': False}
print naive_bayes_workflow(msg)[0]
print time.time()-start_time

[0.70998466756819612]
0.263981103897


In [16]:
#Same experiment on month 5 but adding all the categorical columns
#of the dataset as input.
#Prints the list of scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': 5,
       'eval_month': 5,
      'input_columns': dataset.categorical_columns,
      'use_product': True,
      'use_change': False}
print naive_bayes_workflow(msg)[0]
print time.time()-start_time

[0.70271425855404612]
1.11327409744


In [17]:
#Train with the months 1 to 16 (range(1,17)) and all the categorical
#columns, evaluate on month 5.
#Prints the list of scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': range(1,17),
       'eval_month': 5,
      'input_columns': dataset.categorical_columns,
      'use_product': True,
      'use_change': False}
print naive_bayes_workflow(msg)[0]
print time.time()-start_time

[0.6443314161539575]
13.2391889095


In [31]:
#Train on month 5 without extra input columns and evaluate on two
#months (5 and 16), so two scores are printed.
#Prints the list of scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': 5,
       'eval_month': [5,16],
      'input_columns': [],
      'use_product': True,
      'use_change': False}
print naive_bayes_workflow(msg)[0]
print time.time()-start_time

[0.70998466756819612, 0.69814177663890675]
0.414514064789


In [36]:
#Train with a hand-written subset of months and input columns and
#evaluate on months 5 and 16.
#Prints the list of scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': [ 1,  2,  3,  6,  9, 12, 14, 15],
       'eval_month': [5,16],
      'input_columns': ['pais_residencia', 'age', 'antiguedad', 'tiprel_1mes',
                        'indresi', 'indext', 'canal_entrada', 'nomprov', 'renta',
                        'segmento', 'month'],
      'use_product': True,
      'use_change': False}
print naive_bayes_workflow(msg)[0]
print time.time()-start_time

[0.40694810750308774, 0.47310000284717768]
3.57399702072


This is very good, it seems to be working correctly and it's much faster than the other implementations, in the worst case I will be running 4 experiments per minute. And in the best case it will run 240 experiments per minute.

In the previous implementation the average speed was 2 experiments per minute.
So we get a speedup of between 2x and 120x.

## Genetic algorithm search

I'm going to launch a new search to see if I can get the same results or better.

In [45]:
def get_genomic_score(test_month, filename, genome, verbose=False):
    """
    Decodes a binary genome into a training configuration, trains and
    evaluates a naive bayes model with it and appends the result to a log file
    
    test_month: int or list with the months used for evaluation
    filename: name without extension of the log file, the file is
        dataset_root + 'logs/<filename>.log'
    genome: sequence of 0/1 genes with the following layout, being n the
        number of columns in dataset.categorical_columns (18 according to
        the original notes, which gives a genome of 34 or 35 genes)
        genome[0:15]: one gene for each of the months 1-4 and 6-16, the
            months with a 1 are used for training. Month 5 is never used.
            If no month is selected a random one is taken from that set
        genome[15:15+n]: one gene for each categorical column, the columns
            with a 1 are used as input
        genome[15+n]: use_change
        genome[16+n]: use_product. This gene is optional, if the genome
            does not have it use_product is set to True
    verbose: bool, if true prints the genome and the decoded configuration
    
    Returns a list with one score for each evaluation month
    Raises ValueError if the genome is too short to hold the use_change gene
    """
    if verbose:
        print(genome)
    #Months that can be used for training: from 1 to 16 excluding 5
    candidate_months = list(range(1, 5)) + list(range(6, 17))
    n_months = len(candidate_months)
    categorical_columns = dataset.categorical_columns
    #Position of the genes that come after the month and column genes
    change_index = n_months + len(categorical_columns)
    product_index = change_index + 1
    if len(genome) <= change_index:
        raise ValueError('The genome needs at least %i genes but it has %i'
                         % (change_index + 1, len(genome)))
    #Decide which train months to use
    month_genes = np.array(genome[0:n_months])
    if np.sum(month_genes) > 0:
        train_month = np.array(candidate_months)[month_genes == 1]
    else:
        #No month selected: take a random one among the candidates
        random_index = np.random.randint(0, n_months, 1)[0]
        train_month = [candidate_months[random_index]]
    if verbose:
        print('train_month %s' % (train_month,))
    #Decide which categorical input columns to use
    column_genes = np.array(genome[n_months:change_index])
    input_columns = [column for column, gene
                     in zip(categorical_columns, column_genes) if gene == 1]
    if verbose:
        print('input_columns %s' % (input_columns,))
    #Decide on using change columns and product as input
    use_change = genome[change_index] == 1
    #This allows to use a shorter genome to fix some properties
    if len(genome) > product_index:
        use_product = genome[product_index] == 1
    else:
        use_product = True
    #Build message for training 
    msg = {'train_month': list(train_month),
           'eval_month': test_month,
           'input_columns': input_columns,
           'use_product': use_product,
           'use_change': use_change,
    }
    if verbose:
        print(msg)
    scores = naive_bayes_workflow(msg)[0]
    #Print and save to file: scores, flags, input columns and train months
    text = '\t'.join([str(score) for score in scores]) + '\t'
    text += '%s\t%s\t' % (use_change, use_product)
    if verbose:
        print(text)
    text += "','".join(input_columns)
    text += "\t" + ",".join([str(month) for month in train_month])
    text += '\n'
    with open(dataset_root + 'logs/%s.log' % filename, 'a') as f:
        f.write(text)
        
    return scores

In [48]:
#Define evaluation function
def eval_function_1(individual):
    """
    Fitness function for the genetic search
    Evaluates the individual on months 5 and 16, logging the result to
    genetic_search_6, and returns only the first score (month 5) as a list
    """
    scores = get_genomic_score(
        [5, 16], 'genetic_search_6', individual, verbose=False)
    return scores[:1]

In [49]:
#Launch the search with eval_function_1 as fitness and genomes of 35 genes,
#the size at which get_genomic_score also reads the use_product gene.
#The third argument is presumably the population size, verify in genetic_search
print 'Starting genetic search'
genetic_search(eval_function_1, 35, 20)

Starting genetic search
gen	nevals	avg     	min     	max     
0  	20    	0.545802	0.209659	0.724581
1  	15    	0.669912	0.474358	0.74746 
2  	15    	0.700577	0.390444	0.736404
3  	14    	0.719218	0.589063	0.74221 
4  	7     	0.733946	0.720324	0.762314
5  	14    	0.746143	0.722385	0.762739
6  	12    	0.755809	0.709811	0.768207
7  	11    	0.743561	0.582151	0.768207
8  	12    	0.756928	0.653292	0.768207
9  	13    	0.76731 	0.762362	0.768207
10 	16    	0.745436	0.615905	0.774615
Best individual is: [0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1]
with fitness: (0.77461455927031386,)


([[0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   0,
   1,
   1,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   1],
  [0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   1],
  [0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   1],
  [0,
   0,
   0,
   0,
   1,
   0,
   1,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   0,
   1,
   0,
   0,
   1,
   1],
  [0,
   0,
   0,
   1,
   1,
   0,
   0,
   0,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   1,
   0,
   1,
   0,
   0,
   1,
   0,
   1,
   1,
   1,


0.774615 is 0.04 points better than my previous best score, so I have to make a submission with this configuration.

### Optimizing for month 5 and 16
That search took only a few minutes, so I'm thinking of launching a search with a population of 10x size_vector, which was the recommended parameter.  
Moreover, I think a good approach will be to maximize the sum of the scores of months 5 and 16, because we have discovered that both are important. If I do so, I can't use month 16 for training.

In [58]:
def get_genomic_score(test_month, filename, genome, verbose=False):
    """
    Decodes a binary genome into a training configuration, trains and
    evaluates a naive bayes model with it and appends the result to a log file
    In this version neither month 5 nor month 16 can be used for training
    
    test_month: int or list with the months used for evaluation
    filename: name without extension of the log file, the file is
        dataset_root + 'logs/<filename>.log'
    genome: sequence of 0/1 genes with the following layout, being n the
        number of columns in dataset.categorical_columns (18 according to
        the original notes, which gives a genome of 33 or 34 genes)
        genome[0:14]: one gene for each of the months 1-4 and 6-15, the
            months with a 1 are used for training.
            If no month is selected a random one is taken from that set
        genome[14:14+n]: one gene for each categorical column, the columns
            with a 1 are used as input
        genome[14+n]: use_change
        genome[15+n]: use_product. This gene is optional, if the genome
            does not have it use_product is set to True
    verbose: bool, if true prints the genome and the decoded configuration
    
    Returns a list with one score for each evaluation month
    Raises ValueError if the genome is too short to hold the use_change gene
    """
    if verbose:
        print(genome)
    #Months that can be used for training: from 1 to 15 excluding 5
    candidate_months = list(range(1, 5)) + list(range(6, 16))
    n_months = len(candidate_months)
    categorical_columns = dataset.categorical_columns
    #Position of the genes that come after the month and column genes
    change_index = n_months + len(categorical_columns)
    product_index = change_index + 1
    if len(genome) <= change_index:
        raise ValueError('The genome needs at least %i genes but it has %i'
                         % (change_index + 1, len(genome)))
    #Decide which train months to use
    month_genes = np.array(genome[0:n_months])
    if np.sum(month_genes) > 0:
        train_month = np.array(candidate_months)[month_genes == 1]
    else:
        #No month selected: take a random one among the candidates
        random_index = np.random.randint(0, n_months, 1)[0]
        train_month = [candidate_months[random_index]]
    if verbose:
        print('train_month %s' % (train_month,))
    #Decide which categorical input columns to use
    column_genes = np.array(genome[n_months:change_index])
    input_columns = [column for column, gene
                     in zip(categorical_columns, column_genes) if gene == 1]
    if verbose:
        print('input_columns %s' % (input_columns,))
    #Decide on using change columns and product as input
    use_change = genome[change_index] == 1
    #This allows to use a shorter genome to fix some properties
    if len(genome) > product_index:
        use_product = genome[product_index] == 1
    else:
        use_product = True
    #Build message for training 
    msg = {'train_month': list(train_month),
           'eval_month': test_month,
           'input_columns': input_columns,
           'use_product': use_product,
           'use_change': use_change,
    }
    if verbose:
        print(msg)
    scores = naive_bayes_workflow(msg)[0]
    #Print and save to file: scores, flags, input columns and train months
    text = '\t'.join([str(score) for score in scores]) + '\t'
    text += '%s\t%s\t' % (use_change, use_product)
    if verbose:
        print(text)
    text += "','".join(input_columns)
    text += "\t" + ",".join([str(month) for month in train_month])
    text += '\n'
    with open(dataset_root + 'logs/%s.log' % filename, 'a') as f:
        f.write(text)
        
    return scores

In [66]:
#Define evaluation function
def eval_function_2(individual):
    """
    Fitness function for the genetic search
    Evaluates the individual on months 5 and 16, logging the result to
    genetic_search_8, and returns half of the sum of the scores as a list
    """
    scores = get_genomic_score(
        [5, 16], 'genetic_search_8', individual, verbose=False)
    total = np.sum(scores)
    return [total/2]

In [67]:
#Launch the search with eval_function_2 as fitness and genomes of 34 genes,
#the size at which get_genomic_score also reads the use_product gene.
#340 is presumably the population size (10x the genome size), verify in genetic_search
print 'Starting genetic search'
genetic_search(eval_function_2, 34, 340)
#Bare print: outputs an empty line after the search
print 

Starting genetic search
gen	nevals	avg     	min     	max   
0  	340   	0.592641	0.188547	0.7812
1  	191   	0.695659	0.215814	0.78298
2  	235   	0.736885	0.458637	0.785192
3  	195   	0.757393	0.467118	0.788487
4  	207   	0.766627	0.477035	0.788897
5  	205   	0.771692	0.496372	0.790543
6  	196   	0.777418	0.582846	0.792135
7  	187   	0.779412	0.432271	0.794846
8  	199   	0.780342	0.468057	0.795329
9  	198   	0.780714	0.58507 	0.795329
10 	224   	0.783707	0.595911	0.795329
Best individual is: [0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1]
with fitness: (0.79532941149303316,)



### Optimizing and training only with month 5
I want to know the maximum score I can get using only month 5 for training. I don't think this will get a good score on the LB, but it will give me a measure of the overfitting capacity of Naive Bayes.

In [68]:
def get_genomic_score(test_month, filename, genome, verbose=False):
    """
    Decodes a binary genome into a training configuration, trains and
    evaluates a naive bayes model with it and appends the result to a log file
    In this version only trains with month 5, so the genome has no month genes
    
    test_month: int or list with the months used for evaluation
    filename: name without extension of the log file, the file is
        dataset_root + 'logs/<filename>.log'
    genome: sequence of 0/1 genes with the following layout, being n the
        number of columns in dataset.categorical_columns (18 according to
        the original notes, which gives a genome of 19 or 20 genes)
        genome[0:n]: one gene for each categorical column, the columns
            with a 1 are used as input
        genome[n]: use_change
        genome[n+1]: use_product. This gene is optional, if the genome
            does not have it use_product is set to True
    verbose: bool, if true prints the genome and the decoded configuration
    
    Returns a list with one score for each evaluation month
    Raises ValueError if the genome is too short to hold the use_change gene
    """
    if verbose:
        print(genome)
    train_month = [5]
    if verbose:
        print('train_month %s' % (train_month,))
    categorical_columns = dataset.categorical_columns
    #Position of the genes that come after the column genes
    change_index = len(categorical_columns)
    product_index = change_index + 1
    if len(genome) <= change_index:
        raise ValueError('The genome needs at least %i genes but it has %i'
                         % (change_index + 1, len(genome)))
    #Decide which categorical input columns to use
    column_genes = np.array(genome[0:change_index])
    input_columns = [column for column, gene
                     in zip(categorical_columns, column_genes) if gene == 1]
    if verbose:
        print('input_columns %s' % (input_columns,))
    #Decide on using change columns and product as input
    use_change = genome[change_index] == 1
    #This allows to use a shorter genome to fix some properties
    if len(genome) > product_index:
        use_product = genome[product_index] == 1
    else:
        use_product = True
    #Build message for training 
    msg = {'train_month': list(train_month),
           'eval_month': test_month,
           'input_columns': input_columns,
           'use_product': use_product,
           'use_change': use_change,
    }
    if verbose:
        print(msg)
    scores = naive_bayes_workflow(msg)[0]
    #Print and save to file: scores, flags, input columns and train months
    text = '\t'.join([str(score) for score in scores]) + '\t'
    text += '%s\t%s\t' % (use_change, use_product)
    if verbose:
        print(text)
    text += "','".join(input_columns)
    text += "\t" + ",".join([str(month) for month in train_month])
    text += '\n'
    with open(dataset_root + 'logs/%s.log' % filename, 'a') as f:
        f.write(text)
        
    return scores

In [70]:
#Define evaluation function
def eval_function_3(individual):
    """
    Fitness function for the genetic search
    Evaluates the individual on months 5 and 16, logging the result to
    genetic_search_9, and returns only the first score (month 5) as a list
    """
    scores = get_genomic_score(
        [5, 16], 'genetic_search_9', individual, verbose=False)
    return scores[:1]

In [71]:
#Launch the search with eval_function_3 as fitness and genomes of 20 genes,
#the size at which get_genomic_score also reads the use_product gene.
#200 is presumably the population size, verify in genetic_search
print 'Starting genetic search'
genetic_search(eval_function_3, 20, 200)
#Bare print: outputs an empty line after the search
print 

Starting genetic search
gen	nevals	avg     	min     	max     
0  	200   	0.722195	0.604088	0.810613
1  	135   	0.767344	0.608128	0.810376
2  	112   	0.796849	0.627525	0.810419
3  	114   	0.802581	0.705432	0.810376
4  	109   	0.80584 	0.711465	0.810585
5  	103   	0.806342	0.710263	0.810609
6  	117   	0.804791	0.636801	0.810627
7  	110   	0.807389	0.706559	0.810743
8  	122   	0.806079	0.624544	0.810764
9  	108   	0.807707	0.715384	0.810973
10 	125   	0.809105	0.719293	0.811014
Best individual is: [0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1]
with fitness: (0.81101371663578969,)



### Optimizing with all the months
The idea is to leave open the option of using all the months for training, and to optimize the sum of the scores of months 5 and 16.  
I have seen that training only with month 5 while optimizing for month 5 was not a good idea. But I think optimizing for both months at the same time will give a greater score, even if I also use those months for training.

In [74]:
def get_genomic_score(test_month, filename, genome, verbose=False):
    """
    Decodes a binary genome into a training configuration, trains and
    evaluates a naive bayes model with it and appends the result to a log file
    In this version every month from 1 to 16 can be used for training
    
    test_month: int or list with the months used for evaluation
    filename: name without extension of the log file, the file is
        dataset_root + 'logs/<filename>.log'
    genome: sequence of 0/1 genes with the following layout, being n the
        number of columns in dataset.categorical_columns (18 according to
        the original notes, which gives a genome of 35 or 36 genes)
        genome[0:16]: one gene for each of the months 1-16, the months
            with a 1 are used for training.
            If no month is selected a random one is taken from that set
        genome[16:16+n]: one gene for each categorical column, the columns
            with a 1 are used as input
        genome[16+n]: use_change
        genome[17+n]: use_product. This gene is optional, if the genome
            does not have it use_product is set to True
    verbose: bool, if true prints the genome and the decoded configuration
    
    Returns a list with one score for each evaluation month
    Raises ValueError if the genome is too short to hold the use_change gene
    """
    if verbose:
        print(genome)
    #Months that can be used for training: from 1 to 16
    candidate_months = list(range(1, 17))
    n_months = len(candidate_months)
    categorical_columns = dataset.categorical_columns
    #Position of the genes that come after the month and column genes
    change_index = n_months + len(categorical_columns)
    product_index = change_index + 1
    if len(genome) <= change_index:
        raise ValueError('The genome needs at least %i genes but it has %i'
                         % (change_index + 1, len(genome)))
    #Decide which train months to use
    month_genes = np.array(genome[0:n_months])
    if np.sum(month_genes) > 0:
        train_month = np.array(candidate_months)[month_genes == 1]
    else:
        #No month selected: take a random one among the candidates
        random_index = np.random.randint(0, n_months, 1)[0]
        train_month = [candidate_months[random_index]]
    if verbose:
        print('train_month %s' % (train_month,))
    #Decide which categorical input columns to use
    column_genes = np.array(genome[n_months:change_index])
    input_columns = [column for column, gene
                     in zip(categorical_columns, column_genes) if gene == 1]
    if verbose:
        print('input_columns %s' % (input_columns,))
    #Decide on using change columns and product as input
    use_change = genome[change_index] == 1
    #This allows to use a shorter genome to fix some properties
    if len(genome) > product_index:
        use_product = genome[product_index] == 1
    else:
        use_product = True
    #Build message for training 
    msg = {'train_month': list(train_month),
           'eval_month': test_month,
           'input_columns': input_columns,
           'use_product': use_product,
           'use_change': use_change,
    }
    if verbose:
        print(msg)
    scores = naive_bayes_workflow(msg)[0]
    #Print and save to file: scores, flags, input columns and train months
    text = '\t'.join([str(score) for score in scores]) + '\t'
    text += '%s\t%s\t' % (use_change, use_product)
    if verbose:
        print(text)
    text += "','".join(input_columns)
    text += "\t" + ",".join([str(month) for month in train_month])
    text += '\n'
    with open(dataset_root + 'logs/%s.log' % filename, 'a') as f:
        f.write(text)
        
    return scores

In [75]:
#Define evaluation function
def eval_function_4(individual):
    """
    Fitness function for the genetic search
    Evaluates the individual on months 5 and 16, logging the result to
    genetic_search_10, and returns half of the sum of the scores as a list
    """
    scores = get_genomic_score(
        [5, 16], 'genetic_search_10', individual, verbose=False)
    total = np.sum(scores)
    return [total/2]

In [77]:
#Launch the search with eval_function_4 as fitness and genomes of 36 genes,
#the size at which get_genomic_score also reads the use_product gene.
#600 is presumably the population size, verify in genetic_search.
#The elapsed time in seconds is printed at the end
print 'Starting genetic search'
start_time = time.time()
genetic_search(eval_function_4, 36, 600)
print time.time()-start_time

Starting genetic search
gen	nevals	avg     	min     	max     
0  	600   	0.618919	0.124393	0.801854
1  	372   	0.718242	0.38189 	0.803338
2  	366   	0.760257	0.39911 	0.801854
3  	345   	0.778502	0.433457	0.807679
4  	377   	0.787126	0.562399	0.807949
5  	345   	0.78831 	0.18722 	0.809957
6  	348   	0.789446	0.42857 	0.811588
7  	353   	0.794426	0.423332	0.815588
8  	353   	0.798246	0.184611	0.815588
9  	361   	0.798675	0.525028	0.815588
10 	382   	0.801234	0.149588	0.815665
Best individual is: [0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
with fitness: (0.81566519477227328,)
21031.625947


In [None]:
0.797194779659	0.794576261656	True	True	age	3,4,5,7,14,15
0.805916016012	0.818480415688	1.6243964317	True	True	pais_residencia','age','indrel','indrel_1mes','indext','segmento','month	1,2,5,6,10,11,16


## Submission

I have to create a submission function, I will reuse the one from the previous notebook.

In [56]:
def create_submission(filename, msg, 
                        verbose=False):
    """
    Trains and evaluates a naive bayes classifier, predicts the products for
    the test month (17) and writes the submission to a gzip compressed csv
    Returns the validation scores (one per evaluation month)
    
    filename: name without extension of the submission, the file is
        dataset_root + 'submissions/<filename>.csv.gz'
    msg: dict with the configuration. The 'month' key needed for the test
        prediction is set on a copy, so it is not added to this dict
    train_month: int or list with the number of the month we want
        the data to be taken of for training 
    eval_month: int or list with the number of the month we want
        the data to be taken of for testing
    input_columns: a list with the name of the columns we are going to use
        in the task
    use_product: bool, if true adds the product columns of the month before
    use_change: bool, if true adds the change columns of the month before
    verbose: bool, if true prints the progress
    """
    test_month = 17
    #Train the model and get validation scores
    scores, bnb = naive_bayes_workflow(msg)
    #Create a prediction. A copy is used because a 'month' key left in the
    #dict of the caller would override 'train_month' if the dict is reused
    test_msg = msg.copy()
    test_msg['month'] = test_month
    predictions = create_prediction(bnb, test_msg)[0]
    #Create the submission text
    if verbose:
        print('Creating text...')
    eval_current = dataset.eval_current
    test_customers = eval_current[eval_current.fecha_dato == test_month].ncodpers
    #Collect the lines in a list and join them once, it avoids growing
    #a big string inside the loop
    lines = ['ncodpers,added_products\n']
    for i, ncodpers in enumerate(test_customers):
        #Each product name is followed by a space, as in the original format
        products = ''.join(['%s ' % dataset.product_columns[j]
                            for j in predictions[i]])
        lines.append('%i,%s\n' % (ncodpers, products))
    text = ''.join(lines)
    #Write to file
    if verbose:
        print('Writing to file...')
    with gzip.open(dataset_root + 'submissions/%s.csv.gz' % filename, 'w') as f:
        f.write(text)
    
    return scores

In [57]:
#Create the submission NaiveBayes_8: trains with months 6, 8, 11, 12, 14
#and 15 and validates on months 5 and 16.
#Prints the validation scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': [ 6,8,11,12,14,15],
       'eval_month': [5,16],
      'input_columns': ['ind_empleado','pais_residencia','age','indrel',
                        'indresi','indext','canal_entrada','renta'],
      'use_product': True,
      'use_change': True}
print create_submission('NaiveBayes_8',msg)
print time.time()-start_time

[0.77461455927031386, 0.79387365791159514]
38.1280529499


I get a LB score of 0.0256351

In [69]:
#Create the submission NaiveBayes_9: trains with months 3, 4, 11 and 15
#and validates on months 5 and 16.
#Prints the validation scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': [ 3,4,11,15],
       'eval_month': [5,16],
      'input_columns': ['ind_empleado','pais_residencia','sexo',
                        'indrel_1mes','indfall','renta','segmento'],
      'use_product': True,
      'use_change': True}
print create_submission('NaiveBayes_9',msg)
print time.time()-start_time

[0.78770011140463214, 0.79456803473556825]
33.0553078651


I get a LB score of 0.0264266

In [73]:
#Create the submission NaiveBayes_10: trains only with month 5
#and validates on months 5 and 16.
#Prints the validation scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': [5],
       'eval_month': [5,16],
      'input_columns': ['pais_residencia','sexo','age','tiprel_1mes',
                        'canal_entrada','indfall','nomprov','renta'],
      'use_product': True,
      'use_change': True}
print create_submission('NaiveBayes_10',msg)
print time.time()-start_time

[0.81097285715238709, 0.71654820129546593]
39.8754160404


I get a LB score of 0.0256338.   
This is a clear example of overfitting: I'm getting a good score on month 5, but because I only train with that month the LB score is no better.

In [None]:
0.797194779659	0.794576261656	True	True	age	3,4,5,7,14,15
0.805916016012	0.818480415688	1.6243964317	True	True	pais_residencia','age','indrel','indrel_1mes','indext','segmento','month	1,2,5,6,10,11,16


In [78]:
#Create the submission NaiveBayes_11: trains with months 1, 2, 5, 6, 10, 11
#and 16 and validates on months 5 and 16, which are also used for training.
#Prints the validation scores and the elapsed time in seconds
start_time = time.time()
msg = {'train_month': [1,2,5,6,10,11,16],
       'eval_month': [5, 16],
      'input_columns': ['pais_residencia','age','indrel','indrel_1mes','indext','segmento','month'],
      'use_product': True,
      'use_change': True}
print create_submission('NaiveBayes_11',msg)
print time.time()-start_time

[0.80591601601215246, 0.81848041568794927]
33.9030869007


I get a LB score of (value not recorded), position 87 in the ranking (top 9%).  
That's very good for Naive Bayes