In [None]:
from time import time
from itertools import combinations
from sklearn import preprocessing

import scipy as sp, numpy as np, pandas as pd
import zipfile

def concat(df, columns):
    """Glue the given columns together row by row.

    Each column in ``columns`` is converted to a string array; the strings
    of one row are then joined with no separator.  The result is a NumPy
    array holding one joined string per row of ``df``.
    """
    text_columns = [np.array(df[name].values, dtype=str) for name in columns]
    # zip(*...) walks the columns in lockstep, yielding one tuple per row.
    joined_rows = [''.join(parts) for parts in zip(*text_columns)]
    return np.array(joined_rows)

def prepare_data(shuffle=True):
    """Load the Allstate quote data, clean it and engineer features.

    The train and test archives are extracted into the current working
    directory and read with ``customer_ID`` as index.  Train quotes
    (``record_type == 0``) are joined with the options of the customer's
    purchase record (``record_type == 1``), which become the ``*_f`` target
    columns.  Train and test rows are then stacked so that NA filling, label
    encoding, the rare-location substitution and the derived cost features
    are computed over both sets, and finally split apart again.

    Parameters
    ----------
    shuffle : bool, default True
        When true, the training rows are reordered by a per-customer random
        key (seeded with 9) and then by ``shopping_pt``, so each customer's
        quotes stay together and in order.

    Returns
    -------
    tuple
        ``(data, test, con, cat, extra, conf, conf_f, encoders)`` where
        ``data`` / ``test`` are the training / test frames, ``con``, ``cat``,
        ``extra``, ``conf`` and ``conf_f`` are lists of column names
        (continuous, categorical, derived, quoted options, purchased options)
        and ``encoders`` maps each label-encoded column to its fitted
        ``LabelEncoder``.
    """
    # Local import: the module-level ``import os`` lives in a later cell, so
    # the function must not rely on it having been executed already.
    import os

    dirname = '../input/allstate-purchase-prediction-challenge'
    zipf_test = 'test_v2.csv.zip'
    csvf_test = 'test_v2.csv'
    zipf_train = 'train.csv.zip'
    csvf_train = 'train.csv'
    # Context managers make sure the archive handles are closed again.
    with zipfile.ZipFile(os.path.join(dirname, zipf_test)) as archive:
        test_ex = archive.extract(csvf_test)
    with zipfile.ZipFile(os.path.join(dirname, zipf_train)) as archive:
        train_ex = archive.extract(csvf_train)

    test = pd.read_csv(test_ex).set_index('customer_ID')
    alldata = pd.read_csv(train_ex).set_index('customer_ID')

    # handy lists of features
    con = ['group_size', 'car_age', 'age_oldest', 'age_youngest',
           'duration_previous', 'cost']
    cat = ['homeowner', 'car_value', 'risk_factor', 'married_couple',
           'C_previous', 'state', 'location', 'shopping_pt']
    conf = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    conf_f = [col + '_f' for col in conf]

    # Attach the purchased options (suffix "_f") to every row of the same
    # customer, then keep only the quote rows.
    final_purchase = alldata[alldata.record_type == 1]
    data = alldata.join(final_purchase[conf], rsuffix='_f')
    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the joined frame.
    data = data[data.record_type == 0].copy()

    data['conf'] = concat(data, conf_f)       # purchased plan as one string
    data['conf_init'] = concat(data, conf)    # quoted plan as one string

    encoders = dict()
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
    # the same way.  Test rows end up with NaN in 'conf' and the *_f columns.
    data = pd.concat([data, test])

    # Fix NAs (assign back instead of chained in-place fillna, which does
    # not reliably modify the parent frame).
    data['C_previous'] = data['C_previous'].fillna(0)
    data['duration_previous'] = data['duration_previous'].fillna(0)
    data['location'] = data['location'].fillna(-1)

    # Transform data to numerical data
    for col in ['car_value', 'risk_factor', 'state']:
        encoders[col] = preprocessing.LabelEncoder()
        # risk_factor gets the number 99 as NA marker, the other two columns
        # get the string '99', before being label-encoded.
        na_marker = 99 if col == 'risk_factor' else '99'
        data[col] = data[col].fillna(na_marker)
        data[col] = encoders[col].fit_transform(data[col])

    print('Location substitution:')
    # Replace rare locations by the (encoded) state: a location counts as
    # rare when it occurs fewer than 5 times among the shopping_pt == 2 rows
    # of the stacked train + test data.
    location_counts = data[data.shopping_pt == 2].location.value_counts()
    sub = data.location.map(location_counts).fillna(0) < 5
    data.loc[sub, 'location'] = data.state[sub]

    # Derived values
    extra = []
    data['caCost'] = data.cost / (data.car_age + 1)   # cost per car_age
    data['ppCost'] = data.cost / data.group_size      # cost per person
    data['stCost'] = data.state.map(data.groupby('state')['cost'].mean())  # mean cost per state
    extra.extend(['caCost', 'ppCost', 'stCost'])

    # average quote cost by G values
    data['costG'] = data['G'].map(data.groupby('G')['cost'].mean())
    extra.append('costG')

    # average quote cost by G & state values, merged back on both keys
    cost_by_g_state = data.groupby(['G', 'state'])['cost'].mean().to_frame('costStG')
    data = data.merge(cost_by_g_state, left_on=['G', 'state'],
                      right_index=True, how='left')
    extra.append('costStG')

    # previous G: the G of the preceding row; rows with shopping_pt == 1 have
    # no earlier quote of their own, so they fall back to their own G.
    data['prev_G'] = data.G.shift(1)
    extra.append('prev_G')
    data.loc[data.shopping_pt == 1, 'prev_G'] = data.loc[data.shopping_pt == 1, 'G']

    # separating training & test data: test rows have no purchased plan.
    # Copies keep later column assignments free of SettingWithCopy issues.
    is_test = data.conf.isnull()
    test = data[is_test].copy()
    data = data[~is_test].copy()

    # SHUFFLE THE DATASET, keeping the same customer's transactions in order
    if shuffle:
        print("Shuffling dataset...")
        np.random.seed(9)
        ids = np.unique(data.index.values)
        rands = pd.Series(np.random.random_sample(len(ids)), index=ids)
        data['rand'] = data.reset_index()['customer_ID'].map(rands).values
        data = data.sort_values(by=['rand', 'shopping_pt'])

    # The *_f columns are cast to int8 for the training rows; the original
    # comment attributes the non-integer dtype to the empty values
    # contributed by the test set.
    for col in conf_f:
        data[col] = np.array(data[col].values, dtype=np.int8)

    return data, test, con, cat, extra, conf, conf_f, encoders

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import operator
from sklearn.ensemble import RandomForestClassifier
from time import time

# initialize data
data,test,con,cat,extra,conf,conf_f,encoders = prepare_data()
print("Prepare Data...")
data = data[data.shopping_pt >= 1]

# features, target: predict the purchased G option from the quote features
X = data[con+cat+conf+extra]
y = data['G_f']

# random forest hyper-parameters
ntree = 50
maxfea = 5
leafsize = 23

rfs = RandomForestClassifier(n_estimators=ntree, max_features=maxfea, min_samples_leaf=leafsize, random_state=42)

# fit the random forest on every training quote
print("fit...")
rfs.fit(X, y)

# MAKE SUBMISSION
# Keep only each customer's last quote (highest shopping_pt).  The per-row
# maximum comes straight from a group-wise transform, which returns a Series
# aligned with test's own index; the previous pd.concat(axis=1) detour relied
# on two duplicate-valued indexes being identical.
test = test.assign(shopping_pt_max=test.groupby(level=0)['shopping_pt'].transform('max'))
# .copy() so the prediction columns added below go to an independent frame
# rather than a filtered slice.
test = test[test.shopping_pt == test.shopping_pt_max].copy()

Xt = test[con+cat+conf+extra]

# TEST SET PREDICTION
print("now predicting on test set...")
allpreds = rfs.predict(Xt)
test['pG'] = allpreds

# Build the plan string from the quoted A-F options plus the predicted G.
# NOTE(review): the previous comment mentioned fixing state-law products, but
# no such correction is applied in this script.
test['plan'] = concat(test,['A','B','C','D','E','F','pG'])
test['plan'].to_csv('Allstate_Purchase_Prediction_Challenge.csv', header=True)