In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import pylab as pl
import sklearn.cross_validation, sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import scipy.stats
import xgboost as xgb

%matplotlib inline
pd.set_option('display.mpl_style', 'default')
pl.rcParams['figure.figsize'] = (20, 10)

column_dtypes = {'ScanCount': 'int8'}

train = pd.read_csv('train.csv', low_memory=False, dtype=column_dtypes)
test = pd.read_csv('test.csv', low_memory=False, dtype=column_dtypes)
types = np.unique(train.TripType.values)
types_dictionary = {t: i for i, t in  enumerate(types)}

In [2]:
def transform_Upc(data):
    """Split the numeric ``Upc`` column into prefix / manufacturer / product parts.

    The frame is modified in place and also returned. Three columns are added:

    * ``UpcPrefix`` (int8): the digit at the 10**10 position of the code.
    * ``UpcManufacturer`` (int32): the five digits between 10**5 and 10**10.
    * ``UpcProduct`` (int32): the lowest five digits.

    Rows whose ``Upc`` equals the sentinel ``-1`` get ``-1`` in all three
    columns. The original ``Upc`` column is removed afterwards.

    ``Upc`` must not contain NaN (the integer casts would fail); callers are
    expected to fill missing codes with ``-1`` beforehand.
    """
    upc = data.Upc
    part_columns = ['UpcPrefix', 'UpcManufacturer', 'UpcProduct']

    # Whole-column arithmetic instead of a Python lambda per row.
    data['UpcPrefix'] = (upc // 10**10 % 10).astype('int8')
    data['UpcManufacturer'] = ((upc // 10**5) - (upc // 10**10) * 10**5).astype('int32')
    data['UpcProduct'] = (upc % 10**5).astype('int32')

    # `.loc` replaces the removed `.ix` indexer; the sentinel rows would
    # otherwise carry meaningless digits derived from -1.
    data.loc[upc == -1, part_columns] = -1

    del data['Upc']
    return data

def preprocess_columns(data):
    """Fill missing identifiers with -1, split the UPC code and downcast FinelineNumber.

    Missing ``Upc`` and ``FinelineNumber`` values are replaced by ``-1``, the
    ``Upc`` column is expanded by ``transform_Upc`` and ``FinelineNumber`` is
    cast to int16. The frame is updated in place and returned.
    """
    for column in ('Upc', 'FinelineNumber'):
        data[column] = data[column].fillna(-1)

    data = transform_Upc(data)
    data['FinelineNumber'] = data['FinelineNumber'].astype('int16')
    return data

# Apply the same column clean-up to both data sets.
train, test = preprocess_columns(train), preprocess_columns(test)


In [3]:
def get_frequence_feature(feature):
    """Replace every value by its min-max scaled occurrence count.

    Each element of ``feature`` is mapped to the number of times its value
    occurs in ``feature``; the counts are then rescaled linearly so the rarest
    value maps to 0.0 and the most common one to 1.0.

    When all values occur equally often the scaling range is zero; in that
    case 0.0 is returned for every element instead of dividing by zero.
    Missing values are left as NaN.

    Parameters
    ----------
    feature : pandas.Series

    Returns
    -------
    pandas.Series of floats with the same index as ``feature``.
    """
    # Mapping through the value_counts Series is an index lookup, so no
    # intermediate dict or per-element lambda is needed.
    counts = feature.map(feature.value_counts())
    lowest = counts.min()
    span = counts.max() - lowest

    if span == 0:
        # Degenerate case: every value is equally frequent.
        return counts * 0.0
    return (counts - lowest) / span

def encode_features(data):
    """Add a ``<column>Freq`` frequency feature for each identifier column.

    For ``FinelineNumber``, ``UpcManufacturer`` and ``UpcProduct`` the result of
    ``get_frequence_feature`` is stored under the column name suffixed with
    ``Freq``. The frame is modified in place and returned.
    """
    for source in ('FinelineNumber', 'UpcManufacturer', 'UpcProduct'):
        data[source + 'Freq'] = get_frequence_feature(data[source])
    return data

# Frequency features are computed on train and test stacked together, then the
# combined frame is cut back into its two parts by position.
# pd.concat replaces DataFrame.append, which newer pandas releases no longer provide.
data = pd.concat([train, test])
data = encode_features(data)

# The right-hand side is evaluated before `train` is rebound, so len(train)
# still refers to the original training frame for both slices.
# `drop` returns a new frame instead of deleting a column from a slice of `data`.
train, test = data.iloc[:len(train)], data.iloc[len(train):].drop('TripType', axis=1)

del data

In [4]:
def at(l):
    """Return the first element, by position, of the pandas object ``l``."""
    first_item = l.iloc[0]
    return first_item
def mklist(l):
    """Collect the items of the iterable ``l`` into a new Python list."""
    return [item for item in l]

# One row per VisitNumber: scalar columns keep their first value (`at`),
# per-item columns are gathered into Python lists (`mklist`).
visit_aggregations = {
    'Weekday': at,
    'ScanCount': mklist,
    'DepartmentDescription': at,
    'FinelineNumberFreq': mklist,
    'UpcPrefix': mklist,
    'UpcManufacturerFreq': mklist,
    'UpcProductFreq': mklist,
}

# Only the training data carries the label; it goes first in the spec.
train_aggregations = {'TripType': at}
train_aggregations.update(visit_aggregations)

train_grouped = train.groupby('VisitNumber').agg(train_aggregations).reset_index()
test_grouped = test.groupby('VisitNumber').agg(visit_aggregations).reset_index()

In [5]:
def _mode_of(values):
    """Return the most frequent item of ``values``; ties resolve to the smallest item."""
    # np.unique returns sorted values, and argmax picks the first maximum,
    # so among equally frequent items the smallest one wins.
    uniques, counts = np.unique(np.asarray(values), return_counts=True)
    return uniques[np.argmax(counts)]


def _scan_weighted_sum(weights, scan_counts):
    """Sum of ``weights`` multiplied element-wise by ``scan_counts`` for one visit."""
    return sum(np.asarray(weights) * np.asarray(scan_counts))


def construct_grouped_features(data):
    """Derive per-row summary features from the list-valued columns of ``data``.

    ``data`` must hold one sequence per row in each of ``ScanCount``,
    ``UpcPrefix``, ``FinelineNumberFreq``, ``UpcManufacturerFreq`` and
    ``UpcProductFreq``. The frame is modified in place and returned.

    Added columns:

    * ``LogNumberOfReturned``: log(1 + total of the negative scan counts, negated).
    * ``NumberOfPurchased``: total of the positive scan counts.
    * ``MaxScanCount`` / ``MinScanCount`` (int8), ``MaxMinScanCount`` (int16,
      max minus min) and ``MeanScanCount``.
    * ``ModeUpcPrefix`` (int8).
    * ``Mode*``, ``Mean*`` and ``Max*`` for each of the three ``*Freq`` columns.
    * ``SumScanCount*FlNFreq`` / ``SumScanCount*UPtFreq`` / ``SumScanCount*UMrFreq``:
      scan counts weighted by the fineline / product / manufacturer frequencies.

    The mode is computed with numpy only, so it does not depend on the return
    shape of ``scipy.stats.mode``.
    """
    def scan_feature(func):
        # Widen to int64 first: if the list items are int8 scalars, an
        # expression such as max - min could otherwise wrap around before
        # the result is cast to a wider dtype.
        return data.ScanCount.map(lambda x: func(np.asarray(x, dtype='int64')))

    data['LogNumberOfReturned'] = scan_feature(lambda s: np.log(-s[s < 0].sum() + 1))
    data['NumberOfPurchased'] = scan_feature(lambda s: s[s > 0].sum())

    data['MaxScanCount'] = scan_feature(np.max).astype('int8')
    data['MinScanCount'] = scan_feature(np.min).astype('int8')
    data['MaxMinScanCount'] = scan_feature(lambda s: s.max() - s.min()).astype('int16')
    data['MeanScanCount'] = scan_feature(np.mean)

    data['ModeUpcPrefix'] = data.UpcPrefix.map(_mode_of).astype('int8')
    # TODO: derive more features from UpcPrefix.

    for column in ('FinelineNumberFreq', 'UpcManufacturerFreq', 'UpcProductFreq'):
        data['Mode' + column] = data[column].map(_mode_of)
        data['Mean' + column] = data[column].map(lambda x: np.mean(x))
        data['Max' + column] = data[column].map(lambda x: np.max(x))

    # Plain zip over the two columns avoids building a Series for every row,
    # which is what DataFrame.apply(axis=1) does.
    weighted_sums = (('SumScanCount*FlNFreq', 'FinelineNumberFreq'),
                     ('SumScanCount*UPtFreq', 'UpcProductFreq'),
                     ('SumScanCount*UMrFreq', 'UpcManufacturerFreq'))
    for name, column in weighted_sums:
        data[name] = [_scan_weighted_sum(weights, scans)
                      for weights, scans in zip(data[column], data.ScanCount)]

    return data

# Derive the per-visit summary features for both data sets.
train_grouped, test_grouped = (construct_grouped_features(train_grouped),
                               construct_grouped_features(test_grouped))

In [None]:
# sns.countplot(x='TripType', hue='Weekday', data=train_grouped)


In [7]:
def construct_dummy_features(data, dummy_columns):
    """Return ``data`` with ``dummy_columns`` one-hot encoded.

    The indicator columns are sparse and no separate indicator is created for
    missing values.
    """
    return pd.get_dummies(data, columns=dummy_columns, dummy_na=False, sparse=True)

In [8]:
# List-valued columns were only inputs to the summary features; drop them.
expanded_features = ['UpcPrefix', 'UpcManufacturerFreq', 'UpcProductFreq', 'FinelineNumberFreq', 'ScanCount']

# Encode train and test together so both end up with the same dummy columns.
# pd.concat replaces DataFrame.append, which newer pandas releases no longer provide.
data = pd.concat([train_grouped, test_grouped])
data = data[[col for col in data.columns if col not in expanded_features]]

data = construct_dummy_features(data, ['Weekday', 'DepartmentDescription', 'ModeUpcPrefix'])

train_dummies = data.iloc[:len(train_grouped)]
# `drop` returns a new frame rather than deleting a column from a slice of `data`.
test_dummies = data.iloc[len(train_grouped):].drop('TripType', axis=1)
del data
print(len(test_dummies.columns), 'features constructed')
print('Memory usage of dataframe train_dummies is %3.2f Mb' % (train_dummies.memory_usage(index=True).sum()/(1024*1024)))
print('Memory usage of dataframe test_dummies is %3.2f Mb' % (test_dummies.memory_usage(index=True).sum()/(1024*1024)))

105 features constructed
Memory usage of dataframe train_dummies is 15.69 Mb
Memory usage of dataframe test_dummies is 14.96 Mb


In [9]:
# del train, test, train_grouped, test_grouped

In [10]:
def get_prior_probabilities(data):
    """Return the relative frequency of each trip type in ``data``.

    The result is a float array aligned with the module-level ``types`` array:
    entry ``i`` is the share of rows whose ``TripType`` equals ``types[i]``
    among all rows matching any of ``types``.
    """
    class_counts = np.array([(data.TripType == trip_type).sum() for trip_type in types],
                            dtype=float)
    return class_counts / class_counts.sum()

In [62]:
def fit_predict(train_array, train_labels, test_array, prior_probabilities=None, algorithm='rf', alpha=None, 
                plot_importance=True, feature_names=None):
    """Fit a classifier on the training data and return class probabilities for ``test_array``.

    Parameters
    ----------
    train_array, train_labels : training features and labels passed to ``clf.fit``.
    test_array : features passed to ``clf.predict_proba``.
    prior_probabilities : optional array with one prior per class. When given,
        the predictions are blended as ``(pred + 0.1 * prior) / 1.1``.
    algorithm : ``'rf'`` (RandomForestClassifier) or ``'xgb'`` (XGBClassifier).
    alpha : accepted for interface compatibility; not used by this function.
    plot_importance : when true and ``feature_names`` is non-empty, print and
        plot the feature importances of the fitted model.
    feature_names : optional list of column names matching ``train_array``.
        For ``'xgb'`` it is also written to the feature-map file ``xgb.fmap``.

    Returns
    -------
    The ``predict_proba`` matrix, optionally blended with the priors.

    Raises
    ------
    ValueError
        If ``algorithm`` is neither ``'rf'`` nor ``'xgb'``.
    """
    if algorithm not in ('rf', 'xgb'):
        # Previously an unknown name fell through to an unbound `clf`.
        raise ValueError("Unknown algorithm %r; expected 'rf' or 'xgb'" % (algorithm,))

    if algorithm == 'xgb':
        # The feature map can only be written when names were supplied;
        # iterating over None used to crash the call.
        if feature_names is not None:
            with open('xgb.fmap', 'w') as outfile:
                for i, feature_name in enumerate(feature_names):
                    outfile.write('{0}\t{1}\tq\n'.format(i, feature_name))

        clf = xgb.XGBClassifier(n_estimators=30, max_depth=8, learning_rate=0.4, objective='multi:softprob', 
                                colsample_bytree=0.45, subsample=0.5, )
        clf.fit(train_array, train_labels)
        prediction_matrix = clf.predict_proba(test_array)
        if plot_importance and feature_names:
            # Use get_booster() where the wrapper offers it, else fall back to booster().
            booster = clf.get_booster() if hasattr(clf, 'get_booster') else clf.booster()
            importances = booster.get_fscore('xgb.fmap')
            # get_fscore returns a mapping from feature name to score; from here
            # on `feature_names` holds only the names present in that mapping.
            feature_names = list(importances.keys())
            importances = np.array(list(importances.values()))
            pl.title('Feature Importance')
            sorted_indices = np.argsort(importances)[::-1]
            for i, k in enumerate(sorted_indices):
                print('%2d (feature %2d):' % (i, k), feature_names[k], 'Value = %.5f' % importances[k])
            # Keyword arguments: positional x/y are not accepted by every seaborn release.
            sns.barplot(x=np.arange(len(feature_names)), y=importances[sorted_indices], color='r')

    else:
        clf = RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_leaf=1, max_features=0.2, n_jobs=-1)
        clf.fit(train_array, train_labels)
        prediction_matrix = clf.predict_proba(test_array)
        if plot_importance and feature_names:
            importances = clf.feature_importances_
            # Spread of the importances across the individual trees, drawn as error bars.
            std = np.std([tree.feature_importances_ for tree in clf.estimators_], axis=0)
            pl.title('Feature Importance')
            sorted_indices = np.argsort(importances)[::-1]
            for i, k in enumerate(sorted_indices):
                print('%2d (feature %2d):' % (i, k), feature_names[k], 'Value = %.5f' % importances[k])

            pl.bar(range(len(feature_names)), importances[sorted_indices], color='r',
                   yerr=std[sorted_indices], align='center')
            pl.xticks(range(len(feature_names)), sorted_indices)
            pl.xlim([-1, len(feature_names) + 1])

    if prior_probabilities is not None:
        coeff = 0.1
        # Broadcasting adds the scaled prior row to every prediction row.
        prediction_matrix = (prediction_matrix + np.asarray(prior_probabilities) * coeff) / (1. + coeff)
    return prediction_matrix

In [12]:
def train_and_score(train_dummies, grid_space=[0], algorithm='rf', plot_importance=True):
    """Score ``algorithm`` on a single stratified 90/10 hold-out split per alpha.

    For every ``alpha`` in ``grid_space`` the rows of ``train_dummies`` are
    split once with ``StratifiedShuffleSplit`` (``test_size=0.1``), a model is
    fitted via ``fit_predict`` on all columns except ``TripType`` and the
    multi-class log loss on the held-out part is printed.

    Parameters
    ----------
    train_dummies : DataFrame with a ``TripType`` label column plus features.
    grid_space : sequence of alpha values; it is only read, never modified.
        NOTE(review): ``alpha`` is forwarded to ``fit_predict``, which does not
        use it, so different alphas currently differ only through the random
        split and model randomness.
    algorithm : forwarded to ``fit_predict``.
    plot_importance : forwarded to ``fit_predict``.

    Returns
    -------
    ``mlogloss`` when ``grid_space`` has exactly one element, otherwise the
    tuple ``(alphas, scores, min_alpha, min_score)``.

    Raises
    ------
    ValueError
        If ``grid_space`` is empty (previously this ended in an
        UnboundLocalError on the return statement).
    """
    if len(grid_space) == 0:
        raise ValueError('grid_space must contain at least one alpha value')

    searching = len(grid_space) > 1
    alphas, scores = [], []
    min_alpha, min_score = 1., 10**5

    y = train_dummies.TripType.values
    # The same feature list selects the columns of both the fit and the
    # held-out part, so the two matrices cannot drift apart.
    feature_names = [col for col in train_dummies.columns if col not in ['TripType']]

    for alpha in grid_space:
        if searching:
            print('alpha =', alpha)
        for train_indices, test_indices in sklearn.cross_validation.StratifiedShuffleSplit(y, n_iter=1, test_size=0.1):
            data_train = train_dummies.iloc[train_indices]
            data_test = train_dummies.iloc[test_indices]

            prediction_matrix = fit_predict(data_train[feature_names].values,
                                            data_train.TripType.values,
                                            data_test[feature_names].values,
                                            algorithm=algorithm, alpha=alpha, plot_importance=plot_importance,
                                            feature_names=feature_names)

            # NOTE(review): probability column 7 is dropped before scoring and the
            # remaining columns are not renormalised. Presumably that class is
            # absent from the held-out labels - confirm the index and the intent.
            prediction_matrix = np.delete(prediction_matrix, 7, axis=1)
            y_true = data_test.TripType.values

            mlogloss = sklearn.metrics.log_loss(y_true, prediction_matrix)

            print('mlogloss =', mlogloss, end='\n\n')

            if searching:
                alphas.append(alpha)
                scores.append(mlogloss)
                # `<=` keeps the later alpha when two scores tie.
                if mlogloss <= min_score:
                    min_score = mlogloss
                    min_alpha = alpha

    if searching:
        return alphas, scores, min_alpha, min_score
    return mlogloss

## MAIN

In [None]:
algorithms = ['rf', 'xgb']
algorithm = 'xgb'

# True: score the model on a hold-out split. False: fit on all training rows
# and write the test-set predictions to a CSV file.
validation = True

if validation:

    # True: sweep a grid of alpha values and plot the scores; False: single run.
    compute_optimal = False
    if compute_optimal:
        grid_space = np.linspace(0.1, 0.9, 10)
        alphas, scores, min_alpha, min_score = train_and_score(train_dummies, grid_space=grid_space,
                                                               algorithm=algorithm, plot_importance=False)
        print('MIN ALPHA =', min_alpha, 'MIN_MLOGLOSS =', min_score)
        pl.plot(alphas, scores, 'r.--')
    else:
        mlogloss = train_and_score(train_dummies, algorithm=algorithm)

else:
    feature_names = [col for col in train_dummies.columns if col not in ['TripType']]
    # feature_names is passed explicitly: the 'xgb' path of fit_predict iterates
    # over it and fails when it is left at None. plot_importance=False keeps this
    # branch free of plots, as before. Selecting the test columns through the
    # same list guarantees that train and test matrices share one column order.
    prediction_matrix = fit_predict(train_dummies[feature_names].values,
                                    train_dummies.TripType.values,
                                    test_dummies[feature_names].values,
                                    algorithm=algorithm, plot_importance=False,
                                    feature_names=feature_names)

    # One probability column per trip type.
    # NOTE(review): assumes the classifier's class order matches `types` - confirm.
    prediction_df = pd.DataFrame(prediction_matrix, columns=['TripType_' + str(i) for i in types])
    prediction_df['VisitNumber'] = test_dummies.VisitNumber.values

    # Move VisitNumber to the front of the output.
    prediction_df = prediction_df[['VisitNumber'] + [x for x in prediction_df.columns if x != 'VisitNumber']]
    del prediction_matrix

    filename = 'prediction.csv'
    prediction_df.to_csv(filename, index=False)
    print(filename, 'was created')
    del prediction_df
    