In [2]:
import mmh3
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.cross_validation import train_test_split



In [3]:
# Load the raw event log: a gzip-compressed, tab-separated file whose fields
# may be wrapped in double quotes.
# NOTE(review): `error_bad_lines` belongs to the older pandas API; newer
# releases spell this `on_bad_lines='skip'` — confirm the pinned pandas
# version before upgrading.
df_raw = pd.read_table('000.gz',
                       compression='gzip',
                       sep='\t',
                       quotechar='"',
                       error_bad_lines=False) # skip (instead of failing on) lines with too many fields

Skipping line 5749513: expected 26 fields, saw 27

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Show which columns the raw log provides.
df_raw.columns

Index([u'advertiser_app_store_id', u'country_code', u'city', u'campaign_id',
       u'creative_id', u'device_language', u'device_make', u'device_model',
       u'device_platform', u'device_connection', u'device_os_version',
       u'device_screen_height', u'device_screen_width', u'device_volume',
       u'event_id', u'n_campaign_views', u'n_vungle_installs',
       u'publisher_app_store_id', u'time_of_last_delivery_this_campaign',
       u'time_of_last_delivery_this_creative',
       u'time_of_last_delivery_any_installed_app',
       u'time_of_last_vungle_delivery', u'time_of_this_impression',
       u'timestamp_at_install', u'time_of_this_request', u'time_zone'],
      dtype='object')

In [5]:
# Build the modelling frame: the three categorical id columns plus a binary
# label that is 1 when `timestamp_at_install` is present and 0 otherwise.
# The timestamp itself is not carried over, and the index is renumbered from 0.
train_df = (
    df_raw[['advertiser_app_store_id', 'publisher_app_store_id', 'country_code']]
    .assign(is_install=df_raw['timestamp_at_install'].notnull() * 1)
    .reset_index(drop=True)
)

In [6]:
# Preview the first rows of the modelling frame.
train_df.head()

Unnamed: 0,advertiser_app_store_id,publisher_app_store_id,country_code,is_install
0,585215f054dbb89701000ba2,556f7506790e4e9f32000181,US,0
1,806077016,56a1169c2ef5591f35000023,CO,0
2,5885d6eb98d991385100051c,576c95f871edf0ea54000010,KR,0
3,com.sgn.pandapop.gp,583c56b7c3e0af130a000085,US,0
4,58772d7e2ca160032d000399,com.fgol.HungrySharkEvolution,PH,0


In [7]:
# Hashing function
def _murmur_32s(key, seed):
    if isinstance(key, unicode):
        bkey = key.encode('utf-8')
    elif isinstance(key, bytes):
        bkey = key
    else:
        raise ValueError("the key must be either unicode or str")
    return mmh3.hash(bkey, seed)

In [8]:
# Hash features of DataFrame X using the hashing function
def _transform(X, n_bits, categorical_features,
              continuous_features, interaction_features,
              store_fmap=False):
    n_samples = X.shape[0] \
        if isinstance(X, pd.DataFrame) \
        else len(X.values()[0])
    hash_mask = 2 ** n_bits - 1
    n_features = \
        len(categorical_features) + \
        len(continuous_features) + \
        len(interaction_features)
    n_hashed_features = n_samples * n_features
    # assert n_hashed_features > 0
    rows = np.empty(n_hashed_features, dtype=np.int32)
    cols = np.empty(n_hashed_features, dtype=np.int32)
    vals = np.zeros(n_hashed_features)
    hashed_feature_idx = 0
    f_map = {}

    for f in categorical_features:
        Xf = X[f]
        hash_seed = _murmur_32s(f, 0)
        for sample_idx in range(n_samples):
            hash_value = _murmur_32s(Xf[sample_idx], hash_seed)  # what does this part do?
            print hash_value
            hash_sign = (hash_value >= 0) * 2 - 1

            if store_fmap:
                f_combined = ((f,), Xf[sample_idx])
                if f_combined not in f_map:
                    f_map[f_combined] = hash_value & hash_mask

            rows[hashed_feature_idx] = sample_idx
            cols[hashed_feature_idx] = hash_value & hash_mask
            vals[hashed_feature_idx] += hash_sign
            hashed_feature_idx += 1

    for f in continuous_features:
        Xf = X[f]
        hash_value = _murmur_32s(f, 0)
        hash_sign = (hash_value >= 0) * 2 - 1
        if store_fmap:
            f_combined = ((f,),)
            f_map[f_combined] = hash_value & hash_mask
        for sample_idx in range(n_samples):
            rows[hashed_feature_idx] = sample_idx
            cols[hashed_feature_idx] = hash_value & hash_mask
            vals[hashed_feature_idx] += hash_sign * Xf[sample_idx]
            hashed_feature_idx += 1

    for feature_names in interaction_features:
        hash_seed = 0
        for f in feature_names:
            hash_seed = _murmur_32s(f, hash_seed)

        for sample_idx in range(n_samples):
            hash_value = hash_seed
            interaction_value = 1

            value_cache = ()
            for f in feature_names:
                if f in continuous_features:
                    interaction_value *= X[f][sample_idx]
                    value_cache += (f,)
                else:
                    value_cache += (X[f][sample_idx],)
                    hash_value = _murmur_32s(
                        X[f][sample_idx], hash_value
                    )

            if store_fmap:
                f_combined = (feature_names, value_cache)
                if f_combined not in f_map:
                    f_map[f_combined] = hash_value & hash_mask

            hash_sign = (hash_value >= 0) * 2 - 1
            rows[hashed_feature_idx] = sample_idx
            cols[hashed_feature_idx] = hash_value & hash_mask
            vals[hashed_feature_idx] += hash_sign * interaction_value
            hashed_feature_idx += 1

    n_dim_hashed_features = hash_mask + 1

    # reverse k and v, if v is duplicated, append k to v
    f_map_rev = {}
    for k, v in f_map.items():
        if v not in f_map_rev:
            f_map_rev[v] = [k]
        else:
            f_map_rev[v] = f_map_rev[v] + [k]

    return sparse.coo_matrix(
        (vals, (rows, cols)),
        (n_samples, n_dim_hashed_features)
    ).tocsr(), f_map_rev

In [9]:
# Wrapper class for hashing function
class FeatureHasher(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn style wrapper around ``_transform``.

    Constructor arguments are stored unchanged under their own names so that
    ``BaseEstimator.get_params`` / ``set_params`` / ``clone`` and ``repr``
    can find them. The ``*_`` attributes the class exposed before remain
    available as read-only views derived from those parameters.

    Parameters
    ----------
    n_bits : int
        Number of hash bits, in ``[1, 31]``.
    categorical_features, continuous_features : iterable of str or None
        Column names handled as categorical / continuous features.
    interaction_features : iterable of tuple of str or None
        Column-name tuples hashed jointly.
    store_fmap : bool
        Whether ``transform`` should also build the column -> feature map.

    Raises
    ------
    ValueError
        If ``n_bits`` is outside ``[1, 31]`` or no feature is specified.
    """

    def __init__(self,
                 n_bits=22,
                 categorical_features=None,
                 continuous_features=None,
                 interaction_features=None,
                 store_fmap=False):
        if n_bits < 1 or n_bits > 31:
            raise ValueError("number of bits must be in interval [1, 31]")

        self.n_bits = n_bits
        self.categorical_features = categorical_features
        self.continuous_features = continuous_features
        self.interaction_features = interaction_features
        self.store_fmap = store_fmap

        n_features = len(self.categorical_features_) + \
                     len(self.continuous_features_) + \
                     len(self.interaction_features_)
        if n_features == 0:
            raise ValueError("at least one features needs to be specified")

    # Derived on access so they always reflect the current parameters,
    # including values changed later through set_params().
    @property
    def n_bits_(self):
        return self.n_bits

    @property
    def categorical_features_(self):
        return set(self.categorical_features or [])

    @property
    def continuous_features_(self):
        return set(self.continuous_features or [])

    @property
    def interaction_features_(self):
        return set(self.interaction_features or [])

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Hash ``X``; returns the ``(matrix, f_map_rev)`` pair from ``_transform``."""
        return _transform(
            X, self.n_bits_, self.categorical_features_,
            self.continuous_features_, self.interaction_features_,
            self.store_fmap
        )

In [10]:
def convert_to_unknown(df, columns):
    """Replace missing values in ``columns`` with the string ``"UNK"``.

    Parameters
    ----------
    df : pandas.DataFrame or dict of column name -> numpy array
        Modified in place. Any other type is returned untouched.
    columns : iterable of str
        Columns to fill.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    if isinstance(df, pd.DataFrame):
        for col in columns:
            # Assign the filled column back: an in-place fillna on the
            # temporary `df[col]` does not reach `df` under Copy-on-Write.
            df[col] = df[col].fillna("UNK")
    elif isinstance(df, dict):
        for col in columns:
            df[col][pd.isnull(df[col])] = "UNK"
    return df
          
    
def fillna0(df, columns):
    """Replace missing values in ``columns`` with ``0``.

    Parameters
    ----------
    df : pandas.DataFrame or dict of column name -> numpy array
        Modified in place. Any other type is returned untouched.
    columns : iterable of str
        Columns to fill.

    Returns
    -------
    The same ``df`` object that was passed in.
    """
    if isinstance(df, pd.DataFrame):
        for col in columns:
            # Assign the filled column back: an in-place fillna on the
            # temporary `df[col]` does not reach `df` under Copy-on-Write.
            df[col] = df[col].fillna(0)
    elif isinstance(df, dict):
        for col in columns:
            df[col][pd.isnull(df[col])] = 0
    return df


def set_column_types(df, column_types_dict):
    """Cast columns of ``df`` to the requested dtypes and return ``df``.

    ``column_types_dict`` maps column name -> dtype; each listed column is
    replaced on ``df`` by its ``astype`` result. A ``TypeError`` is raised
    when ``column_types_dict`` is not a dict.
    """
    if not isinstance(column_types_dict, dict):
        raise TypeError()

    for column, dtype in column_types_dict.items():
        df[column] = df[column].astype(dtype)
    return df

def preprocessing_data(df):
    """Clean the modelling frame and return it.

    Missing values in the three categorical id columns become ``"UNK"``;
    missing ``is_install`` values become ``0`` and the column is then cast
    to ``int``. The helpers used here modify ``df`` itself.
    """
    label_types = {'is_install': int}
    categorical_columns = ['advertiser_app_store_id',
                           'publisher_app_store_id',
                           'country_code']

    df = convert_to_unknown(df, categorical_columns)
    # Fill before casting: converting a column that still holds NaN to int
    # raises, so a fill placed after the cast could never take effect.
    df = fillna0(df, list(label_types))
    df = set_column_types(df, label_types)

    return df
  
class FeatureCreator(object):
    """Augment DataFrame-like input with new features.

    At present no features are added: ``transform`` only converts the input
    into a dict of column name -> array.
    """

    def transform(self, X, inplace=False):
        """Return ``X`` as a dict of column name -> array.

        Parameters
        ----------
        X : pandas.DataFrame or dict of column name -> array
        inplace : bool
            When false (the default) every array is copied, so the result
            can be modified without touching the input. When true, a dict
            input is returned as is and a DataFrame's ``.values`` arrays
            are used directly.
        """
        # TODO probably it's a good idea to restrict what fields from the
        #      DataFrame are used to avoid copying the whole thing
        if isinstance(X, pd.DataFrame):
            # Iterating the column labels avoids `iteritems`, which is
            # missing from Python 3 dicts and from recent pandas releases.
            X = {col: X[col].values for col in X.columns}
        if not inplace:
            X = {k: np.copy(v) for k, v in X.items()}
        return X

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` unchanged."""
        return self

In [11]:
def logistic_model(X_train, y_train):
    """Grid-search the logistic-regression parameter ``C`` and return the search.

    ``C`` is the inverse of the regularization strength and must be a
    positive float (see ``LogisticRegression`` in scikit-learn). Six
    candidate values are compared with 5-fold cross-validation using the
    ``'log_loss'`` scorer; the fitted ``GridSearchCV`` object is returned.

    NOTE(review): the scorer name ``'log_loss'`` matches the old
    ``sklearn.grid_search`` API imported by this notebook — confirm it
    against the installed scikit-learn version before upgrading.
    """
    print('Train Regression Model')

    c_grid = {'C': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5]}
    search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=c_grid,
        scoring='log_loss',
        cv=5
    )
    search.fit(X_train, y_train)
    return search

In [None]:
# Columns hashed individually as categorical features.
cat_features_pc = [
        'advertiser_app_store_id',
        'publisher_app_store_id',
        'country_code',
    ]

# No continuous features are defined for this model.
cont_features_pc = []

# Column combinations hashed jointly as interaction features:
# the full triple plus every pair of the three categorical columns.
cat_interactions_pc = [
     ('advertiser_app_store_id', 'publisher_app_store_id', 'country_code'),
     ('advertiser_app_store_id', 'publisher_app_store_id'),
     ('advertiser_app_store_id', 'country_code'),
     ('publisher_app_store_id', 'country_code')
]

In [None]:
# Clean the modelling frame. The helpers modify `train_df` itself and return
# it, so `X_clf` is the same object.
X_clf = preprocessing_data(train_df)

# Hold out 20% of the rows as the test set (fixed seed for repeatability).
train, test = train_test_split(X_clf, test_size=0.2, random_state=27)

# Detach the label from each split; what remains is the feature frame.
y_train = train.pop('is_install')
X_train = train
y_test = test.pop('is_install')
X_test = test

# Feature Hashing
print('Feature Creating/Hashing Train')
feature_creator = FeatureCreator()
design_matrix_transformer = FeatureHasher(
    n_bits=18,  # You can experiment with hasher bits (we used 18 here)
    categorical_features=cat_features_pc,
    continuous_features=None,
    interaction_features=cat_interactions_pc,
    store_fmap=True
)

# The hasher is stateless, so each split is hashed independently. `f_map`
# ends up holding the map built from the training split (assigned last).
X_test = feature_creator.transform(X_test, inplace=True)
X_test, f_map = design_matrix_transformer.fit_transform(X_test)

X_train = feature_creator.transform(X_train, inplace=True)
X_train, f_map = design_matrix_transformer.fit_transform(X_train)

Feature Creating/Hashing Train
2028645017
-945497935
-1220518715
-1136183763
1468332803
-189799199
-1478044106
-1480841591
868247699
-1713266108
-1437344270
-312642478
1873518222
2106800981
-570458961
-1838963343
338968083
-1838963343
-1753902533
511514109
-1281742136
-215998163
730787998
868247699
1399210790
-2132517483
774062694
1516778650
107270415
1428005501
-1002527555
-2017226579
-1040996731
1677655009
-1634440179
-790028076
1859124250
179846835
810770184
1121142818
-1634440179
1265207244
-586427618
-442772646
-800668659
1387454744
-1680841016
216049994
-1765942307
-2098891667
1777685532
-570458961
-1340221634
-1634440179
-1041406742
37203278
-480512099
-1288519915
673477442
1314240173
-2041421735
-955777155
-1319963736
-772342162
-964178344
-1156441619
797488448
426159905
1314240173
-738496288
-1073377369
52861051
509239597
765594044
-1348406476
1369101266
-573853669
1314240173
398201248
99567163
-197377315
-581948027
698668804
418682460
-937686526
810770184
-1304593020
-4745528

In [None]:
# Fit the cross-validated logistic-regression baseline.
logistic_baseline = logistic_model(X_train, y_train)

# Calculate prediction/probability of train and test
X_train_predictions = logistic_baseline.predict(X_train)
X_train_predprob = logistic_baseline.predict_proba(X_train)[:, 1]

X_test_predictions = logistic_baseline.predict(X_test)
X_test_predprob = logistic_baseline.predict_proba(X_test)[:, 1]

# Calculate metrics of train, validation and test set.
# The search score is negated here so that the validation figure is reported
# as a positive log loss, like the train/test values below.
lr_ll_val = -logistic_baseline.best_score_

lr_ll_train = log_loss(y_train, X_train_predprob)
lr_auc_train = roc_auc_score(y_train, X_train_predprob)

lr_ll_test = log_loss(y_test, X_test_predprob)
lr_auc_test = roc_auc_score(y_test, X_test_predprob)

# Print out the results
# Single pre-formatted strings inside print(...) produce the same text on
# Python 2 and Python 3, matching the call form used elsewhere in the notebook.
print("Best parameter:  %s" % (logistic_baseline.best_params_,))

print("Log Loss (Validation): %f" % lr_ll_val)

print("Log Loss (Train): %f" % lr_ll_train)
print("AUC (Train): %f" % lr_auc_train)

print('Log Loss (Test): %f' % lr_ll_test)
print('AUC (Test): %f' % lr_auc_test)