In [60]:
import pandas as pd
import numpy as np
from pyspark.sql import Row
from pyspark.sql.types import *
import pyspark.sql.functions as F
from datetime import datetime, date
from collections import defaultdict

### Skip this cell if you already have a chunk of the transactions data.

In [61]:
file_name = '../data/transactions.csv'
chunksize = 10 ** 5
chunk_iter = pd.read_csv(file_name, chunksize=chunksize)
chunk1 = chunk_iter.next()
chunk1.to_csv('../data/transactions_chunk1.csv', index=False)

In [62]:
lines = sc.textFile('../data/transactions_chunk1.csv')

In [63]:
header = lines.first()
lines = lines.filter(lambda row: row != header)

In [64]:
# Functions for parsing the strings from the CSV file

def toIntSafe(inval):
  try:
    return int(inval)
  except ValueError:
    return None

def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, "%Y-%m-%d")
  except ValueError:
    return None

def toLongSafe(inval):
  try:
    return long(inval)
  except ValueError:
    return None

def toFloatSafe(inval):
  try:
    return float(inval)
  except ValueError:
    return None
    
def stringToPost(row):
  r = row.encode('utf8').split(',')
  return Row(
    int(r[0]),         # Don't want this column to be nullable
    toLongSafe(r[1]),
    toLongSafe(r[2]),
    toLongSafe(r[3]),
    toLongSafe(r[4]),
    toLongSafe(r[5]),
    toTimeSafe(r[6]),
    toFloatSafe(r[7]),
    r[8],
    toLongSafe(r[9]),
    toFloatSafe(r[10]))  

In [65]:
transactions_schema = StructType([
    StructField("id", LongType(), False),
    StructField("chain", LongType(), True),
    StructField("dept", LongType(), True),
    StructField("category", LongType(), True),
    StructField("company", LongType(), True),
    StructField("brand", LongType(), True),
    StructField("date", DateType(), True),
    StructField("productsize", DoubleType(), True),
    StructField("purchasemeasure", StringType(), True),
    StructField("purchasequantity", LongType(), True),
    StructField("purchaseamount", DoubleType(), True)
    ])

In [66]:
rowRDD = lines.map(lambda p: stringToPost(p))

In [67]:
transactions = sqlContext.createDataFrame(rowRDD, transactions_schema)

In [68]:
transactions.write.saveAsTable('transactions')

AnalysisException: u'Table `transactions` already exists.;'

In [69]:
transactions.printSchema()

root
 |-- id: long (nullable = false)
 |-- chain: long (nullable = true)
 |-- dept: long (nullable = true)
 |-- category: long (nullable = true)
 |-- company: long (nullable = true)
 |-- brand: long (nullable = true)
 |-- date: date (nullable = true)
 |-- productsize: double (nullable = true)
 |-- purchasemeasure: string (nullable = true)
 |-- purchasequantity: long (nullable = true)
 |-- purchaseamount: double (nullable = true)



In [70]:
transactions.show()

+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|   id|chain|dept|category|   company|brand|      date|productsize|purchasemeasure|purchasequantity|purchaseamount|
+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|86246|  205|   7|     707|1078778070|12564|2012-03-02|       12.0|             OZ|               1|          7.59|
|86246|  205|  63|    6319| 107654575|17876|2012-03-02|       64.0|             OZ|               1|          1.59|
|86246|  205|  97|    9753|1022027929|    0|2012-03-02|        1.0|             CT|               1|          5.99|
|86246|  205|  25|    2509| 107996777|31373|2012-03-02|       16.0|             OZ|               1|          1.99|
|86246|  205|  55|    5555| 107684070|32094|2012-03-02|       16.0|             OZ|               2|         10.38|
|86246|  205|  97|    9753|1021015020|    0|2012-03-02|        1.0|     

Get number of refund transactions

In [71]:
import pyspark.sql.functions as F

returns = transactions.select('id',
                              F.when(transactions['purchaseamount'] < 0, 1).otherwise(0).alias('return'))\
                      .withColumn('1', F.lit(1))
returns = returns.groupBy('id').agg(F.sum('1').alias('total_trans'), F.sum("return").alias('total_returns'))
returns.show()

+--------+-----------+-------------+
|      id|total_trans|total_returns|
+--------+-----------+-------------+
|18470775|        350|            8|
|14723452|        755|            5|
|15738658|         39|            0|
|17552659|        591|            4|
|12996040|        326|            5|
|16078766|        966|           54|
|18249735|       1557|           60|
|14989775|        614|           39|
|15073302|        526|           38|
|16075389|        591|           32|
|16606739|        678|            6|
|15705695|        431|            8|
|14576147|        817|           66|
|15134033|        944|           13|
|16551772|       1699|           47|
|17652157|       1407|           15|
|13089312|       1218|           52|
|13744500|       2232|          112|
|16829614|        738|           10|
|17524817|        328|            7|
+--------+-----------+-------------+
only showing top 20 rows



# Training data and testing data

In [72]:
train = pd.read_csv('../data/trainHistory.csv')
offers = pd.read_csv('../data/offers.csv')

In [73]:
offers_dict = {}
for e, line in enumerate(open('../data/offers.csv')):
    row = line.strip().split(",")
    offers_dict[row[0]] = row

#keep two dictionaries with the shopper id's from test and train
train_ids = {}
test_ids = {}
for e, line in enumerate(open('../data/trainHistory.csv')):
    if e > 0:
        row = line.strip().split(",")
        train_ids[row[0]] = row
for e, line in enumerate(open('../data/testHistory.csv')):
    if e > 0:
        row = line.strip().split(",")
        test_ids[row[0]] = row


In [74]:
offers.head()

Unnamed: 0,offer,category,quantity,company,offervalue,brand
0,1190530,9115,1,108500080,5.0,93904
1,1194044,9909,1,107127979,1.0,6732
2,1197502,3203,1,106414464,0.75,13474
3,1198271,5558,1,107120272,1.5,5072
4,1198272,5558,1,107120272,1.5,5072


In [75]:
# Add department to offers
dept = []
for i in range(offers.shape[0]):
    str_category = str(offers['category'].iloc[i])
    if len(str_category) == 4:
        dept.append(str_category[:2])
    else:
        dept.append(str_category[:1])
        
offers['dept'] = dept
offers['dept'] = pd.to_numeric(offers['dept'])

In [76]:
train = pd.merge(train, offers, how='left', on=['offer'])

In [77]:
# Rename columns
train.columns = ['id', 'chain', 'offer', 'market', 'repeattrips', 'repeater', 'offerdate', 
                 'offer_category', 'quantity', 'offer_company', 'offervalue', 'offer_brand',
                 'offer_dept']

In [78]:
# Reencode the target variable
train['repeater'] = np.where(train['repeater'] == 't', '1', '0')
train['repeater'] = pd.to_numeric(train['repeater'])

In [79]:
train['offerdate'] = train['offerdate'].astype(str)

In [80]:
test = pd.read_csv('../data/testHistory.csv')
test = pd.merge(test, offers, how='left', on=['offer'])

In [81]:
# Rename columns
test.columns = ['id', 'chain', 'offer', 'market', 'offerdate', 'offer_category', 'quantity', 
                'offer_company', 'offervalue', 'offer_brand', 'offer_dept']

In [82]:
test['offerdate'] = test['offerdate'].astype(str)

In [83]:
train_schema = StructType([
    StructField("id", LongType(), False),
    StructField("offer_chain", LongType(), True),
    StructField("offer", LongType(), True),
    StructField("market", LongType(), True),
    StructField("repeattrips", LongType(), True),
    StructField("repeater", IntegerType(), True),
    StructField("offerdate", StringType(), True),
    StructField("offer_category", LongType(), True),
    StructField("quantity", LongType(), True),
    StructField("offer_company", LongType(), True),
    StructField("offervalue", DoubleType(), True),
    StructField("offer_brand", LongType(), True),
    StructField("offer_dept", LongType(), True)
    ])

In [84]:
train_df = sqlContext.createDataFrame(train, train_schema)

In [85]:
test_schema = StructType([
    StructField("id", LongType(), False),
    StructField("offer_chain", LongType(), True),
    StructField("offer", LongType(), True),
    StructField("market", LongType(), True),
    StructField("offerdate", StringType(), True),
    StructField("offer_category", LongType(), True),
    StructField("quantity", LongType(), True),
    StructField("offer_company", LongType(), True),
    StructField("offervalue", DoubleType(), True),
    StructField("offer_brand", LongType(), True),
    StructField("offer_dept", LongType(), True)
    ])

In [86]:
test_df = sqlContext.createDataFrame(test, test_schema)

In [87]:
# This function converts the string cell into a date:
stringToDate = F.udf(lambda x: datetime.strptime(x, '%Y-%m-%d'), DateType())

In [88]:
# Convert offerdate to date type
train_df = train_df.withColumn('offerdate', stringToDate(F.col('offerdate')))
test_df = test_df.withColumn('offerdate', stringToDate(F.col('offerdate')))

In [89]:
offer_df = train_df.select('offer').union(test_df.select('offer'))

In [90]:
offer_df.take(5)

[Row(offer=1208251),
 Row(offer=1197502),
 Row(offer=1197502),
 Row(offer=1197502),
 Row(offer=1204821)]

In [91]:
def days_diff(d1,d2, date_format = "%Y-%m-%d"):
    return (datetime.strptime(d2, date_format) - datetime.strptime(d1, date_format)).days

def dow(d1, date_format = "%Y-%m-%d"):
    return datetime.strptime(d1, date_format).weekday()

def dom(d1, date_format = "%Y-%m-%d"):
    return datetime.strptime(d1, date_format).day

def month(d1, date_format = "%Y-%m-%d"):
    return datetime.strptime(d1, date_format).month

In [92]:
# print month("2013-04-24")

In [93]:
train_df.take(5)

[Row(id=86246, offer_chain=205, offer=1208251, market=34, repeattrips=5, repeater=1, offerdate=datetime.date(2013, 4, 24), offer_category=2202, quantity=1, offer_company=104460040, offervalue=2.0, offer_brand=3718, offer_dept=22),
 Row(id=86252, offer_chain=205, offer=1197502, market=34, repeattrips=16, repeater=1, offerdate=datetime.date(2013, 3, 27), offer_category=3203, quantity=1, offer_company=106414464, offervalue=0.75, offer_brand=13474, offer_dept=32),
 Row(id=12682470, offer_chain=18, offer=1197502, market=11, repeattrips=0, repeater=0, offerdate=datetime.date(2013, 3, 28), offer_category=3203, quantity=1, offer_company=106414464, offervalue=0.75, offer_brand=13474, offer_dept=32),
 Row(id=12996040, offer_chain=15, offer=1197502, market=9, repeattrips=0, repeater=0, offerdate=datetime.date(2013, 3, 25), offer_category=3203, quantity=1, offer_company=106414464, offervalue=0.75, offer_brand=13474, offer_dept=32),
 Row(id=13089312, offer_chain=15, offer=1204821, market=9, repeatt

In [94]:
##### THE FOLLOWING CODE FOR BUILDING THE FEATURES IS DERIVED FROM THIS KAGGLE FORUM POST: https://www.kaggle.com/c/acquire-valued-shoppers-challenge/forums/t/7688/feature-engineering-and-beat-the-benchmark-0-59347

loc_offers = "../data/offers.csv"
loc_transactions = "../data/transactions.csv"
loc_train = "../data/trainHistory.csv"
loc_test = "../data/testHistory.csv"

# will be created
loc_reduced = "../data/reduced.csv" 
loc_out_train = "../data/train.csv"
loc_out_test = "../data/test.csv"

In [95]:
def reduce_data(loc_offers, loc_transactions, loc_reduced):
  start = datetime.now()
  #get all categories and comps on offer in a dict
  offers_cat = {}
  offers_co = {}
  for e, line in enumerate( open(loc_offers) ):
    offers_cat[ line.split(",")[1] ] = 1
    offers_co[ line.split(",")[3] ] = 1
  #open output file
  with open(loc_reduced, "wb") as outfile:
    #go through transactions file and reduce
    reduced = 0
    for e, line in enumerate( open(loc_transactions) ):
      if e == 0:
        outfile.write( line ) #print header
      else:
        #only write when if category in offers dict
        if line.split(",")[3] in offers_cat or line.split(",")[4] in offers_co:
          outfile.write( line )
          reduced += 1
      #progress
      if e % 5000000 == 0:
        print e, reduced, datetime.now() - start
  print e, reduced, datetime.now() - start

In [96]:
#reduce_data(loc_offers, loc_transactions, loc_reduced)

In [97]:
def generate_features(loc_train, loc_test, loc_transactions, loc_out_train, loc_out_test):
    with open(loc_out_train, "wb") as out_train, open(loc_out_test, "wb") as out_test:
        #iterate through reduced dataset 
        last_id = 0
        features = defaultdict(float)
        for e, line in enumerate(open(loc_transactions)):
            if e > 0: #skip header
                #poor man's csv reader
                row = line.strip().split(",")
                #write away the features when we get to a new shopper id
                if last_id != row[0] and e != 1:

                    #generate negative features
                    if "has_bought_company" not in features:
                        features['never_bought_company'] = 1

                    if "has_bought_category" not in features:
                        features['never_bought_category'] = 1

                    if "has_bought_brand" not in features:
                        features['never_bought_brand'] = 1

                    if "has_bought_brand" in features and "has_bought_category" in features and "has_bought_company" in features:
                        features['has_bought_brand_company_category'] = 1

                    if "has_bought_brand" in features and "has_bought_category" in features:
                        features['has_bought_brand_category'] = 1

                    if "has_bought_brand" in features and "has_bought_company" in features:
                        features['has_bought_brand_company'] = 1

                    test = False
                    for k, v in features.items():

                        if k == "label" and v == 0.5:
                            #test
                            test = True
                    
                    if test:
                        features["id"] = last_id
                        out_test.write(str(features))
                    else:
                        features["id"] = last_id
                        out_train.write(str(features))
                    #print "Writing features or storing them in an array"
                    #reset features
                    features = defaultdict(float)
                #generate features from transaction record
                #check if we have a test sample or train sample
                if row[0] in train_ids or row[0] in test_ids:
                    #generate label and history
                    if row[0] in train_ids:
                        history = train_ids[row[0]]
                        if train_ids[row[0]][5] == "t":
                            features['label'] = 1
                        else:
                            features['label'] = 0
                    else:
                        history = test_ids[row[0]]
                        features['label'] = 0.5

                    #print "label", label 
                    #print "trainhistory", train_ids[row[0]]
                    #print "transaction", row
                    #print "offers", offers_dict[ train_ids[row[0]][2] ]
                    #print

                    features['offer_value'] = offers_dict[ history[2] ][4]
                    features['offer_quantity'] = offers_dict[ history[2] ][2]
                    offervalue = offers_dict[ history[2] ][4]

                    features['total_spend'] += float( row[10] )
                    
                    features['day_of_week'] = dow(history[-1])
                    features['day_of_month'] = dom(history[-1])
                    features['month'] = month(history[-1])

                    if offers_dict[ history[2] ][3] == row[4]:
                        features['has_bought_company'] += 1.0
                        features['has_bought_company_q'] += float( row[9] )
                        features['has_bought_company_a'] += float( row[10] )
                        days_diff
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_company_30'] += 1.0
                            features['has_bought_company_q_30'] += float( row[9] )
                            features['has_bought_company_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_company_60'] += 1.0
                            features['has_bought_company_q_60'] += float( row[9] )
                            features['has_bought_company_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_company_90'] += 1.0
                            features['has_bought_company_q_90'] += float( row[9] )
                            features['has_bought_company_a_90'] += float( row[10] )
                        if date_diff_days < 180:
                            features['has_bought_company_180'] += 1.0
                            features['has_bought_company_q_180'] += float( row[9] )
                            features['has_bought_company_a_180'] += float( row[10] )

                    if offers_dict[ history[2] ][1] == row[3]:

                        features['has_bought_category'] += 1.0
                        features['has_bought_category_q'] += float( row[9] )
                        features['has_bought_category_a'] += float( row[10] )
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_category_30'] += 1.0
                            features['has_bought_category_q_30'] += float( row[9] )
                            features['has_bought_category_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_category_60'] += 1.0
                            features['has_bought_category_q_60'] += float( row[9] )
                            features['has_bought_category_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_category_90'] += 1.0
                            features['has_bought_category_q_90'] += float( row[9] )
                            features['has_bought_category_a_90'] += float( row[10] )						
                        if date_diff_days < 180:
                            features['has_bought_category_180'] += 1.0
                            features['has_bought_category_q_180'] += float( row[9] )
                            features['has_bought_category_a_180'] += float( row[10] )				
                    if offers_dict[ history[2] ][5] == row[5]:
                        features['has_bought_brand'] += 1.0
                        features['has_bought_brand_q'] += float( row[9] )
                        features['has_bought_brand_a'] += float( row[10] )
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_brand_30'] += 1.0
                            features['has_bought_brand_q_30'] += float( row[9] )
                            features['has_bought_brand_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_brand_60'] += 1.0
                            features['has_bought_brand_q_60'] += float( row[9] )
                            features['has_bought_brand_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_brand_90'] += 1.0
                            features['has_bought_brand_q_90'] += float( row[9] )
                            features['has_bought_brand_a_90'] += float( row[10] )						
                        if date_diff_days < 180:
                            features['has_bought_brand_180'] += 1.0
                            features['has_bought_brand_q_180'] += float( row[9] )
                            features['has_bought_brand_a_180'] += float( row[10] )	
                last_id = row[0]
                if e % 100000 == 0:
                    print e


In [98]:
train_df = pd.DataFrame(columns=('id', 'label', 'offer_value', 'offer_quantity', 'total_spend', 'day_of_week', 'day_of_month', 'month', 'has_bought_company', 'has_bought_company_q', 'has_bought_company_a', 'has_bought_company_30', 'has_bought_company_q_30', 'has_bought_company_a_30', 'has_bought_company_60', 'has_bought_company_q_60', 'has_bought_company_a_60', 'has_bought_company_90', 'has_bought_company_q_90', 'has_bought_company_a_90', 'has_bought_company_180', 'has_bought_company_q_180', 'has_bought_company_a_180', 'has_bought_category', 'has_bought_category_q', 'has_bought_category_a', 'has_bought_category_30', 'has_bought_category_q_30', 'has_bought_category_a_30', 'has_bought_category_60', 'has_bought_category_q_60', 'has_bought_category_a_60', 'has_bought_category_90', 'has_bought_category_q_90', 'has_bought_category_a_90', 'has_bought_category_180', 'has_bought_category_q_180', 'has_bought_category_a_180', 'has_bought_brand', 'has_bought_brand_q', 'has_bought_brand_a', 'has_bought_brand_30', 'has_bought_brand_q_30', 'has_bought_brand_a_30', 'has_bought_brand_60', 'has_bought_brand_q_60', 'has_bought_brand_a_60', 'has_bought_brand_90', 'has_bought_brand_q_90', 'has_bought_brand_a_90', 
'has_bought_brand_180', 'has_bought_brand_q_180', 'has_bought_brand_a_180', 'never_bought_company', 'never_bought_category', 'never_bought_brand', 'has_bought_brand_company_category', 'has_bought_brand_category', 'has_bought_brand_company'))

In [99]:
test_df = pd.DataFrame(columns=('id', 'label', 'offer_value', 'offer_quantity', 'total_spend', 'day_of_week', 'day_of_month', 'month', 'has_bought_company', 'has_bought_company_q', 'has_bought_company_a', 'has_bought_company_30', 'has_bought_company_q_30', 'has_bought_company_a_30', 'has_bought_company_60', 'has_bought_company_q_60', 'has_bought_company_a_60', 'has_bought_company_90', 'has_bought_company_q_90', 'has_bought_company_a_90', 'has_bought_company_180', 'has_bought_company_q_180', 'has_bought_company_a_180', 'has_bought_category', 'has_bought_category_q', 'has_bought_category_a', 'has_bought_category_30', 'has_bought_category_q_30', 'has_bought_category_a_30', 'has_bought_category_60', 'has_bought_category_q_60', 'has_bought_category_a_60', 'has_bought_category_90', 'has_bought_category_q_90', 'has_bought_category_a_90', 'has_bought_category_180', 'has_bought_category_q_180', 'has_bought_category_a_180', 'has_bought_brand', 'has_bought_brand_q', 'has_bought_brand_a', 'has_bought_brand_30', 'has_bought_brand_q_30', 'has_bought_brand_a_30', 'has_bought_brand_60', 'has_bought_brand_q_60', 'has_bought_brand_a_60', 'has_bought_brand_90', 'has_bought_brand_q_90', 'has_bought_brand_a_90', 
'has_bought_brand_180', 'has_bought_brand_q_180', 'has_bought_brand_a_180', 'never_bought_company', 'never_bought_category', 'never_bought_brand', 'has_bought_brand_company_category', 'has_bought_brand_category', 'has_bought_brand_company'))

In [100]:
test_df.shape

(0, 59)

In [101]:
train_df.columns

Index([u'id', u'label', u'offer_value', u'offer_quantity', u'total_spend',
       u'day_of_week', u'day_of_month', u'month', u'has_bought_company',
       u'has_bought_company_q', u'has_bought_company_a',
       u'has_bought_company_30', u'has_bought_company_q_30',
       u'has_bought_company_a_30', u'has_bought_company_60',
       u'has_bought_company_q_60', u'has_bought_company_a_60',
       u'has_bought_company_90', u'has_bought_company_q_90',
       u'has_bought_company_a_90', u'has_bought_company_180',
       u'has_bought_company_q_180', u'has_bought_company_a_180',
       u'has_bought_category', u'has_bought_category_q',
       u'has_bought_category_a', u'has_bought_category_30',
       u'has_bought_category_q_30', u'has_bought_category_a_30',
       u'has_bought_category_60', u'has_bought_category_q_60',
       u'has_bought_category_a_60', u'has_bought_category_90',
       u'has_bought_category_q_90', u'has_bought_category_a_90',
       u'has_bought_category_180', u'has_bought_c

In [102]:
# [safefetch(features, column) for column in df.columns]

In [103]:
def safefetch(features, column):
    if column in features:
        return features[column]
    else:
        return 0

In [104]:
train_list = []
test_list = []

In [105]:
def generate_features(loc_train, loc_test, loc_transactions, loc_out_train, loc_out_test, train_list = train_list, test_list = test_list):
#     with open(loc_out_train, "wb") as out_train, open(loc_out_test, "wb") as out_test:
        #iterate through reduced dataset
        columns = ['id', 'label', 'offer_value', 'offer_quantity', 'total_spend', 'day_of_week', 'day_of_month', 'month', 'has_bought_company', 'has_bought_company_q', 'has_bought_company_a', 'has_bought_company_30', 'has_bought_company_q_30', 'has_bought_company_a_30', 'has_bought_company_60', 'has_bought_company_q_60', 'has_bought_company_a_60', 'has_bought_company_90', 'has_bought_company_q_90', 'has_bought_company_a_90', 'has_bought_company_180', 'has_bought_company_q_180', 'has_bought_company_a_180', 'has_bought_category', 'has_bought_category_q', 'has_bought_category_a', 'has_bought_category_30', 'has_bought_category_q_30', 'has_bought_category_a_30', 'has_bought_category_60', 'has_bought_category_q_60', 'has_bought_category_a_60', 'has_bought_category_90', 'has_bought_category_q_90', 'has_bought_category_a_90', 'has_bought_category_180', 'has_bought_category_q_180', 'has_bought_category_a_180', 'has_bought_brand', 'has_bought_brand_q', 'has_bought_brand_a', 'has_bought_brand_30', 'has_bought_brand_q_30', 'has_bought_brand_a_30', 'has_bought_brand_60', 'has_bought_brand_q_60', 'has_bought_brand_a_60', 'has_bought_brand_90', 'has_bought_brand_q_90', 'has_bought_brand_a_90', 
'has_bought_brand_180', 'has_bought_brand_q_180', 'has_bought_brand_a_180', 'never_bought_company', 'never_bought_category', 'never_bought_brand', 'has_bought_brand_company_category', 'has_bought_brand_category', 'has_bought_brand_company']
        last_id = 0
        features = defaultdict(float)
        for e, line in enumerate(open(loc_transactions)):
            if e > 0: #skip header
                #poor man's csv reader
                row = line.strip().split(",")
                #write away the features when we get to a new shopper id
                if last_id != row[0] and e != 1:

                    #generate negative features
#                     print train_df
                    if "has_bought_company" not in features:
                        features['never_bought_company'] = 1

                    if "has_bought_category" not in features:
                        features['never_bought_category'] = 1

                    if "has_bought_brand" not in features:
                        features['never_bought_brand'] = 1

                    if "has_bought_brand" in features and "has_bought_category" in features and "has_bought_company" in features:
                        features['has_bought_brand_company_category'] = 1

                    if "has_bought_brand" in features and "has_bought_category" in features:
                        features['has_bought_brand_category'] = 1

                    if "has_bought_brand" in features and "has_bought_company" in features:
                        features['has_bought_brand_company'] = 1

                    test = False
                    for k, v in features.items():

                        if k == "label" and v == 0.5:
                            #test
                            test = True
                    
                    if test:
                        features["id"] = last_id
                        test_list.append([safefetch(features, column) for column in columns])
                    else:
                        features["id"] = last_id
                        train_list.append([safefetch(features, column) for column in columns])
                    #print "Writing features or storing them in an array"
                    #reset features
                    features = defaultdict(float)
                #generate features from transaction record
                #check if we have a test sample or train sample
                if row[0] in train_ids or row[0] in test_ids:
                    #generate label and history
                    if row[0] in train_ids:
                        history = train_ids[row[0]]
                        if train_ids[row[0]][5] == "t":
                            features['label'] = 1
                        else:
                            features['label'] = 0
                    else:
                        history = test_ids[row[0]]
                        features['label'] = 0.5

                    #print "label", label 
                    #print "trainhistory", train_ids[row[0]]
                    #print "transaction", row
                    #print "offers", offers_dict[ train_ids[row[0]][2] ]
                    #print

                    features['offer_value'] = offers_dict[ history[2] ][4]
                    features['offer_quantity'] = offers_dict[ history[2] ][2]
                    offervalue = offers_dict[ history[2] ][4]

                    features['total_spend'] += float( row[10] )
                    
                    features['day_of_week'] = dow(history[-1])
                    features['day_of_month'] = dom(history[-1])
                    features['month'] = month(history[-1])

                    if offers_dict[ history[2] ][3] == row[4]:
                        features['has_bought_company'] += 1.0
                        features['has_bought_company_q'] += float( row[9] )
                        features['has_bought_company_a'] += float( row[10] )
                        days_diff
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_company_30'] += 1.0
                            features['has_bought_company_q_30'] += float( row[9] )
                            features['has_bought_company_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_company_60'] += 1.0
                            features['has_bought_company_q_60'] += float( row[9] )
                            features['has_bought_company_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_company_90'] += 1.0
                            features['has_bought_company_q_90'] += float( row[9] )
                            features['has_bought_company_a_90'] += float( row[10] )
                        if date_diff_days < 180:
                            features['has_bought_company_180'] += 1.0
                            features['has_bought_company_q_180'] += float( row[9] )
                            features['has_bought_company_a_180'] += float( row[10] )

                    if offers_dict[ history[2] ][1] == row[3]:

                        features['has_bought_category'] += 1.0
                        features['has_bought_category_q'] += float( row[9] )
                        features['has_bought_category_a'] += float( row[10] )
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_category_30'] += 1.0
                            features['has_bought_category_q_30'] += float( row[9] )
                            features['has_bought_category_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_category_60'] += 1.0
                            features['has_bought_category_q_60'] += float( row[9] )
                            features['has_bought_category_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_category_90'] += 1.0
                            features['has_bought_category_q_90'] += float( row[9] )
                            features['has_bought_category_a_90'] += float( row[10] )						
                        if date_diff_days < 180:
                            features['has_bought_category_180'] += 1.0
                            features['has_bought_category_q_180'] += float( row[9] )
                            features['has_bought_category_a_180'] += float( row[10] )				
                    if offers_dict[ history[2] ][5] == row[5]:
                        features['has_bought_brand'] += 1.0
                        features['has_bought_brand_q'] += float( row[9] )
                        features['has_bought_brand_a'] += float( row[10] )
                        date_diff_days = days_diff(row[6],history[-1])
                        if date_diff_days < 30:
                            features['has_bought_brand_30'] += 1.0
                            features['has_bought_brand_q_30'] += float( row[9] )
                            features['has_bought_brand_a_30'] += float( row[10] )
                        if date_diff_days < 60:
                            features['has_bought_brand_60'] += 1.0
                            features['has_bought_brand_q_60'] += float( row[9] )
                            features['has_bought_brand_a_60'] += float( row[10] )
                        if date_diff_days < 90:
                            features['has_bought_brand_90'] += 1.0
                            features['has_bought_brand_q_90'] += float( row[9] )
                            features['has_bought_brand_a_90'] += float( row[10] )						
                        if date_diff_days < 180:
                            features['has_bought_brand_180'] += 1.0
                            features['has_bought_brand_q_180'] += float( row[9] )
                            features['has_bought_brand_a_180'] += float( row[10] )	
                last_id = row[0]
                if e % 100000 == 0:
                    print e


In [106]:
columns = ['id', 'label', 'offer_value', 'offer_quantity', 'total_spend', 'day_of_week', 'day_of_month', 'month', 'has_bought_company', 'has_bought_company_q', 'has_bought_company_a', 'has_bought_company_30', 'has_bought_company_q_30', 'has_bought_company_a_30', 'has_bought_company_60', 'has_bought_company_q_60', 'has_bought_company_a_60', 'has_bought_company_90', 'has_bought_company_q_90', 'has_bought_company_a_90', 'has_bought_company_180', 'has_bought_company_q_180', 'has_bought_company_a_180', 'has_bought_category', 'has_bought_category_q', 'has_bought_category_a', 'has_bought_category_30', 'has_bought_category_q_30', 'has_bought_category_a_30', 'has_bought_category_60', 'has_bought_category_q_60', 'has_bought_category_a_60', 'has_bought_category_90', 'has_bought_category_q_90', 'has_bought_category_a_90', 'has_bought_category_180', 'has_bought_category_q_180', 'has_bought_category_a_180', 'has_bought_brand', 'has_bought_brand_q', 'has_bought_brand_a', 'has_bought_brand_30', 'has_bought_brand_q_30', 'has_bought_brand_a_30', 'has_bought_brand_60', 'has_bought_brand_q_60', 'has_bought_brand_a_60', 'has_bought_brand_90', 'has_bought_brand_q_90', 'has_bought_brand_a_90', 
'has_bought_brand_180', 'has_bought_brand_q_180', 'has_bought_brand_a_180', 'never_bought_company', 'never_bought_category', 'never_bought_brand', 'has_bought_brand_company_category', 'has_bought_brand_category', 'has_bought_brand_company']

In [None]:
generate_features(loc_train, loc_test, "../data/reduced.csv", loc_out_train, loc_out_test)

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000


In [None]:
columns = ['id', 'label', 'offer_value', 'offer_quantity', 'total_spend', 'day_of_week', 'day_of_month', 'month', 'has_bought_company', 'has_bought_company_q', 'has_bought_company_a', 'has_bought_company_30', 'has_bought_company_q_30', 'has_bought_company_a_30', 'has_bought_company_60', 'has_bought_company_q_60', 'has_bought_company_a_60', 'has_bought_company_90', 'has_bought_company_q_90', 'has_bought_company_a_90', 'has_bought_company_180', 'has_bought_company_q_180', 'has_bought_company_a_180', 'has_bought_category', 'has_bought_category_q', 'has_bought_category_a', 'has_bought_category_30', 'has_bought_category_q_30', 'has_bought_category_a_30', 'has_bought_category_60', 'has_bought_category_q_60', 'has_bought_category_a_60', 'has_bought_category_90', 'has_bought_category_q_90', 'has_bought_category_a_90', 'has_bought_category_180', 'has_bought_category_q_180', 'has_bought_category_a_180', 'has_bought_brand', 'has_bought_brand_q', 'has_bought_brand_a', 'has_bought_brand_30', 'has_bought_brand_q_30', 'has_bought_brand_a_30', 'has_bought_brand_60', 'has_bought_brand_q_60', 'has_bought_brand_a_60', 'has_bought_brand_90', 'has_bought_brand_q_90', 'has_bought_brand_a_90', 
'has_bought_brand_180', 'has_bought_brand_q_180', 'has_bought_brand_a_180', 'never_bought_company', 'never_bought_category', 'never_bought_brand', 'has_bought_brand_company_category', 'has_bought_brand_category', 'has_bought_brand_company']

In [None]:
train_df = pd.DataFrame(train_list,columns=columns)

In [None]:
test_df = pd.DataFrame(test_list,columns=columns)

In [None]:
train_df.to_csv(loc_out_train, index = False)
test_df.to_csv(loc_out_test, index = False)

In [None]:
example = example.append(['12262064', 0.5, '1.5', '1', 4118.179999999954, 3, 27, 6, 1.0, 1.0, 1.95, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3.0, 4.0, 56.79, 0, 0, 0, 1.0, 1.0, 2.79, 1.0, 1.0, 2.79, 1.0, 1.0, 2.79, 0, 1, 0, 0, 0, 1], axis =0)

In [None]:
train_df.to_csv(loc_out_train, index =False)
test_df.to_csv(loc_out_test, index=False)

In [None]:
train_df

In [None]:
from pyspark.ml.feature import StringIndexer

# Convert offers to numerical categories
indexer = StringIndexer().setInputCol("offer").setOutputCol("offer_idx").fit(offer_df)

In [None]:
from pyspark.ml.feature import OneHotEncoder

# One-hot-encoding of offer category
encoder = OneHotEncoder().setOutputCol("encoded").setDropLast(False)
train_df = encoder.setInputCol("offer_idx").transform(indexer.transform(train_df))

In [None]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=["encoded"], outputCol="features")
train_df = train_df.withColumnRenamed('repeater', 'label')

In [None]:
output = assembler.transform(train_df)
output.cache()

In [None]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=10)

In [None]:
# Fit the model
lrModel = lr.fit(output)

In [None]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

In [None]:
train_pred = lrModel.transform(output.select('features'))

In [None]:
train_pred.show()

# Predicting on Testing data

In [None]:
test_df = encoder.setInputCol("offer_idx").transform(indexer.transform(test_df))

In [None]:
test_output = assembler.transform(test_df)
test_output.cache()

In [None]:
test_pred = lrModel.transform(test_output.select('features'))

In [42]:
test_id = test_df.select('id').toPandas()
test_prob = test_pred.select('probability').toPandas()

In [43]:
prob = []
for i in range(test_prob.shape[0]):
    prob.append(test_prob['probability'][i][1])
test_prob['probYes'] = prob

In [44]:
test_final_pred = pd.merge(test_id, test_prob, left_index=True, right_index=True)

In [45]:
test_final_pred.head()

Unnamed: 0,id,probability,probYes
0,12262064,"[0.737425892072, 0.262574107928]",0.262574
1,12277270,"[0.737425892072, 0.262574107928]",0.262574
2,12332190,"[0.737425892072, 0.262574107928]",0.262574
3,12524696,"[0.737425892072, 0.262574107928]",0.262574
4,13074629,"[0.737425892072, 0.262574107928]",0.262574


In [46]:
test_final_pred['probYes'].describe()

count    151484.000000
mean          0.262794
std           0.018403
min           0.143080
25%           0.262574
50%           0.262574
75%           0.262574
max           0.434382
Name: probYes, dtype: float64

In [47]:
a = 3

In [50]:
def test(x):
    if a == 3:
        print "yes"
    a +=1

In [51]:
test(5)

UnboundLocalError: local variable 'a' referenced before assignment