In [1]:
from pyspark.sql import Row
from pyspark.sql.types import *
from datetime import datetime
import pyspark.sql.functions as F
import pandas as pd
import numpy as np

In [2]:
file_name = 'transactions.csv'
chunksize = 10 ** 6
chunk_iter = pd.read_csv(file_name, chunksize=chunksize)
chunk1 = chunk_iter.next()
chunk1.to_csv('transactions_chunk1.csv', index=False)
lines = sc.textFile('transactions_chunk1.csv')
header = lines.first()
lines = lines.filter(lambda row: row != header)

In [3]:
# Functions for parsing the strings from the CSV file

def toIntSafe(inval):
  try:
    return int(inval)
  except ValueError:
    return None

def toTimeSafe(inval):
  try:
    return datetime.strptime(inval, "%Y-%m-%d")
  except ValueError:
    return None

def toLongSafe(inval):
  try:
    return long(inval)
  except ValueError:
    return None

def toFloatSafe(inval):
  try:
    return float(inval)
  except ValueError:
    return None
    
def stringToPost(row):
  r = row.encode('utf8').split(',')
  return Row(
    int(r[0]),         # Don't want this column to be nullable
    toLongSafe(r[1]),
    toLongSafe(r[2]),
    toLongSafe(r[3]),
    toLongSafe(r[4]),
    toLongSafe(r[5]),
    toTimeSafe(r[6]),
    toFloatSafe(r[7]),
    r[8],
    toLongSafe(r[9]),
    toFloatSafe(r[10]))  

In [4]:
transactions_schema = StructType([
    StructField("id", LongType(), False),
    StructField("chain", LongType(), True),
    StructField("dept", LongType(), True),
    StructField("category", LongType(), True),
    StructField("company", LongType(), True),
    StructField("brand", LongType(), True),
    StructField("date", DateType(), True),
    StructField("productsize", DoubleType(), True),
    StructField("purchasemeasure", StringType(), True),
    StructField("purchasequantity", LongType(), True),
    StructField("purchaseamount", DoubleType(), True)
    ])

In [5]:
rowRDD = lines.map(lambda p: stringToPost(p))
transactions = sqlContext.createDataFrame(rowRDD, transactions_schema)

In [6]:
transactions.show()

+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|   id|chain|dept|category|   company|brand|      date|productsize|purchasemeasure|purchasequantity|purchaseamount|
+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|86246|  205|   7|     707|1078778070|12564|2012-03-02|       12.0|             OZ|               1|          7.59|
|86246|  205|  63|    6319| 107654575|17876|2012-03-02|       64.0|             OZ|               1|          1.59|
|86246|  205|  97|    9753|1022027929|    0|2012-03-02|        1.0|             CT|               1|          5.99|
|86246|  205|  25|    2509| 107996777|31373|2012-03-02|       16.0|             OZ|               1|          1.99|
|86246|  205|  55|    5555| 107684070|32094|2012-03-02|       16.0|             OZ|               2|         10.38|
|86246|  205|  97|    9753|1021015020|    0|2012-03-02|        1.0|     

In [7]:
train = pd.read_csv('trainHistory.csv')
test = pd.read_csv('testHistory.csv')
offers = pd.read_csv('offers.csv')
# Add department to offers
dept = []
for i in range(offers.shape[0]):
    str_category = str(offers['category'].iloc[i])
    if len(str_category) == 4:
        dept.append(str_category[:2])
    else:
        dept.append(str_category[:1])
        
offers['dept'] = dept
offers['dept'] = pd.to_numeric(offers['dept'])

In [8]:
# Reencode the target variable
train['repeater'] = np.where(train['repeater'] == 't', '1', '0')
train['repeater'] = pd.to_numeric(train['repeater'])

In [9]:
train = pd.merge(train, offers, how='left', on=['offer'])
test = pd.merge(test, offers, how='left', on=['offer'])
# Rename columns
train.columns = ['id', 'chain', 'offer', 'market', 'repeattrips', 'repeater', 'offerdate', 
                 'offer_category', 'quantity', 'offer_company', 'offervalue', 'offer_brand',
                 'offer_dept']
test.columns = ['id', 'chain', 'offer', 'market', 'offerdate', 'offer_category', 'quantity', 
                'offer_company', 'offervalue', 'offer_brand', 'offer_dept']

In [174]:
train_schema = StructType([
    StructField("id", LongType(), False),
    StructField("offer_chain", LongType(), True),
    StructField("offer", LongType(), True),
    StructField("market", LongType(), True),
    StructField("repeattrips", LongType(), True),
    StructField("repeater", IntegerType(), True),
    StructField("offerdate", StringType(), True),
    StructField("offer_category", LongType(), True),
    StructField("quantity", LongType(), True),
    StructField("offer_company", LongType(), True),
    StructField("offervalue", DoubleType(), True),
    StructField("offer_brand", LongType(), True),
    StructField("offer_dept", LongType(), True)
    ])

train_df = sqlContext.createDataFrame(train, train_schema)
test_schema = StructType([
    StructField("id", LongType(), False),
    StructField("offer_chain", LongType(), True),
    StructField("offer", LongType(), True),
    StructField("market", LongType(), True),
    StructField("offerdate", StringType(), True),
    StructField("offer_category", LongType(), True),
    StructField("quantity", LongType(), True),
    StructField("offer_company", LongType(), True),
    StructField("offervalue", DoubleType(), True),
    StructField("offer_brand", LongType(), True),
    StructField("offer_dept", LongType(), True)
    ])
test_df = sqlContext.createDataFrame(test, test_schema)

In [154]:
chain = train_df.join(transactions, (train_df.id == transactions.id) & (train_df.offer_chain == transactions.chain), 
                          "left").select(train_df["id"],"offer_chain","chain")


In [155]:
chain = chain.withColumn("matchChainCount", F.expr("case when offer_chain = chain then 1 else 0 end"))
chain = chain.groupBy("id").sum("matchChainCount").withColumnRenamed("sum(matchChainCount)", "matchChainCount")

In [156]:
dept = train_df.join(transactions, (train_df.id == transactions.id) & (train_df.offer_dept == transactions.dept), 
                          "left").select(train_df["id"],"offer_dept","dept")
dept = dept.withColumn("matchDeptCount", F.expr("case when offer_dept = dept then 1 else 0 end"))
dept = dept.groupBy("id").sum("matchDeptCount").withColumnRenamed("sum(matchDeptCount)", "matchDeptCount")

In [157]:
category = train_df.join(transactions, (train_df.id == transactions.id) & (train_df.offer_category == transactions.category), 
                          "left").select(train_df["id"],"offer_category","category")
category = category.withColumn("matchCategoryCount", F.expr("case when offer_category = category then 1 else 0 end"))
category = category.groupBy("id").sum("matchCategoryCount").withColumnRenamed("sum(matchCategoryCount)", "matchCategoryCount")

In [158]:
company = train_df.join(transactions, (train_df.id == transactions.id) & (train_df.offer_company == transactions.company), 
                          "left").select(train_df["id"],"offer_company","company")
company = company.withColumn("matchCompanyCount", F.expr("case when offer_company = company then 1 else 0 end"))
company = company.groupBy("id").sum("matchCompanyCount").withColumnRenamed("sum(matchCompanyCount)", "matchCompanyCount")

In [159]:
brand = train_df.join(transactions, (train_df.id == transactions.id) & (train_df.offer_brand == transactions.brand), 
                          "left").select(train_df["id"],"offer_brand","brand")
brand = brand.withColumn("matchBrandCount", F.expr("case when offer_brand = brand then 1 else 0 end"))
brand = brand.groupBy("id").sum("matchBrandCount").withColumnRenamed("sum(matchBrandCount)", "matchBrandCount")

In [169]:
repeats = train_df.select("id", "repeater")
match = chain.join(dept, "id", "left").join(category, "id", "left").join(company, "id", "left").join(brand, "id", "left").join(repeats, "id", "left")

In [183]:
from pyspark.ml.feature import VectorAssembler
match = match.withColumnRenamed("repeater", "label")
va = VectorAssembler(outputCol="features", inputCols = match.columns[1:6])

penlpoints = va.transform(match).select("features", "label")

In [172]:
penlpoints.show()

+---------------+--------+
|       features|repeater|
+---------------+--------+
|(5,[0],[387.0])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       1|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       1|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       0|
|      (5,[],[])|       1|
|      (5,[],[])|       1|
|      (5,[],[])|       1|
|      (5,[],[])|       1|
|      (5,[],[])|       0|
|      (5,[],[])|       1|
|      (5,[],[])|       0|
+---------------+--------+
only showing top 20 rows



In [177]:
# Test and Transaction

test_chain = test_df.join(transactions, (test_df.id == transactions.id) & (test_df.offer_chain == transactions.chain), 
                          "left").select(test_df["id"],"offer_chain","chain")
test_chain = test_chain.withColumn("matchChainCount", F.expr("case when offer_chain = chain then 1 else 0 end"))
test_chain = test_chain.groupBy("id").sum("matchChainCount").withColumnRenamed("sum(matchChainCount)", "matchChainCount")

test_dept = test_df.join(transactions, (test_df.id == transactions.id) & (test_df.offer_dept == transactions.dept), 
                          "left").select(test_df["id"],"offer_dept","dept")
test_dept = test_dept.withColumn("matchDeptCount", F.expr("case when offer_dept = dept then 1 else 0 end"))
test_dept = test_dept.groupBy("id").sum("matchDeptCount").withColumnRenamed("sum(matchDeptCount)", "matchDeptCount")

test_category = test_df.join(transactions, (test_df.id == transactions.id) & (test_df.offer_category == transactions.category), 
                          "left").select(test_df["id"],"offer_category","category")
test_category = test_category.withColumn("matchCategoryCount", F.expr("case when offer_category = category then 1 else 0 end"))
test_category = test_category.groupBy("id").sum("matchCategoryCount").withColumnRenamed("sum(matchCategoryCount)", "matchCategoryCount")

test_company = test_df.join(transactions, (test_df.id == transactions.id) & (test_df.offer_company == transactions.company), 
                          "left").select(test_df["id"],"offer_company","company")
test_company = test_company.withColumn("matchCompanyCount", F.expr("case when offer_company = company then 1 else 0 end"))
test_company = test_company.groupBy("id").sum("matchCompanyCount").withColumnRenamed("sum(matchCompanyCount)", "matchCompanyCount")

test_brand = test_df.join(transactions, (test_df.id == transactions.id) & (test_df.offer_brand == transactions.brand), 
                          "left").select(test_df["id"],"offer_brand","brand")
test_brand = test_brand.withColumn("matchBrandCount", F.expr("case when offer_brand = brand then 1 else 0 end"))
test_brand = test_brand.groupBy("id").sum("matchBrandCount").withColumnRenamed("sum(matchBrandCount)", "matchBrandCount")


In [178]:
test_match = test_chain.join(test_dept, "id", "left").join(test_category, "id", "left").join(test_company, "id", "left").join(test_brand, "id", "left")

In [179]:
va = VectorAssembler(outputCol="features", inputCols = test_match.columns[1:6])
testpoints = va.transform(test_match).select("features")

In [184]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=1000, fitIntercept=True)
lrmodel = lr.fit(penlpoints)

NameError: name 'lrmocel' is not defined

In [185]:
pred = lrmodel.transform(testpoints)

In [186]:
pred.show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|[517.0,0.0,0.0,2....|[0.87300474302017...|[0.70537053764279...|       0.0|
|[419.0,6.0,2.0,4....|[1.01912578103353...|[0.73480227723325...|       0.0|
|[710.0,93.0,54.0,...|[3.07034757406588...|[0.95565290508235...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           (5,[],[])|[0.98770750770535...|[0.72863487446255...|       0.0|
|           

In [188]:
pred.first()

Row(features=DenseVector([517.0, 0.0, 0.0, 2.0, 2.0]), rawPrediction=DenseVector([0.873, -0.873]), probability=DenseVector([0.7054, 0.2946]), prediction=0.0)

In [187]:
test_prob = pred.select('probability').toPandas()
prob = []
for i in range(test_prob.shape[0]):
    prob.append(test_prob['probability'][i][1])
test_prob['repeatProbability'] = prob

In [190]:
test_id = test_df.select("id").toPandas()
test_final_pred = pd.merge(test_id, test_prob, left_index=True, right_index=True)
test_final_pred.drop('probability', 1, inplace=True)


In [192]:
test_final_pred.head(10)
test_prob.head(10)

Unnamed: 0,probability,repeatProbability
0,"[0.705370537643, 0.294629462357]",0.294629
1,"[0.734802277233, 0.265197722767]",0.265198
2,"[0.955652905082, 0.0443470949176]",0.044347
3,"[0.728634874463, 0.271365125537]",0.271365
4,"[0.728634874463, 0.271365125537]",0.271365
5,"[0.728634874463, 0.271365125537]",0.271365
6,"[0.728634874463, 0.271365125537]",0.271365
7,"[0.728634874463, 0.271365125537]",0.271365
8,"[0.728634874463, 0.271365125537]",0.271365
9,"[0.728634874463, 0.271365125537]",0.271365


In [193]:
test_final_pred.to_csv('submission.csv', index=False)