In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import Row
from pyspark.sql.types import *
from datetime import datetime
import pyspark.sql.functions as F

### Skip the next cell if you already have a chunk of the transactions data (`../data/transactions_chunk1.csv`).

In [2]:
# Write the first `chunksize` rows of the transactions CSV to a smaller chunk
# file that the rest of the notebook loads.
file_name = '../data/transactions.csv'
chunksize = 10 ** 5  # rows per chunk (100,000)
chunk_iter = pd.read_csv(file_name, chunksize=chunksize)
# Use the built-in next() instead of the iterator's Python-2-only .next()
# method so this cell runs under both Python 2 and Python 3.
chunk1 = next(chunk_iter)
chunk1.to_csv('../data/transactions_chunk1.csv', index=False)

In [3]:
# Read the chunk file as an RDD of raw text lines.
# `sc` is never created in this notebook; presumably it is the SparkContext
# supplied by the PySpark kernel -- TODO confirm.
lines = sc.textFile('../data/transactions_chunk1.csv')

In [4]:
# Treat the first line as the CSV header and drop every line equal to it.
# NOTE(review): this rebinds `lines`, so re-running this cell without first
# re-running the cell that loads the file would take the first data row as the
# header and filter it out as well.
header = lines.first()
lines = lines.filter(lambda row: row != header)

In [5]:
# Functions for parsing the strings from the CSV file

def toIntSafe(inval):
  """Parse ``inval`` as an int, returning None when it cannot be parsed.

  None is returned both for malformed text (ValueError) and for inputs that
  ``int`` cannot accept at all, such as None (TypeError).
  """
  try:
    return int(inval)
  except (TypeError, ValueError):
    return None

def toTimeSafe(inval):
  """Parse a ``YYYY-MM-DD`` string into a datetime, or return None.

  None is returned for text that does not match the format (ValueError) and
  for non-string input such as None (TypeError).
  """
  try:
    return datetime.strptime(inval, "%Y-%m-%d")
  except (TypeError, ValueError):
    return None

def toLongSafe(inval):
  """Parse ``inval`` as an arbitrarily large integer, or return None.

  ``int`` is used instead of ``long``: ``long`` does not exist on Python 3,
  while ``int`` handles large values on both Python 2 (it promotes to long
  automatically) and Python 3.

  None is returned for malformed text (ValueError) and for inputs such as
  None (TypeError).
  """
  try:
    return int(inval)
  except (TypeError, ValueError):
    return None

def toFloatSafe(inval):
  """Parse ``inval`` as a float, returning None when it cannot be parsed.

  None is returned for malformed text (ValueError) and for inputs such as
  None (TypeError).
  """
  try:
    return float(inval)
  except (TypeError, ValueError):
    return None
    
def stringToPost(row):
  """Parse one comma-separated transactions line into a positional Row.

  Values are emitted in the order id, chain, dept, category, company, brand,
  date, productsize, purchasemeasure, purchasequantity, purchaseamount, which
  is the order declared by ``transactions_schema``.

  Every numeric/date field except the id goes through a ``*Safe`` helper that
  yields None for an unparseable value; the purchase measure is passed through
  as text. The id is parsed with a plain ``int`` so a malformed id raises
  ValueError. A line with fewer than 11 fields raises IndexError.

  The line is split on every comma, so quoted fields that themselves contain
  commas are not supported.
  """
  # Split the text directly. Encoding to UTF-8 first produces ``bytes`` on
  # Python 3, and ``bytes.split(',')`` with a str separator raises TypeError.
  r = row.split(',')
  return Row(
    int(r[0]),         # Don't want this column to be nullable
    toLongSafe(r[1]),
    toLongSafe(r[2]),
    toLongSafe(r[3]),
    toLongSafe(r[4]),
    toLongSafe(r[5]),
    toTimeSafe(r[6]),
    toFloatSafe(r[7]),
    r[8],
    toLongSafe(r[9]),
    toFloatSafe(r[10]))

In [6]:
# Schema of the transactions table, one (name, Spark type, nullable) triple per
# column. Only ``id`` is declared non-nullable.
transactions_schema = StructType([
    StructField(*_field)
    for _field in (
        ("id", LongType(), False),
        ("chain", LongType(), True),
        ("dept", LongType(), True),
        ("category", LongType(), True),
        ("company", LongType(), True),
        ("brand", LongType(), True),
        ("date", DateType(), True),
        ("productsize", DoubleType(), True),
        ("purchasemeasure", StringType(), True),
        ("purchasequantity", LongType(), True),
        ("purchaseamount", DoubleType(), True),
    )
])

In [7]:
# Parse every remaining text line into a Row of typed values.
rowRDD = lines.map(stringToPost)

In [8]:
# Build the transactions DataFrame from the parsed rows using the explicit schema.
# `sqlContext` is never created in this notebook; presumably it is supplied by
# the PySpark kernel -- TODO confirm.
transactions = sqlContext.createDataFrame(rowRDD, transactions_schema)

In [9]:
# Persist the DataFrame as the table 'transactions'.
# The writer's default save mode raises an error when the table already exists,
# which makes this cell fail on every run after the first. Overwrite mode keeps
# the notebook re-runnable and keeps the table in sync with the loaded chunk.
transactions.write.mode('overwrite').saveAsTable('transactions')

In [10]:
# Display the column names, types and nullability of the transactions DataFrame.
transactions.printSchema()

root
 |-- id: long (nullable = false)
 |-- chain: long (nullable = true)
 |-- dept: long (nullable = true)
 |-- category: long (nullable = true)
 |-- company: long (nullable = true)
 |-- brand: long (nullable = true)
 |-- date: date (nullable = true)
 |-- productsize: double (nullable = true)
 |-- purchasemeasure: string (nullable = true)
 |-- purchasequantity: long (nullable = true)
 |-- purchaseamount: double (nullable = true)



In [11]:
# Preview the first rows of the transactions DataFrame.
transactions.show()

+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|   id|chain|dept|category|   company|brand|      date|productsize|purchasemeasure|purchasequantity|purchaseamount|
+-----+-----+----+--------+----------+-----+----------+-----------+---------------+----------------+--------------+
|86246|  205|   7|     707|1078778070|12564|2012-03-02|       12.0|             OZ|               1|          7.59|
|86246|  205|  63|    6319| 107654575|17876|2012-03-02|       64.0|             OZ|               1|          1.59|
|86246|  205|  97|    9753|1022027929|    0|2012-03-02|        1.0|             CT|               1|          5.99|
|86246|  205|  25|    2509| 107996777|31373|2012-03-02|       16.0|             OZ|               1|          1.99|
|86246|  205|  55|    5555| 107684070|32094|2012-03-02|       16.0|             OZ|               2|         10.38|
|86246|  205|  97|    9753|1021015020|    0|2012-03-02|        1.0|     

Get the total number of transactions and the number of refund transactions (negative purchase amount) per id

In [12]:
import pyspark.sql.functions as F

# Flag a transaction as a return (1) when its purchase amount is negative,
# otherwise 0.
return_flag = F.when(transactions['purchaseamount'] < 0, 1).otherwise(0)

# Per id: total transactions (sum of a constant-1 column) and total returns
# (sum of the flag).
returns = (
    transactions
    .select('id', return_flag.alias('return'))
    .withColumn('1', F.lit(1))
    .groupBy('id')
    .agg(F.sum('1').alias('total_trans'),
         F.sum("return").alias('total_returns'))
)
returns.show()

+--------+-----------+-------------+
|      id|total_trans|total_returns|
+--------+-----------+-------------+
|18470775|        350|            8|
|14723452|        755|            5|
|15738658|         39|            0|
|17552659|        591|            4|
|12996040|        326|            5|
|16078766|        966|           54|
|18249735|       1557|           60|
|14989775|        614|           39|
|15073302|        526|           38|
|16075389|        591|           32|
|16606739|        678|            6|
|15705695|        431|            8|
|14576147|        817|           66|
|15134033|        944|           13|
|16551772|       1699|           47|
|17652157|       1407|           15|
|13089312|       1218|           52|
|13744500|       2232|          112|
|16829614|        738|           10|
|17524817|        328|            7|
+--------+-----------+-------------+
only showing top 20 rows



# Training data and testing data

In [13]:
# Load the training history and the offers table into pandas.
train = pd.read_csv('../data/trainHistory.csv')
offers = pd.read_csv('../data/offers.csv')

In [14]:
# Add department to offers
# The department is taken from the leading characters of the category code:
# the first two characters when the code is four characters long, otherwise
# the first character. Vectorized string operations replace the row-by-row
# `iloc` loop; the resulting column is the same.
category_str = offers['category'].astype(str)
offers['dept'] = pd.to_numeric(
    category_str.str[:2].where(category_str.str.len() == 4,
                               category_str.str[:1]))

In [15]:
# Left-join the offers columns onto the training history, keyed on `offer`.
# NOTE(review): this rebinds `train`; re-running this cell on its own would
# merge the offers columns a second time.
train = pd.merge(train, offers, how='left', on=['offer'])

In [16]:
# Rename columns
# The 13 names are assigned by position, so this depends on the column order
# of the merged frame and fails if the frame does not have exactly 13 columns.
train.columns = ['id', 'chain', 'offer', 'market', 'repeattrips', 'repeater', 'offerdate', 
                 'offer_category', 'quantity', 'offer_company', 'offervalue', 'offer_brand',
                 'offer_dept']

In [17]:
# Reencode the target variable
# 't' becomes 1 and every other value becomes 0, stored as a 64-bit integer.
train['repeater'] = (train['repeater'] == 't').astype('int64')

In [18]:
# Store offerdate as plain strings; train_schema declares this column as
# StringType.
train['offerdate'] = train['offerdate'].astype(str)

In [19]:
# Load the test history and left-join the offers columns onto it, keyed on
# `offer`.
test = pd.read_csv('../data/testHistory.csv')
test = pd.merge(test, offers, how='left', on=['offer'])

In [20]:
# Rename columns
# The 11 names are assigned by position, so this depends on the column order
# of the merged frame and fails if the frame does not have exactly 11 columns.
test.columns = ['id', 'chain', 'offer', 'market', 'offerdate', 'offer_category', 'quantity', 
                'offer_company', 'offervalue', 'offer_brand', 'offer_dept']

In [21]:
# Store offerdate as plain strings; test_schema declares this column as
# StringType.
test['offerdate'] = test['offerdate'].astype(str)

In [22]:
# Spark schema for the merged training frame, one (name, Spark type, nullable)
# triple per column. Only ``id`` is non-nullable; ``offerdate`` is still a
# string at this stage.
train_schema = StructType([
    StructField(*_field)
    for _field in (
        ("id", LongType(), False),
        ("offer_chain", LongType(), True),
        ("offer", LongType(), True),
        ("market", LongType(), True),
        ("repeattrips", LongType(), True),
        ("repeater", IntegerType(), True),
        ("offerdate", StringType(), True),
        ("offer_category", LongType(), True),
        ("quantity", LongType(), True),
        ("offer_company", LongType(), True),
        ("offervalue", DoubleType(), True),
        ("offer_brand", LongType(), True),
        ("offer_dept", LongType(), True),
    )
])

In [23]:
# Convert the pandas training frame to a Spark DataFrame.
# Column names come from train_schema, not from the pandas frame, so the pandas
# column `chain` appears as `offer_chain` in the Spark DataFrame.
train_df = sqlContext.createDataFrame(train, train_schema)

In [24]:
# Spark schema for the merged test frame, one (name, Spark type, nullable)
# triple per column. Only ``id`` is non-nullable; ``offerdate`` is still a
# string at this stage.
test_schema = StructType([
    StructField(*_field)
    for _field in (
        ("id", LongType(), False),
        ("offer_chain", LongType(), True),
        ("offer", LongType(), True),
        ("market", LongType(), True),
        ("offerdate", StringType(), True),
        ("offer_category", LongType(), True),
        ("quantity", LongType(), True),
        ("offer_company", LongType(), True),
        ("offervalue", DoubleType(), True),
        ("offer_brand", LongType(), True),
        ("offer_dept", LongType(), True),
    )
])

In [25]:
# Convert the pandas test frame to a Spark DataFrame.
# Column names come from test_schema, not from the pandas frame, so the pandas
# column `chain` appears as `offer_chain` in the Spark DataFrame.
test_df = sqlContext.createDataFrame(test, test_schema)

In [26]:
# This function converts the string cell into a date:
def _parse_offerdate(value):
    """Parse a 'YYYY-MM-DD' string; return None for null or malformed input."""
    try:
        return datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError):
        # A null cell (TypeError) or an unparseable string such as 'nan'
        # (ValueError) becomes a null date instead of failing the Spark job.
        return None


# Spark UDF wrapper: takes a string column and yields a DateType column.
stringToDate = F.udf(_parse_offerdate, DateType())

In [28]:
# Convert offerdate to date type
# The string column is replaced under the same name in both DataFrames.
train_df = train_df.withColumn('offerdate', stringToDate(F.col('offerdate')))
test_df = test_df.withColumn('offerdate', stringToDate(F.col('offerdate')))

In [29]:
# Stack the `offer` column of train and test into one single-column DataFrame,
# so it contains every offer value present in either set.
offer_df = train_df.select('offer').union(test_df.select('offer'))

In [30]:
from pyspark.ml.feature import StringIndexer

# Convert offers to numerical categories
# The indexer is fitted on the combined train + test offers; `indexer` holds
# the fitted model that maps `offer` to `offer_idx`.
indexer = StringIndexer(inputCol="offer", outputCol="offer_idx").fit(offer_df)

In [31]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the indexed offer (`offer_idx` -> `encoded`), keeping the last
# category instead of dropping it.
encoder = OneHotEncoder(outputCol="encoded", dropLast=False)
encoder.setInputCol("offer_idx")

# Index the training offers first, then one-hot encode the resulting index.
train_indexed = indexer.transform(train_df)
train_df = encoder.transform(train_indexed)

In [32]:
from pyspark.ml.feature import VectorAssembler

# Pack the one-hot `encoded` column into the single vector column `features`.
assembler = VectorAssembler(inputCols=["encoded"], outputCol="features")
# Rename the target column from `repeater` to `label`.
train_df = train_df.withColumnRenamed('repeater', 'label')

In [33]:
# Add the `features` column to the training DataFrame and mark it for caching.
# `output.cache()` is the last expression of the cell, so the DataFrame repr is
# displayed as the cell output.
output = assembler.transform(train_df)
output.cache()

DataFrame[id: bigint, offer_chain: bigint, offer: bigint, market: bigint, repeattrips: bigint, label: int, offerdate: date, offer_category: bigint, quantity: bigint, offer_company: bigint, offervalue: double, offer_brand: bigint, offer_dept: bigint, offer_idx: double, encoded: vector, features: vector]

In [34]:
from pyspark.ml.classification import LogisticRegression
# Logistic regression estimator limited to 10 iterations (maxIter=10); every
# other parameter is left at its default.
lr = LogisticRegression(maxIter=10)

In [35]:
# Fit the model
# Trained on the assembled training DataFrame `output`.
lrModel = lr.fit(output)

In [36]:
# Print the coefficients and intercept for logistic regression
print("Coefficients: {}".format(lrModel.coefficients))
print("Intercept: {}".format(lrModel.intercept))

Coefficients: [-0.379931268312,0.0,0.0,0.0,0.768636468499,0.283150286488,0.724675544876,0.0,-1.50125193043,0.110483492047,0.0,0.533911070781,-1.09949972076,1.06082166521,-0.757308249666,0.0,0.0,0.207831736982,0.0,0.0,0.374098326264,0.0,-1.34532985933,-0.581996104996,0.0,-0.502179595966,-0.36126825406,-0.267531258209,0.0,0.0,-0.617348739884,-0.442416183827,-0.578967012375,-1.10155950142,-0.394488996688,-0.292103966717,-0.177202770959]
Intercept: -1.03263223974


In [37]:
# Score the training data. Only the `features` column is passed to the model,
# so the resulting DataFrame carries no id or label column.
train_pred = lrModel.transform(output.select('features'))

In [38]:
# Preview the first rows of the training predictions.
train_pred.show()

+---------------+--------------------+--------------------+----------+
|       features|       rawPrediction|         probability|prediction|
+---------------+--------------------+--------------------+----------+
| (37,[5],[1.0])|[0.74948195324844...|[0.67906580890541...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
|(37,[23],[1.0])|[1.61462834473266...|[0.83405298000005...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
|(37,[14],[1.0])|[1.78994048940313...|[0.85691998019203...|       0.0|
|(37,[14],[1.0])|[1.78994048940313...|[0.85691998019203...|       0.0|
| (37,[9],[1.0])|[0.92214874768961...|[0.71547972547370...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
| (37,[0],[1.0])|[1.41256350804840...|[0.80416996131692...|       0.0|
|(37,[

# Predicting on Testing data

In [39]:
# Index and one-hot encode the test offers with the same fitted `indexer` and
# the same `encoder` that were applied to the training data.
# NOTE(review): this rebinds `test_df`; re-running this cell on its own would
# apply the transforms to an already transformed frame.
test_df = encoder.setInputCol("offer_idx").transform(indexer.transform(test_df))

In [40]:
# Add the `features` column to the test DataFrame and mark it for caching.
# `test_output.cache()` is the last expression of the cell, so the DataFrame
# repr is displayed as the cell output.
test_output = assembler.transform(test_df)
test_output.cache()

DataFrame[id: bigint, offer_chain: bigint, offer: bigint, market: bigint, offerdate: date, offer_category: bigint, quantity: bigint, offer_company: bigint, offervalue: double, offer_brand: bigint, offer_dept: bigint, offer_idx: double, encoded: vector, features: vector]

In [41]:
# Score the test data. Only the `features` column is passed to the model, so
# the resulting DataFrame carries no id column.
test_pred = lrModel.transform(test_output.select('features'))

In [42]:
# Collect ids and probabilities from ONE scored DataFrame so that every
# probability sits in the same row as its id. Collecting them from two separate
# DataFrames (ids from `test_df`, probabilities from `test_pred`) only pairs up
# correctly if both Spark jobs happen to return rows in the same order.
test_scored = lrModel.transform(test_output).select('id', 'probability').toPandas()

# Same two single-column pandas frames as before, sharing one row index.
# `.copy()` makes them independent frames so adding columns later is safe.
test_id = test_scored[['id']].copy()
test_prob = test_scored[['probability']].copy()

In [43]:
# Take element 1 of each probability vector (presumably the probability of
# label 1 -- confirm against the model's label ordering) as `probYes`.
# Iterating over the values directly avoids one label-based lookup per row and
# does not depend on the frame having a 0..n-1 index.
prob = [vec[1] for vec in test_prob['probability']]
test_prob['probYes'] = prob

In [44]:
# Put ids and probabilities side by side by joining on the pandas row index.
# The pairing is purely positional: it is only correct if `test_id` and
# `test_prob` list their rows in the same order.
test_final_pred = pd.merge(test_id, test_prob, left_index=True, right_index=True)

In [45]:
# Preview the first rows of the merged test predictions.
test_final_pred.head()

Unnamed: 0,id,probability,probYes
0,12262064,"[0.737425892072, 0.262574107928]",0.262574
1,12277270,"[0.737425892072, 0.262574107928]",0.262574
2,12332190,"[0.737425892072, 0.262574107928]",0.262574
3,12524696,"[0.737425892072, 0.262574107928]",0.262574
4,13074629,"[0.737425892072, 0.262574107928]",0.262574


In [46]:
# Summary statistics of the `probYes` column over the whole test set.
test_final_pred['probYes'].describe()

count    151484.000000
mean          0.262794
std           0.018403
min           0.143080
25%           0.262574
50%           0.262574
75%           0.262574
max           0.434382
Name: probYes, dtype: float64