In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# Import transaction, offers, train and test data

In [3]:
# Only the first `chunksize` rows of transactions.csv are loaded; the rest of
# the file is ignored (presumably to keep memory use manageable).
# `nrows` returns the same rows as taking the first chunk of a chunked reader,
# but works on both Python 2 and 3 and leaves no open file reader behind.
chunksize = 10 ** 6
transactions = pd.read_csv("transactions.csv", header=0, nrows=chunksize)

offers = pd.read_csv("offers.csv", header=0)

train = pd.read_csv("trainHistory.csv", header=0)
# Encode the target: "t" -> 1, anything else (including missing) -> 0.
train["repeater"] = (train["repeater"] == "t").astype("int64")
# Attach the offer attributes, then drop every column that is not used as a
# model input (only id, repeater and the remaining offer fields are kept).
train = train.merge(offers, how="inner", on="offer")
train = train.drop(["chain", "brand", "offer", "repeattrips", "offerdate", "market", "category", "company"], axis=1)

# Same preparation for the test set, which has no "repeattrips" column to drop.
test = pd.read_csv("testHistory.csv", header=0)
test = test.merge(offers, how="inner", on="offer")
test = test.drop(["chain", "brand", "offer", "offerdate", "market", "category", "company"], axis=1)

# Feature generation, super rough and coarse

In [4]:
# Total counts: number of transaction rows recorded for each customer id
gross_amount = (
    transactions.groupby("id")["id"]
    .count()
    .to_frame("total")
)

def _most_frequent_value(frame, column):
    """Return, for every customer id, the most frequent value of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an "id" column and the column named by ``column``.
    column : str
        Name of the categorical column to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed by id, with a single column named ``column`` holding the
        winning value as a string (the dummy-column label with its
        "<column>_" prefix removed). On ties the first dummy column in
        column order wins, because ``idxmax`` returns the first maximum.
    """
    prefix = column + "_"
    # One indicator column per distinct value, summed per id = per-id counts.
    indicators = pd.get_dummies(
        frame[["id", column]], columns=[column], prefix=column, prefix_sep="_"
    )
    counts = indicators.groupby("id").sum()
    top_label = counts.idxmax(axis=1)
    # Slice the prefix off rather than calling str.strip(prefix): strip()
    # removes any of the given *characters* from both ends, so it would also
    # eat leading/trailing characters of the value itself whenever the value
    # contains one of the prefix letters.
    return top_label.map(lambda label: label[len(prefix):]).to_frame(name=column)


#Chain: the chain each customer shopped at most often
chain = _most_frequent_value(transactions, "chain")

#Dept: the department each customer bought from most often
dept = _most_frequent_value(transactions, "dept")

# Use the built-in grouped nunique instead of unique() followed by a
# Python-level len() per group. dropna=False keeps the original counting
# rule, where a missing value counts as one distinct entry.

#Company: number of distinct companies each customer bought from
comp = transactions.groupby("id")["company"].nunique(dropna=False).to_frame(name="comp")

#Brand: number of distinct brands each customer bought
brand = transactions.groupby("id")["brand"].nunique(dropna=False).to_frame(name="brand")

# pquantity: mean purchase quantity per customer id
pquantity = (
    transactions.groupby("id")["purchasequantity"]
    .mean()
    .to_frame("pquantity")
)

# pamount: mean purchase amount per customer id
pamount = (
    transactions.groupby("id")["purchaseamount"]
    .mean()
    .to_frame("pamount")
)

# Combine features with train and test

In [5]:
# Join all per-customer feature frames on their shared id index.
trans_summary = gross_amount
for feature_frame in (chain, dept, comp, brand, pquantity, pamount):
    trans_summary = trans_summary.merge(feature_frame, left_index=True, right_index=True)

# Turn the id index into an ordinary column. Copying the index into a new
# "id" column would leave "id" as both an index level and a column label,
# which pandas treats as ambiguous when merging on "id".
trans_summary = trans_summary.rename_axis("id").reset_index()

# Inner joins: customers with no transactions among the loaded rows are
# dropped from both train and test.
train = train.merge(trans_summary, how="inner", on="id")
test = test.merge(trans_summary, how="inner", on="id")

In [6]:
# Preview the first ten rows of the merged training frame
train.head(n=10)

Unnamed: 0,id,repeater,quantity,offervalue,total,chain,dept,comp,brand,pquantity,pamount
0,86246,1,1,2.0,12609,205,9,954,1319,1.374653,4.189715
1,15753725,0,1,2.0,1820,17,51,267,322,1.855495,5.168456
2,16535563,1,1,2.0,1012,4,63,146,176,1.571146,6.299022
3,18259179,0,1,2.0,197,3,26,92,99,1.213198,4.636548
4,21024070,1,1,2.0,2034,15,99,326,409,1.176008,3.747439
5,21603296,0,1,2.0,703,4,5,174,224,1.644381,5.581152
6,23791057,1,1,2.0,1366,95,9,251,349,1.590776,5.444283
7,48464291,0,1,2.0,1049,18,99,160,176,1.232602,3.573346
8,54625209,0,1,2.0,1716,4,9,254,276,1.13345,3.213287
9,55882738,1,1,2.0,1033,4,63,166,195,1.339787,5.686302


# Super simplified logistic regression lol (cross_val)

In [7]:
# Select features by name so train and test are guaranteed to use the same
# columns in the same order, instead of relying on positional slices that
# assume "id" and "repeater" sit in fixed positions.
target_col = "repeater"
feature_cols = [col for col in train.columns if col not in ("id", target_col)]

# NOTE(review): chain and dept hold category ids but are passed to the model
# as ordinary numeric features - consider one-hot encoding them.
algo = LogisticRegression()
algo.fit(train[feature_cols], train[target_col])
hyp = algo.predict(test[feature_cols])

# 10-fold shuffled cross-validation with a fixed seed for reproducibility.
kfold = KFold(n_splits=10, shuffle=True, random_state=100)
score = cross_val_score(algo, train[feature_cols], train[target_col], cv=kfold, scoring="accuracy")

# Single-argument print call: emits the same text on Python 2 and Python 3.
print("Cross-Val:  {}".format(np.mean(score)))

Cross-Val:  0.77
