In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import collections

In [2]:
campaigns = pd.read_csv('campaign_data.csv')
customers = pd.read_csv('customer_demographics.csv')
transactions = pd.read_csv('customer_transaction_data.csv')
coupons = pd.read_csv('coupon_item_mapping.csv')
items = pd.read_csv('item_data.csv')
train = pd.read_csv('train.csv')

In [3]:
campaigns.head()

Unnamed: 0,campaign_id,campaign_type,start_date,end_date
0,24,Y,21/10/13,20/12/13
1,25,Y,21/10/13,22/11/13
2,20,Y,7/9/2013,16/11/13
3,23,Y,8/10/2013,15/11/13
4,21,Y,16/09/13,18/10/13


In [4]:
coupons.head()

Unnamed: 0,coupon_id,item_id
0,105,37
1,107,75
2,494,76
3,522,77
4,518,77


In [5]:
campaigns.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   campaign_id    28 non-null     int64 
 1   campaign_type  28 non-null     object
 2   start_date     28 non-null     object
 3   end_date       28 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.0+ KB


In [6]:
transactions.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0
2,2012-01-02,1501,31962,1,106.5,-14.25,0.0
3,2012-01-02,1501,33647,1,67.32,0.0,0.0
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0


How many times, overall, did customers redeem coupons over this period?

In [7]:
train[(train['campaign_id']==13) & (train['redemption_status']==1)].groupby('customer_id').agg('count').sort_values('redemption_status')

Unnamed: 0_level_0,id,campaign_id,coupon_id,redemption_status
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1398,1,1,1,1
1317,1,1,1,1
370,1,1,1,1
378,1,1,1,1
385,1,1,1,1
...,...,...,...,...
1136,7,7,7,7
959,7,7,7,7
1070,8,8,8,8
766,8,8,8,8


We want to explore if customers redeemed coupons in the same campaign multiple times. We look at campaign id 13 below and see that customer 1 redeemed 18 coupons, customer 2 19 coupons and so on

In [8]:
train[train['campaign_id']==13].groupby('customer_id').nunique()

Unnamed: 0_level_0,id,campaign_id,coupon_id,customer_id,redemption_status
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,18,1,18,1,1
3,19,1,19,1,1
5,26,1,26,1,1
7,20,1,20,1,1
8,22,1,22,1,2
...,...,...,...,...,...
1574,34,1,33,1,2
1578,17,1,17,1,1
1579,20,1,20,1,1
1580,17,1,17,1,1


Further, we look at how many unique coupon_id's are associated with each campaign in the training set.

In [9]:
train.groupby('campaign_id')['coupon_id'].nunique()

campaign_id
1      11
2      16
3      34
4      12
5      11
6       1
7       8
8     208
9      18
10     14
11     13
12     15
13    207
26    181
27     27
28     28
29     33
30    178
Name: coupon_id, dtype: int64

Similarly, we look at how many unique campaigns are associated with each coupon and see that some coupons have been used for more than one campaign.

In [10]:
coupons_to_campaigns = train.groupby('coupon_id').agg(['unique'])['campaign_id']
print(coupons_to_campaigns.shape)
coupons_to_campaigns

(866, 1)


Unnamed: 0_level_0,unique
coupon_id,Unnamed: 1_level_1
1,[26]
2,[26]
3,[29]
4,[30]
5,[30]
...,...
1108,[13]
1110,"[8, 30]"
1112,"[13, 1]"
1114,[13]


In [11]:
coupons_to_campaigns['length'] = coupons_to_campaigns['unique'].str.len()
coupons_to_campaigns[coupons_to_campaigns['length']>1].sort_values(by='length').head()

Unnamed: 0_level_0,unique,length
coupon_id,Unnamed: 1_level_1,Unnamed: 2_level_1
243,"[29, 26]",2
889,"[8, 30]",2
885,"[8, 13]",2
870,"[9, 26]",2
864,"[27, 3]",2


In [12]:
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   date             1324566 non-null  object 
 1   customer_id      1324566 non-null  int64  
 2   item_id          1324566 non-null  int64  
 3   quantity         1324566 non-null  int64  
 4   selling_price    1324566 non-null  float64
 5   other_discount   1324566 non-null  float64
 6   coupon_discount  1324566 non-null  float64
dtypes: float64(3), int64(3), object(1)
memory usage: 70.7+ MB


In [13]:
coupons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92663 entries, 0 to 92662
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   coupon_id  92663 non-null  int64
 1   item_id    92663 non-null  int64
dtypes: int64(2)
memory usage: 1.4 MB


In [14]:
transactions.describe()

Unnamed: 0,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
count,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0,1324566.0
mean,804.002,29519.03,130.6633,114.6036,-17.76871,-0.5948983
std,457.3363,17908.06,1311.545,152.9053,37.88867,7.069367
min,1.0,1.0,1.0,0.36,-3120.31,-1992.23
25%,418.0,14684.0,1.0,49.16,-23.15,0.0
50%,801.0,26597.0,1.0,78.01,-1.78,0.0
75%,1198.0,42405.75,1.0,124.31,0.0,0.0
max,1582.0,74066.0,89638.0,17809.64,0.0,0.0


In [15]:
transactions['date'] = pd.to_datetime(transactions['date'], dayfirst=True)
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1324566 entries, 0 to 1324565
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype         
---  ------           --------------    -----         
 0   date             1324566 non-null  datetime64[ns]
 1   customer_id      1324566 non-null  int64         
 2   item_id          1324566 non-null  int64         
 3   quantity         1324566 non-null  int64         
 4   selling_price    1324566 non-null  float64       
 5   other_discount   1324566 non-null  float64       
 6   coupon_discount  1324566 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(3)
memory usage: 70.7 MB


In [16]:
campaigns['start_date'] = pd.to_datetime(campaigns['start_date'], dayfirst=True)
campaigns['end_date'] = pd.to_datetime(campaigns['end_date'], dayfirst=True)

In [17]:
transactions.sort_values(by='date').tail()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
1323113,2013-07-03,208,14759,1,89.05,-6.77,0.0
1323112,2013-07-03,208,13212,1,120.75,0.0,0.0
1323111,2013-07-03,208,11332,1,142.12,0.0,0.0
1323125,2013-07-03,208,30932,1,106.5,0.0,0.0
1322974,2013-07-03,110,9561,4,142.48,-84.06,0.0


We only want to look at transactions that fall within the period that the campaigns were being run.

In [18]:
print(transactions.shape)
transactions_valid = transactions[(transactions['date'] >= '2012-08-12')]
print(transactions_valid.shape)

(1324566, 7)
(904357, 7)


In [19]:
train_with_item = pd.merge(train, coupons, on="coupon_id")
train_with_item.head()

Unnamed: 0,id,campaign_id,coupon_id,customer_id,redemption_status,item_id
0,1,13,27,1053,0,24775
1,1,13,27,1053,0,14958
2,1,13,27,1053,0,40431
3,1,13,27,1053,0,20749
4,1,13,27,1053,0,56860


In [20]:
transactions_valid['customer_id'].unique()

array([1401,  994,  360, ...,  985,  365, 1074], dtype=int64)

We want to investigate if customers can use a coupon discount on the same item multiple times in a single transaction. From our analysis below, it turns out that 520 customers have done that.

In [21]:
transactions_redeemed = transactions_valid[transactions_valid['coupon_discount']!=0]
transactions_redeemed.head()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount
416609,2012-08-12,265,18984,1,80.14,-53.07,-26.71
416627,2012-08-12,265,53873,1,80.14,-53.07,-26.71
416687,2012-08-12,1408,51615,1,142.12,-35.62,-35.62
416900,2012-08-12,657,9240,1,355.84,-71.24,-35.62
417079,2012-08-12,1369,29420,1,58.42,-3.56,-12.47


In [22]:
items_unique = []
for customer in transactions_redeemed['customer_id'].unique():
    df_customer = transactions_redeemed[transactions_redeemed['customer_id']==customer]
    for date in df_customer['date'].unique():
        df_date = df_customer[df_customer['date']==date]
        items_unique.append(df_date['item_id'].is_unique)
counter = collections.Counter(items_unique)
print(counter)

Counter({True: 5837, False: 520})


How many are false? Filter by false 

In [23]:
transactions_valid[transactions_valid['coupon_discount']!=0].shape

(15554, 7)

In [24]:
transactions_valid[transactions_valid['coupon_discount'] == 0].shape

(888803, 7)

Now we will add a new column 'campaign_id' to transactions_redeemed

In [25]:
transactions_redeemed['campaign_id'] = 0
for ind2 in transactions_redeemed.index:
    for ind1 in campaigns.index:
        campaign = campaigns['campaign_id'][ind1]
        campaign_start_date = campaigns['start_date'][ind1]
        campaign_end_date = campaigns['end_date'][ind1]
        
        
        
        if campaign_start_date <= transactions_redeemed['date'][ind2] <= campaign_end_date:
            transactions_redeemed['campaign_id'][ind2] = campaign
            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [26]:
transactions_redeemed['campaign_id'].value_counts()

26    2320
30    1760
13    1461
8     1256
27    1156
28     960
11     931
10     913
4      812
6      742
2      700
29     579
12     572
7      555
9      464
5      333
3       40
Name: campaign_id, dtype: int64

In [27]:
transactions_redeemed.tail()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id
1324282,2013-07-03,384,12071,1,53.43,-26.71,-17.81,13
1324285,2013-07-03,384,22269,1,53.43,-26.71,-17.81,13
1324287,2013-07-03,384,33382,2,106.86,-53.43,-35.62,13
1324403,2013-07-03,1303,69686,1,78.36,0.0,-35.62,13
1324449,2013-07-03,621,72243,1,138.56,0.0,-35.62,13


In [28]:
transactions_redeemed.shape

(15554, 8)

In [29]:
transactions_valid[transactions_valid['coupon_discount']!=0].shape

(15554, 7)

We will map the campain_id to transactions_valid dataframe.

In [30]:
transactions_valid['campaign_id'] = 0
idx = transactions_valid.index[transactions_valid['coupon_discount']!=0]
transactions_valid['campaign_id'][idx] = transactions_redeemed['campaign_id']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

In [31]:
transactions_valid['campaign_id'].value_counts()

0     888803
26      2320
30      1760
13      1461
8       1256
27      1156
28       960
11       931
10       913
4        812
6        742
2        700
29       579
12       572
7        555
9        464
5        333
3         40
Name: campaign_id, dtype: int64

In [32]:
transactions_valid[transactions_valid['campaign_id']!=0].tail()

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,campaign_id
1324282,2013-07-03,384,12071,1,53.43,-26.71,-17.81,13
1324285,2013-07-03,384,22269,1,53.43,-26.71,-17.81,13
1324287,2013-07-03,384,33382,2,106.86,-53.43,-35.62,13
1324403,2013-07-03,1303,69686,1,78.36,0.0,-35.62,13
1324449,2013-07-03,621,72243,1,138.56,0.0,-35.62,13


We want to group the data by customer, item and transaction date and sum up the column values.

In [33]:
transactions_by_customer = transactions_valid.groupby(['customer_id', 'item_id', 'date']).sum()
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quantity,selling_price,other_discount,coupon_discount,campaign_id
customer_id,item_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,0
1,4577,2012-11-07,2,120.4,0.0,0.0,0
1,4796,2012-10-18,1,106.5,0.0,0.0,0
1,5126,2013-03-12,1,95.82,0.0,0.0,0
1,5126,2013-06-24,1,95.82,0.0,0.0,0


In [34]:
transactions_by_customer['campaign_id'].value_counts()

0     886558
26      2339
30      1692
13      1165
27      1135
8       1052
28       944
10       875
4        798
2        688
11       603
6        577
29       545
12       514
7        476
9        432
5        325
22       164
16       107
24        71
52        63
14        38
60        34
3         34
20        21
58        17
18        16
54         9
56         8
39         2
21         1
81         1
Name: campaign_id, dtype: int64

Above, we see that due to the sum operation, some campaign_id values are now incorrect. We will remedy that next.

In [35]:
transactions_by_customer['size'] = transactions_valid[transactions_valid['coupon_discount']!=0].groupby(['customer_id', 'item_id', 'date'])['quantity'].count()
transactions_by_customer['campaign_id'] = transactions_by_customer['campaign_id']/transactions_by_customer['size']
transactions_by_customer['campaign_id'].value_counts()

26.0    2257
30.0    1726
13.0    1312
8.0     1149
27.0    1145
28.0     952
10.0     892
4.0      802
11.0     767
2.0      694
6.0      658
29.0     562
7.0      515
12.0     501
9.0      448
5.0      329
3.0       37
Name: campaign_id, dtype: int64

We don't need the size column any longer, so we drop it next

In [36]:
transactions_by_customer.drop('size', axis=1, inplace=True)
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quantity,selling_price,other_discount,coupon_discount,campaign_id
customer_id,item_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,
1,4577,2012-11-07,2,120.4,0.0,0.0,
1,4796,2012-10-18,1,106.5,0.0,0.0,
1,5126,2013-03-12,1,95.82,0.0,0.0,
1,5126,2013-06-24,1,95.82,0.0,0.0,


Next we merge this with our customer and item data

In [37]:
customers.set_index('customer_id', inplace=True)
items.set_index('item_id', inplace=True)

In [38]:
transactions_by_customer = transactions_by_customer.join(customers, how='left')
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket
customer_id,item_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,,4.0
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,,4.0
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,,4.0
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0


In [39]:
transactions_by_customer = transactions_by_customer.join(items, how='left')
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category
customer_id,item_id,date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,,4.0,2902,Established,Pharmaceutical
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,,4.0,115,Established,Grocery
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,,4.0,278,Established,Grocery
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery


Now we want to extract information from date into new columns

In [40]:
transactions_by_customer.reset_index(level=2, inplace=True)
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,,4.0,2902,Established,Pharmaceutical
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,,4.0,115,Established,Grocery
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,,4.0,278,Established,Grocery
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery


In [41]:
transactions_by_customer['day'] = transactions_by_customer['date'].dt.day
transactions_by_customer['month'] = transactions_by_customer['date'].dt.month
transactions_by_customer['year'] = transactions_by_customer['date'].dt.year
transactions_by_customer['week'] = transactions_by_customer['date'].dt.week
transactions_by_customer['weekday'] = transactions_by_customer['date'].dt.weekday
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category,day,month,year,week,weekday
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,,4.0,2902,Established,Pharmaceutical,20,6,2013,25,3
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,,4.0,115,Established,Grocery,7,11,2012,45,2
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,,4.0,278,Established,Grocery,18,10,2012,42,3
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery,12,3,2013,11,1
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery,24,6,2013,26,0


In [42]:
transactions_by_customer['weekday'].value_counts()

4    154423
3    148261
5    127324
2    124924
6    117613
0    115036
1    113723
Name: weekday, dtype: int64

In [43]:
mymap = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
transactions_by_customer['weekday'] = transactions_by_customer['weekday'].map(mymap)
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,no_of_children,income_bracket,brand,brand_type,category,day,month,year,week,weekday
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,,4.0,2902,Established,Pharmaceutical,20,6,2013,25,Thursday
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,,4.0,115,Established,Grocery,7,11,2012,45,Wednesday
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,,4.0,278,Established,Grocery,18,10,2012,42,Thursday
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery,12,3,2013,11,Tuesday
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,,4.0,119,Established,Grocery,24,6,2013,26,Monday


In [44]:
transactions_by_customer['month_year'] = transactions_by_customer['date'].dt.strftime('%m-%Y')
transactions_by_customer.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date,quantity,selling_price,other_discount,coupon_discount,campaign_id,age_range,marital_status,rented,family_size,...,income_bracket,brand,brand_type,category,day,month,year,week,weekday,month_year
customer_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,4315,2013-06-20,1,201.97,0.0,0.0,,70+,Married,0.0,2,...,4.0,2902,Established,Pharmaceutical,20,6,2013,25,Thursday,06-2013
1,4577,2012-11-07,2,120.4,0.0,0.0,,70+,Married,0.0,2,...,4.0,115,Established,Grocery,7,11,2012,45,Wednesday,11-2012
1,4796,2012-10-18,1,106.5,0.0,0.0,,70+,Married,0.0,2,...,4.0,278,Established,Grocery,18,10,2012,42,Thursday,10-2012
1,5126,2013-03-12,1,95.82,0.0,0.0,,70+,Married,0.0,2,...,4.0,119,Established,Grocery,12,3,2013,11,Tuesday,03-2013
1,5126,2013-06-24,1,95.82,0.0,0.0,,70+,Married,0.0,2,...,4.0,119,Established,Grocery,24,6,2013,26,Monday,06-2013
