In [47]:
# !pip install --upgrade pip --quiet
# !pip install -r requirements.txt --quiet
!pip install tqdm

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
import numpy as np  
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score as roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

import datetime
from dateutil.rrule import rrule, DAILY


Collecting tqdm
[?25l  Downloading https://files.pythonhosted.org/packages/e1/c1/bc1dba38b48f4ae3c4428aea669c5e27bd5a7642a74c8348451e0bd8ff86/tqdm-4.36.1-py2.py3-none-any.whl (52kB)
[K     |████████████████████████████████| 61kB 198kB/s eta 0:00:01
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.36.1


  from pandas import Panel


In [54]:
def load_data(base_dir=""):
    """Read every CSV table used by this notebook.

    Parameters
    ----------
    base_dir : str, optional
        Directory against which the relative data paths are resolved.  The
        default ``""`` leaves the paths relative to the current working
        directory, which is what the function did before this parameter
        existed.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(campaign_data, coupon_item_mapping_data, customer_demographics_data,
        customer_transaction_data, item_data, train_data, test_data)`` in
        exactly that order.
    """
    def read(relative_path, **read_csv_kwargs):
        # Single place where a relative path is turned into a DataFrame.
        return pd.read_csv(os.path.join(base_dir, relative_path), **read_csv_kwargs)

    # The date columns are parsed on load so they can be compared later on.
    campaign_data = read('./cleaned_data/campaign_data.csv',
                         parse_dates=['start_date', 'end_date'])
    coupon_item_mapping_data = read('./cleaned_data/coupon_item_mapping_data.csv')
    customer_demographics_data = read('./cleaned_data/customer_demographics_data.csv')
    customer_transaction_data = read('./cleaned_data/customer_transaction_data.csv',
                                     parse_dates=['date'])
    item_data = read('./cleaned_data/item_data.csv')
    train_data = read('./cleaned_data/train_data.csv')
    # NOTE: unlike the other tables, the test set is read from a sibling
    # directory of base_dir (``../test_data``), not from ``cleaned_data``.
    test_data = read('../test_data/test_QyjYwdj.csv')

    return campaign_data, coupon_item_mapping_data, customer_demographics_data, \
           customer_transaction_data, item_data, train_data, test_data

In [131]:
# Read all tables once; the *_pure names hold the frames as loaded from disk.
(campaign_data_pure,
 coupon_item_mapping_data_pure,
 customer_demographics_data_pure,
 customer_transaction_data_pure,
 item_data_pure,
 train_data_pure,
 test_data_pure) = load_data()

# cost_price = selling price with both discount columns subtracted back out.
# (In the rows displayed further down the discounts are stored as values <= 0,
# so this yields the pre-discount amount.)
customer_transaction_data_pure['cost_price'] = (
    customer_transaction_data_pure['selling_price']
    - customer_transaction_data_pure['coupon_discount']
    - customer_transaction_data_pure['other_discount']
)

# rate = cost_price per unit of quantity.
customer_transaction_data_pure['rate'] = (
    customer_transaction_data_pure['cost_price']
    / customer_transaction_data_pure['quantity']
)



In [56]:
# Bind working names to the loaded frames.
# NOTE: this is plain name binding, not a copy -- each working name refers to
# the very same DataFrame object as its *_pure counterpart, so an in-place
# change made through one name is visible through the other.
campaign_data, coupon_item_mapping_data, customer_demographics_data, \
customer_transaction_data, item_data, train_data, test_data \
= campaign_data_pure, coupon_item_mapping_data_pure, customer_demographics_data_pure, \
customer_transaction_data_pure, item_data_pure, train_data_pure, test_data_pure


In [57]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows a DataFrame down to the configured columns.

    ``attribute_names`` is stored unchanged and used to index the input in
    ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by ``self.attribute_names``."""
        wanted_columns = self.attribute_names
        return X[wanted_columns]

In [58]:
def custom_one_hot_encoder(df, column_name):
    """Append one-hot indicator columns for ``column_name`` to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Name of the categorical column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra column per category, named by the encoder
        (``<column_name>_<category>``).  The original column is kept.
    """
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
    encoded_categories = one_hot_encoder.fit_transform(df[[column_name]])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement when the installed version provides it.
    if hasattr(one_hot_encoder, 'get_feature_names_out'):
        encoded_column_names = one_hot_encoder.get_feature_names_out([column_name])
    else:
        encoded_column_names = one_hot_encoder.get_feature_names([column_name])

    # Reuse df's index: concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign the rows (and introduce NaNs) whenever df
    # does not carry a default RangeIndex.
    encoded_df = pd.DataFrame(encoded_categories.toarray(),
                              columns=encoded_column_names,
                              index=df.index)

    return pd.concat([df, encoded_df], axis=1)


In [59]:
def find_overlap_duration(start_date, end_date, intervals_df):
    """Count the days of ``[start_date, end_date]`` covered by any interval.

    ``intervals_df`` must expose ``start_date`` and ``end_date`` columns; every
    row describes one interval.  Days are enumerated with a daily ``rrule``
    (both endpoints included), and a day covered by several intervals is
    counted once.
    """
    reference_days = set(rrule(DAILY, dtstart=start_date, until=end_date))

    # Union of all days spanned by the given intervals.
    covered_days = set()
    for label in intervals_df.index:
        covered_days.update(
            rrule(DAILY,
                  dtstart=intervals_df.start_date[label],
                  until=intervals_df.end_date[label]))

    # Days of the reference window that also fall inside some interval.
    return len(reference_days & covered_days)

In [60]:
def analyse_campaign_runs(campaign_data, custom_parameter=False):
    """Derive calendar and overlap features for every campaign.

    Parameters
    ----------
    campaign_data : pandas.DataFrame
        Needs the columns ``campaign_id``, ``campaign_type``, ``start_date``
        and ``end_date``.  Each campaign is assumed to satisfy
        ``start_date <= end_date``.
    custom_parameter : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    pandas.DataFrame
        A copy of ``campaign_data`` (the input is left untouched) with the
        date columns converted via ``pd.to_datetime`` and these columns added:

        * ``duration_in_days`` - number of calendar days, both ends included
        * ``months`` - list of distinct month numbers the campaign touches
        * ``same_type_overlapping_campaigns`` - count of *other* campaigns of
          the same type whose date range overlaps this one
        * ``other_type_overlapping_campaigns`` - same for different types
        * ``same_type_overlap_duration`` / ``other_type_overlap_duration`` -
          days of this campaign covered by at least one such campaign
        * ``number_of_weekends`` - number of Saturdays and Sundays in the
          range (weekend *days*, not weekend pairs)
    """
    # Work on a copy: previously the parsed dates and every new column were
    # written straight into the frame handed in by the caller.
    campaign_data = campaign_data.copy()
    campaign_data['start_date'] = pd.to_datetime(campaign_data['start_date'])
    campaign_data['end_date'] = pd.to_datetime(campaign_data['end_date'])

    same_type_overlapping_campaigns = []
    other_type_overlapping_campaigns = []
    same_type_overlap_duration = []
    other_type_overlap_duration = []
    duration_in_days = []
    months = []
    number_of_weekends = []

    key_columns = ['campaign_id', 'campaign_type', 'start_date', 'end_date']
    for campaign in campaign_data[key_columns].itertuples(index=False):
        # Two closed date ranges overlap iff each one starts no later than
        # the other one ends.
        overlaps_this_campaign = \
            (campaign_data['start_date'] <= campaign.end_date) \
            & (campaign_data['end_date'] >= campaign.start_date)
        is_same_type = campaign_data['campaign_type'] == campaign.campaign_type
        is_other_campaign = campaign_data['campaign_id'] != campaign.campaign_id

        # Same-type metrics (the campaign itself is excluded)
        same_type_campaigns = campaign_data.loc[
            overlaps_this_campaign & is_same_type & is_other_campaign]
        same_type_overlapping_campaigns.append(same_type_campaigns.shape[0])
        same_type_overlap_duration.append(find_overlap_duration(
            campaign.start_date, campaign.end_date,
            same_type_campaigns[['start_date', 'end_date']]))

        # Other-type metrics (the campaign cannot match itself here)
        other_type_campaigns = campaign_data.loc[
            overlaps_this_campaign & ~is_same_type]
        other_type_overlapping_campaigns.append(other_type_campaigns.shape[0])
        other_type_overlap_duration.append(find_overlap_duration(
            campaign.start_date, campaign.end_date,
            other_type_campaigns[['start_date', 'end_date']]))

        # Calendar features are all derived from the set of running days.
        dates_running = set(rrule(DAILY, dtstart=campaign.start_date,
                                  until=campaign.end_date))
        months.append(list({dt.month for dt in dates_running}))
        duration_in_days.append(len(dates_running))
        # isoweekday(): Saturday == 6, Sunday == 7
        number_of_weekends.append(
            sum(dt.isoweekday() >= 6 for dt in dates_running))

    new_columns = [
        ('duration_in_days', duration_in_days),
        ('months', months),
        ('same_type_overlapping_campaigns', same_type_overlapping_campaigns),
        ('other_type_overlapping_campaigns', other_type_overlapping_campaigns),
        ('same_type_overlap_duration', same_type_overlap_duration),
        ('other_type_overlap_duration', other_type_overlap_duration),
        ('number_of_weekends', number_of_weekends),
    ]
    for column_name, values in new_columns:
        # Column assignment aligns a Series on index labels; giving the
        # Series the frame's own index keeps values on the rows they were
        # computed for even when the frame has no default RangeIndex.
        campaign_data[column_name] = pd.Series(values, index=campaign_data.index)

    return campaign_data

In [62]:
def month_feature_binarizer(df):
    """Expand the ``months`` column into twelve 0/1 columns ``Jan`` .. ``Dec``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``months`` column holds, per row, an iterable of month
        numbers (1-12).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the twelve indicator columns appended; ``months`` is kept.
    """
    month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # Pin the classes to 1..12.  Without this the binarizer only emits a
    # column for each month that actually occurs, so labelling the result
    # with twelve names fails (or mislabels) whenever some month is absent.
    multi_label_binarizer = MultiLabelBinarizer(classes=list(range(1, 13)))
    months_encoded = multi_label_binarizer.fit_transform(df['months'])

    # Reuse df's index so that concat(axis=1) lines the rows up by label.
    months_encoded_df = pd.DataFrame(months_encoded, columns=month_labels,
                                     index=df.index)
    return pd.concat([df, months_encoded_df], axis=1)


In [63]:
# Every column of the campaign table is fed into the pipeline.
attributes_to_process = campaign_data.columns.tolist()

# Feature pipeline for the campaign table:
#   selector        -> keep the listed columns
#   analyzer        -> duration / overlap / weekend features
#   month_labelizer -> one indicator column per month
#   one_hot_encoder -> indicator columns for campaign_type
campaign_type_processor = Pipeline(steps=[
    (
        'selector',
        DataFrameSelector(attributes_to_process),
    ),
    (
        'analyzer',
        FunctionTransformer(
            analyse_campaign_runs,
            validate=False,
            kw_args={"custom_parameter": False},
        ),
    ),
    (
        'month_labelizer',
        FunctionTransformer(month_feature_binarizer, validate=False),
    ),
    (
        'one_hot_encoder',
        FunctionTransformer(
            custom_one_hot_encoder,
            validate=False,
            kw_args={"column_name": 'campaign_type'},
        ),
    ),
])

In [64]:
# Run the campaign feature pipeline. The result keeps the original campaign
# columns and appends the engineered ones (see the column listing in the next cell).
campaign_data_with_old_and_new_features = campaign_type_processor.fit_transform(campaign_data)

In [65]:
# Inspect which columns the pipeline produced.
campaign_data_with_old_and_new_features.columns

Index(['campaign_id', 'campaign_type', 'start_date', 'end_date',
       'duration_in_days', 'months', 'same_type_overlapping_campaigns',
       'other_type_overlapping_campaigns', 'same_type_overlap_duration',
       'other_type_overlap_duration', 'number_of_weekends', 'Jan', 'Feb',
       'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
       'campaign_type_X', 'campaign_type_Y'],
      dtype='object')

In [67]:
# Drop the raw columns that the engineered features were derived from;
# campaign_id stays so the features can be joined back later.
campaign_data_with_only_new_features = campaign_data_with_old_and_new_features.drop(
    ['start_date', 'end_date', 'campaign_type', 'months'],
    axis='columns',
)

campaign_data_with_only_new_features.columns


Index(['campaign_id', 'duration_in_days', 'same_type_overlapping_campaigns',
       'other_type_overlapping_campaigns', 'same_type_overlap_duration',
       'other_type_overlap_duration', 'number_of_weekends', 'Jan', 'Feb',
       'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
       'campaign_type_X', 'campaign_type_Y'],
      dtype='object')

## Number of transactions by each customer before the start of the respective campaign

In [71]:
# Attach the campaign attributes (type, start/end date) to every training row.
# merge() is called without `on`/`how`, so pandas performs an inner join on all
# column names the two frames share -- presumably only campaign_id; TODO confirm.
train_and_campaign_data = train_data.merge(campaign_data_pure)


In [126]:
# One row per distinct (campaign, customer) pair together with the campaign's
# dates; the per-customer features below are computed once per such pair.
campaign_customer_combinations = \
train_and_campaign_data[['campaign_id', 'customer_id', 'start_date', 'end_date']].drop_duplicates()

In [282]:
# Display the (campaign, customer) pairs with their campaign dates.
campaign_customer_combinations

Unnamed: 0,campaign_id,customer_id,start_date,end_date
0,13,1053,2013-05-19,2013-07-05
1,13,48,2013-05-19,2013-07-05
2,13,1050,2013-05-19,2013-07-05
3,13,89,2013-05-19,2013-07-05
4,13,1067,2013-05-19,2013-07-05
...,...,...,...,...
50173,24,1017,2013-10-21,2013-12-20
50176,24,710,2013-10-21,2013-12-20
50177,24,368,2013-10-21,2013-12-20
50182,24,685,2013-10-21,2013-12-20


In [283]:
# Compute the pre-campaign purchase summary for every (campaign, customer)
# pair, showing a progress bar while doing so.
# NOTE: analyse_transactions_by_customer_till_campaign_start is defined in a
# cell further down this notebook and has to be executed before this cell.
transactions_by_customer_till_campaign_start_features = campaign_customer_combinations.progress_apply(
    lambda row: pd.Series(
        analyse_transactions_by_customer_till_campaign_start(row),
        index=[
            'campaign_id_in_question',
            'customer_id_in_question',
            'no_of_transactions_by_customer_till_campaign_start',
            'percentage_of_transactions_by_customer_till_campaign_start_with_any_discount',
            'percentage_of_transactions_by_customer_till_campaign_start_with_coupon_discount',
            'transaction_amount_by_customer_till_campaign_start',
            'transaction_amount_by_customer_till_campaign_start_where_customer_got_any_discount',
            'transaction_amount_by_customer_till_campaign_start_where_customer_got_coupon_discount',
            'average_percent_discount_by_customer_till_campaign_start',
        ],
    ),
    axis=1,
)
print(transactions_by_customer_till_campaign_start_features.head())

HBox(children=(IntProgress(value=0, max=6967), HTML(value='')))


   campaign_id_in_question  customer_id_in_question  \
0                     13.0                   1053.0   
1                     13.0                     48.0   
2                     13.0                   1050.0   
3                     13.0                     89.0   
4                     13.0                   1067.0   

   no_of_transactions_by_customer_till_campaign_start  \
0                                              270.0    
1                                              354.0    
2                                              234.0    
3                                              573.0    
4                                              868.0    

   percentage_of_transactions_by_customer_till_campaign_start_with_any_discount  \
0                                           0.533333                              
1                                           0.567797                              
2                                           0.512821                        

In [281]:
def analyse_transactions_by_customer_till_campaign_start(row, transactions=None):
    """Summarise one customer's purchases made before a campaign started.

    Parameters
    ----------
    row : object with ``customer_id``, ``campaign_id`` and ``start_date``
        One (campaign, customer) combination, e.g. the row handed over by
        ``DataFrame.apply(..., axis=1)``.
    transactions : pandas.DataFrame, optional
        Transaction table with the columns ``customer_id``, ``date``,
        ``coupon_discount``, ``other_discount`` and ``cost_price``.  When
        omitted, the notebook-level ``customer_transaction_data_pure`` is
        used, which is what the function always did before.

    Returns
    -------
    list
        In this order: campaign id, customer id, number of transactions
        dated strictly before ``row.start_date``, share of those with any
        discount, share with a coupon discount, summed ``cost_price`` of all
        of them, summed ``cost_price`` of those with any discount, summed
        ``cost_price`` of those with a coupon discount, and the mean of
        ``(coupon_discount + other_discount) / cost_price``.
        The two shares and the mean are NaN when the customer has no earlier
        transaction.
    """
    if transactions is None:
        transactions = customer_transaction_data_pure

    customer_id_in_question = row.customer_id
    campaign_id_in_question = row.campaign_id
    start_date_in_question = row.start_date

    # DF filtering: this customer's transactions strictly before the start.
    transactions_till_start = transactions.loc[
        (transactions.customer_id == customer_id_in_question)
        & (transactions.date < start_date_in_question)
    ]

    # A discount is recorded as a negative amount.
    has_coupon_discount = transactions_till_start.coupon_discount < 0
    has_any_discount = has_coupon_discount | (transactions_till_start.other_discount < 0)

    # Both masks are built on (and applied to) the same frame; a coupon
    # discount implies "any discount", so this selects the same rows as
    # filtering the any-discount subset a second time.
    transactions_with_any_discount = transactions_till_start.loc[has_any_discount]
    transactions_with_coupon_discount = transactions_till_start.loc[has_coupon_discount]

    # Number of transaction metrics
    no_of_transactions = len(transactions_till_start)
    if no_of_transactions:
        share_with_any_discount = len(transactions_with_any_discount) / no_of_transactions
        share_with_coupon_discount = len(transactions_with_coupon_discount) / no_of_transactions
    else:
        # Explicit "undefined" instead of relying on a 0/0 numpy warning.
        share_with_any_discount = float('nan')
        share_with_coupon_discount = float('nan')

    # Transaction amount metrics
    amount_total = transactions_till_start.cost_price.sum()
    amount_with_any_discount = transactions_with_any_discount.cost_price.sum()
    amount_with_coupon_discount = transactions_with_coupon_discount.cost_price.sum()

    # Mean of the per-transaction discount relative to cost_price; the
    # discounts are <= 0, so this value is <= 0 as well.
    average_percent_discount = (
        (transactions_till_start.coupon_discount + transactions_till_start.other_discount)
        / transactions_till_start.cost_price
    ).mean()

    return [campaign_id_in_question,
            customer_id_in_question,
            no_of_transactions,
            share_with_any_discount,
            share_with_coupon_discount,
            amount_total,
            amount_with_any_discount,
            amount_with_coupon_discount,
            average_percent_discount]
     
    
    

In [None]:
def get_date_in_current_year(old_date, year=2020):
    """Project a date onto a fixed reference year, keeping month and day.

    Parameters
    ----------
    old_date : object with ``month`` and ``day`` attributes
        For example a ``datetime.date``, ``datetime.datetime`` or pandas
        ``Timestamp``.
    year : int, optional
        Reference year.  The default 2020 is a leap year, so 29 February can
        be represented.

    Returns
    -------
    datetime.date
        ``datetime.date(year, old_date.month, old_date.day)``.

    Raises
    ------
    ValueError
        If ``old_date`` cannot be mapped onto ``year`` (missing value, or a
        day that does not exist in that year).  Previously such input was
        printed and the function then failed with ``UnboundLocalError``.
    """
    try:
        return datetime.date(year=year, month=old_date.month, day=old_date.day)
    except (AttributeError, TypeError, ValueError) as exc:
        raise ValueError(
            "cannot map {!r} onto year {}".format(old_date, year)) from exc

# dummy_date: each transaction date moved into one common reference year, so
# that transactions can be matched against a campaign's calendar days
# regardless of the year they happened in.
customer_transaction_data_pure['dummy_date'] = \
    customer_transaction_data_pure['date'].apply(get_date_in_current_year)


In [None]:
# Compute, for every (campaign, customer) pair, the purchase summary for the
# calendar period matching the campaign, showing a progress bar meanwhile.
# NOTE: analyse_transactions_by_customer_during_similar_period_as_campaign is
# defined in a cell further down this notebook and has to be executed before
# this cell.
transactions_by_customer_during_similar_period_as_campaign_features = \
campaign_customer_combinations\
.progress_apply(lambda row: pd.Series(analyse_transactions_by_customer_during_similar_period_as_campaign(row),\
                                      index=['campaign_id_in_question', \
                                             'customer_id_in_question', \
                                             'no_of_transactions_by_customer_for_similar_period', \
                                             'percentage_of_transactions_by_customer_for_similar_period_with_any_discount', \
                                             'percentage_of_transactions_by_customer_for_similar_period_with_coupon_discount', \
                                             'transaction_amount_by_customer_for_similar_period', \
                                             'transaction_amount_by_customer_for_similar_period_where_customer_got_any_discount', \
                                             'transaction_amount_by_customer_for_similar_period_where_customer_got_coupon_discount', \
                                             'average_percent_discount_by_customer_for_similar_period']),\
                axis=1)
# Preview the frame computed in this cell (the original printed the
# till-campaign-start features here, a copy-paste slip).
print(transactions_by_customer_during_similar_period_as_campaign_features.head())

HBox(children=(IntProgress(value=0, max=6967), HTML(value='')))

In [285]:
def analyse_transactions_by_customer_during_similar_period_as_campaign(row, transactions=None):
    """Summarise one customer's purchases on the calendar days of a campaign.

    A transaction counts when its ``dummy_date`` (the transaction date moved
    into the reference year 2020) equals one of the campaign's days moved
    into that same year, i.e. when month and day match irrespective of the
    actual year.

    Parameters
    ----------
    row : object with ``customer_id``, ``campaign_id``, ``start_date`` and
        ``end_date``
        One (campaign, customer) combination, e.g. the row handed over by
        ``DataFrame.apply(..., axis=1)``.
    transactions : pandas.DataFrame, optional
        Transaction table with the columns ``customer_id``, ``dummy_date``
        (``datetime.date`` values in year 2020), ``coupon_discount``,
        ``other_discount`` and ``cost_price``.  When omitted, the
        notebook-level ``customer_transaction_data_pure`` is used, which is
        what the function always did before.

    Returns
    -------
    list
        In this order: campaign id, customer id, number of matching
        transactions, share of those with any discount, share with a coupon
        discount, summed ``cost_price`` of all of them, summed ``cost_price``
        of those with any discount, summed ``cost_price`` of those with a
        coupon discount, and the mean of
        ``(coupon_discount + other_discount) / cost_price``.
        The two shares and the mean are NaN when nothing matches.
    """
    if transactions is None:
        transactions = customer_transaction_data_pure

    customer_id_in_question = row.customer_id
    campaign_id_in_question = row.campaign_id

    # Every calendar day of the campaign (both ends included), projected onto
    # the reference year used by the dummy_date column.
    dates_in_question = [
        datetime.date(2020, day.month, day.day)
        for day in pd.date_range(start=row.start_date, end=row.end_date, freq='D')
    ]

    transactions_for_similar_period = transactions.loc[
        (transactions.customer_id == customer_id_in_question)
        & transactions.dummy_date.isin(dates_in_question)
    ]

    # A discount is recorded as a negative amount.
    has_coupon_discount = transactions_for_similar_period.coupon_discount < 0
    has_any_discount = has_coupon_discount \
        | (transactions_for_similar_period.other_discount < 0)

    # Both masks are built on (and applied to) the same frame; a coupon
    # discount implies "any discount", so this selects the same rows as
    # filtering the any-discount subset a second time.
    transactions_with_any_discount = transactions_for_similar_period.loc[has_any_discount]
    transactions_with_coupon_discount = transactions_for_similar_period.loc[has_coupon_discount]

    # Number of transaction metrics
    no_of_transactions = len(transactions_for_similar_period)
    if no_of_transactions:
        share_with_any_discount = len(transactions_with_any_discount) / no_of_transactions
        share_with_coupon_discount = len(transactions_with_coupon_discount) / no_of_transactions
    else:
        # Explicit "undefined" instead of relying on a 0/0 numpy warning.
        share_with_any_discount = float('nan')
        share_with_coupon_discount = float('nan')

    # Transaction amount metrics
    amount_total = transactions_for_similar_period.cost_price.sum()
    amount_with_any_discount = transactions_with_any_discount.cost_price.sum()
    amount_with_coupon_discount = transactions_with_coupon_discount.cost_price.sum()

    # Mean of the per-transaction discount relative to cost_price; the
    # discounts are <= 0, so this value is <= 0 as well.
    average_percent_discount = (
        (transactions_for_similar_period.coupon_discount
         + transactions_for_similar_period.other_discount)
        / transactions_for_similar_period.cost_price
    ).mean()

    return [campaign_id_in_question,
            customer_id_in_question,
            no_of_transactions,
            share_with_any_discount,
            share_with_coupon_discount,
            amount_total,
            amount_with_any_discount,
            amount_with_coupon_discount,
            average_percent_discount]
     
    
    

In [261]:
# Display the transaction table, now including the derived cost_price, rate
# and dummy_date columns.
customer_transaction_data_pure

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,cost_price,rate,dummy_date
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,45.95,45.95,2020-01-02
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,67.32,67.32,2020-01-02
2,2012-01-02,1501,31962,1,106.50,-14.25,0.0,120.75,120.75,2020-01-02
3,2012-01-02,1501,33647,1,67.32,0.00,0.0,67.32,67.32,2020-01-02
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,99.38,99.38,2020-01-02
...,...,...,...,...,...,...,...,...,...,...
1324561,2013-06-30,1129,2777,1,284.60,-71.24,0.0,355.84,355.84,2020-06-30
1324562,2013-06-30,1129,2953,4,42.74,-28.50,0.0,71.24,17.81,2020-06-30
1324563,2013-06-30,1129,2971,6,64.12,-42.74,0.0,106.86,17.81,2020-06-30
1324564,2013-06-30,1129,46984,1,95.82,0.00,0.0,95.82,95.82,2020-06-30


In [270]:
# Attach the campaign attributes to both the training and the test rows.
# merge() is called without `on`/`how`, so pandas performs an inner join on all
# column names the frames share -- presumably only campaign_id; TODO confirm.
train_and_campaign_data = train_data.merge(campaign_data_pure)
test_and_campaign_data = test_data.merge(campaign_data_pure)

In [271]:
# Distinct (campaign, customer) pairs with the campaign dates, separately for
# the training rows and the test rows.
campaign_customer_combinations_train = \
train_and_campaign_data[['campaign_id', 'customer_id', 'start_date', 'end_date']].drop_duplicates()

campaign_customer_combinations_test = \
test_and_campaign_data[['campaign_id', 'customer_id', 'start_date', 'end_date']].drop_duplicates()

In [279]:
# Stack the train and test pairs so the features are computed for both.
# NOTE(review): no de-duplication happens across the two frames, and concat
# keeps each frame's own index labels, so labels (and pairs) may repeat.
campaign_customer_combinations = pd.concat([campaign_customer_combinations_train, campaign_customer_combinations_test])

In [284]:
# Display the transaction table again (same frame as shown earlier, with the
# derived cost_price, rate and dummy_date columns).
customer_transaction_data_pure

Unnamed: 0,date,customer_id,item_id,quantity,selling_price,other_discount,coupon_discount,cost_price,rate,dummy_date
0,2012-01-02,1501,26830,1,35.26,-10.69,0.0,45.95,45.95,2020-01-02
1,2012-01-02,1501,54253,1,53.43,-13.89,0.0,67.32,67.32,2020-01-02
2,2012-01-02,1501,31962,1,106.50,-14.25,0.0,120.75,120.75,2020-01-02
3,2012-01-02,1501,33647,1,67.32,0.00,0.0,67.32,67.32,2020-01-02
4,2012-01-02,1501,48199,1,71.24,-28.14,0.0,99.38,99.38,2020-01-02
...,...,...,...,...,...,...,...,...,...,...
1324561,2013-06-30,1129,2777,1,284.60,-71.24,0.0,355.84,355.84,2020-06-30
1324562,2013-06-30,1129,2953,4,42.74,-28.50,0.0,71.24,17.81,2020-06-30
1324563,2013-06-30,1129,2971,6,64.12,-42.74,0.0,106.86,17.81,2020-06-30
1324564,2013-06-30,1129,46984,1,95.82,0.00,0.0,95.82,95.82,2020-06-30


In [None]:
# Persist both feature tables next to the notebook (row index not written).
# Both frames must have been computed by the progress_apply cells above.
transactions_by_customer_till_campaign_start_features.to_csv(path_or_buf="./transactions_by_customer_till_campaign_start_features.csv", index=False)

transactions_by_customer_during_similar_period_as_campaign_features.to_csv(path_or_buf="./transactions_by_customer_during_similar_period_as_campaign_features.csv", index=False)


