In [435]:
# !pip install --upgrade pip --quiet
# !pip install -r requirements.txt --quiet

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
import numpy as np  
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score as roc_auc_score
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

import datetime
from dateutil.rrule import rrule, DAILY


## Load Data

In [436]:
def load_data():
#     current_file = os.path.abspath(os.path.dirname())
    current_file = ""

    csv_filename = os.path.join(current_file, './cleaned_data/campaign_data.csv')
    campaign_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/coupon_item_mapping_data.csv')
    coupon_item_mapping_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/customer_demographics_data.csv')
    customer_demographics_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/customer_transaction_data.csv')
    customer_transaction_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/item_data.csv')
    item_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/train_data.csv')
    train_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, '../test_data/test_QyjYwdj.csv')
    test_data = pd.read_csv(csv_filename)
    
    return campaign_data, coupon_item_mapping_data, customer_demographics_data, \
           customer_transaction_data, item_data, train_data, test_data

In [718]:
campaign_data, coupon_item_mapping_data, customer_demographics_data, \
customer_transaction_data, item_data, train_data, test_data = load_data()


In [438]:
!mkdir submissions

def save_submission(predictions_probability):
    predictions_probability = pd.DataFrame(predictions_probability)[1]
    submission_df = pd.concat([test_data['id'], pd.Series(predictions_probability, name="redemption_status")], axis=1)

    submission_file_name = "submissions/" + str(int(time.time())) + ".csv"

    submission_df.to_csv(submission_file_name, index=False)

mkdir: submissions: File exists


In [439]:
def print_evaluation_matrix(train_data, predictions, predictions_probability):
    print(confusion_matrix(train_data['redemption_status'], predictions))
    print(roc_auc_score(train_data['redemption_status'], pd.DataFrame(predictions_probability)[1]))


In [410]:
# Baseline model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver='liblinear')
classifier.fit(train_data.drop(columns=['id', 'redemption_status']), train_data['redemption_status'])

predictions_test = classifier.predict(test_data.drop(columns=['id']))
predictions_test_probability = classifier.predict_proba(test_data.drop(columns=['id']))

predictions_train = classifier.predict(train_data.drop(columns=['id', 'redemption_status']))
predictions_train_probability = classifier.predict_proba(train_data.drop(columns=['id', 'redemption_status']))

save_submission(predictions_test_probability)

In [411]:
print_evaluation_matrix(train_data, predictions_train, predictions_train_probability)

[[77640     0]
 [  729     0]]
0.5746672942333827


# Generic Functions

In [440]:

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]


In [714]:
def custom_one_hot_encoder(df, column_name):
    one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
    category_df = pd.DataFrame(df[column_name])
    encoded_categories = one_hot_encoder.fit_transform(category_df)
    
    encoded_df = pd.DataFrame(encoded_categories.toarray(), columns=one_hot_encoder.get_feature_names([column_name]))
   
    return pd.concat([df, encoded_df], axis=1)


## campaign_data feature extraction

In [442]:
def find_overlap_duration(start_date, end_date, intervals_df):
    dates_to_check_against = set(
        [dt for dt in rrule(DAILY, dtstart=start_date_in_question, until=end_date_in_question)])
    
    number_of_dates_originally = len(dates_to_check_against)
    
    for i in intervals_df.index:
        dates_to_check = set(
        [dt for dt in rrule(DAILY, dtstart=intervals_df.start_date[i], until=intervals_df.end_date[i])])
    
        dates_to_check_against = dates_to_check_against.difference(dates_to_check)
        
    
    return number_of_dates_originally - len(dates_to_check_against)

In [662]:
def analyse_campaign_runs(campaign_data, custom_parameter=False):

    campaign_data.start_date = pd.to_datetime(campaign_data.start_date)
    campaign_data.end_date = pd.to_datetime(campaign_data.end_date)


    same_type_overlapping_campaigns = []
    other_type_overlapping_campaigns = []
    same_type_overlap_duration = []
    other_type_overlap_duration = []
    duration_in_days = []
    months = []
    number_of_weekends = []

    for i in campaign_data.index: 
        campaign_type_in_question = campaign_data.campaign_type[i]
        start_date_in_question = campaign_data.start_date[i]
        end_date_in_question = campaign_data.end_date[i]

        cond_2 = (campaign_data.start_date <= start_date_in_question) & (campaign_data.end_date >= start_date_in_question) 
        cond_1 = (campaign_data.start_date >= start_date_in_question) & (campaign_data.start_date <= end_date_in_question) 
        cond_overlapping_campaigns = (cond_1) | (cond_2)

        cond_same_type = (campaign_data.campaign_type == campaign_type_in_question) 
        cond_other_type = (campaign_data.campaign_type != campaign_type_in_question) 

        cond_same_type = (cond_overlapping_campaigns) & (cond_same_type)
        cond_other_type = (cond_overlapping_campaigns) & (cond_other_type)

        # Same-type metrics
        same_type_campaigns = campaign_data.loc[cond_same_type]

        same_type_overlapping_campaigns_count = same_type_campaigns.shape[0] - 1
        same_type_overlapping_campaigns.append(same_type_overlapping_campaigns_count)

        same_type_overlap_duration_for_this_campaign = find_overlap_duration(\
            start_date_in_question, end_date_in_question, same_type_campaigns[['start_date', 'end_date']])
        same_type_overlap_duration.append(same_type_overlap_duration_for_this_campaign)


        # Other-type metrics
        other_type_campaigns = campaign_data.loc[cond_other_type]

        other_type_overlapping_campaigns_count = other_type_campaigns.shape[0]
        other_type_overlapping_campaigns.append(other_type_overlapping_campaigns_count)

        other_type_overlap_duration_for_this_campaign = find_overlap_duration(\
            start_date_in_question, end_date_in_question, other_type_campaigns[['start_date', 'end_date']])
        other_type_overlap_duration.append(other_type_overlap_duration_for_this_campaign)


        # Months Running
        dates_running = set(rrule(DAILY, dtstart=start_date_in_question, until=end_date_in_question))
        months_for_this_challenge = list(set([dt.month for dt in dates_running]))
        months.append(months_for_this_challenge)

        # Duration
        duration_in_days_for_this_challenge = len(dates_running)
        duration_in_days.append(duration_in_days_for_this_challenge)
        
        # Weekends
        number_of_weekends_for_this_challenge = sum([dt.isoweekday() >= 6 for dt in list(dates_running)])
        number_of_weekends.append(number_of_weekends_for_this_challenge)
        
    campaign_data['duration_in_days'] = pd.Series(duration_in_days, name='duration_in_days')
    campaign_data['months'] = pd.Series(months, name='months')
    campaign_data['same_type_overlapping_campaigns'] = pd.Series(same_type_overlapping_campaigns, name='same_type_overlapping_campaigns')
    campaign_data['other_type_overlapping_campaigns'] = pd.Series(other_type_overlapping_campaigns, name='other_type_overlapping_campaigns_count')
    campaign_data['same_type_overlap_duration'] = pd.Series(same_type_overlap_duration, name='same_type_overlap_duration')
    campaign_data['other_type_overlap_duration'] = pd.Series(other_type_overlap_duration, name='other_type_overlap_duration')
    campaign_data['number_of_weekends'] = pd.Series(number_of_weekends, name='number_of_weekends')

#     return np.c_[campaign_data]
    return campaign_data

In [444]:
def month_feature_binarizer(df):
    months_array = df['months']
    multi_label_binarizer = MultiLabelBinarizer()
    months_encoded = multi_label_binarizer.fit_transform(months_array)

    months_encoded_df = pd.DataFrame(months_encoded, columns=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    return pd.concat([df, months_encoded_df], axis=1)


### Pipeline - for campaign_data table feature creation

In [719]:
attributes_to_process = list(campaign_data.columns.values)

campaign_type_processor = Pipeline([
    ('selector', DataFrameSelector(attributes_to_process)),
    ('analyzer', FunctionTransformer(analyse_campaign_runs, validate=False, 
                                             kw_args={"custom_parameter": False})),
    ('month_labelizer', FunctionTransformer(month_feature_binarizer, validate=False)),
    ('one_hot_encoder', FunctionTransformer(custom_one_hot_encoder, validate=False, 
                                            kw_args={"column_name": 'campaign_type'}))
])

In [720]:
campaign_data = campaign_type_processor.fit_transform(campaign_data)

In [727]:
campaign_data.to_csv(path_or_buf="./campaign_data_with_original_and_new_features.csv", index=False)

In [721]:
campaign_data.columns

Index(['campaign_id', 'campaign_type', 'start_date', 'end_date',
       'duration_in_days', 'months', 'same_type_overlapping_campaigns',
       'other_type_overlapping_campaigns', 'same_type_overlap_duration',
       'other_type_overlap_duration', 'number_of_weekends', 'Jan', 'Feb',
       'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
       'campaign_type_X', 'campaign_type_Y'],
      dtype='object')

In [724]:
campaign_data_new_features = campaign_data.drop(columns=['start_date', 'end_date', 'campaign_type', 'months'])
campaign_data_new_features.columns


Index(['campaign_id', 'duration_in_days', 'same_type_overlapping_campaigns',
       'other_type_overlapping_campaigns', 'same_type_overlap_duration',
       'other_type_overlap_duration', 'number_of_weekends', 'Jan', 'Feb',
       'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
       'campaign_type_X', 'campaign_type_Y'],
      dtype='object')

In [690]:
campaign_data.columns

Index([                     'campaign_id',                    'campaign_type',
                             'start_date',                         'end_date',
                       'duration_in_days',                           'months',
        'same_type_overlapping_campaigns', 'other_type_overlapping_campaigns',
             'same_type_overlap_duration',      'other_type_overlap_duration',
                     'number_of_weekends',                              'Jan',
                                    'Feb',                              'Mar',
                                    'Apr',                              'May',
                                    'Jun',                              'Jul',
                                    'Aug',                              'Sep',
                                    'Oct',                              'Nov',
                                    'Dec',                             ('X',),
                                   ('Y',)],
      dt

In [None]:
campaign_data_filtered = campaign_data.drop(columns=['months', 'start_date', 'end_date', 'campaign_type'])
campaign_data_filtered = campaign_data_filtered.reindex()

In [550]:
def over_sample(df, column_name, multiplier=2):
    for i in range(multiplier):
        df = pd.concat([df, df.loc[df[column_name] == 1]])
    return df

In [539]:
# Baseline model - 2
from sklearn.linear_model import LogisticRegression

train_data_to_be_used = train_data.merge(campaign_data_filtered, on=['campaign_id'], how='left') \
.drop(columns=['campaign_id', 'customer_id', 'coupon_id'])
test_data_to_be_used = test_data.merge(campaign_data_filtered, on=['campaign_id'], how='left') \
.drop(columns=['campaign_id', 'customer_id', 'coupon_id'])


In [542]:
classifier = LogisticRegression(solver='liblinear')
classifier.fit(train_data_to_be_used.drop(columns=['id', 'redemption_status']), train_data['redemption_status'])

predictions_test = classifier.predict(test_data_to_be_used.drop(columns=['id']))
predictions_test_probability = classifier.predict_proba(test_data_to_be_used.drop(columns=['id']))

predictions_train = classifier.predict(train_data_to_be_used.drop(columns=['id', 'redemption_status']))
predictions_train_probability = classifier.predict_proba(train_data_to_be_used.drop(columns=['id', 'redemption_status']))

save_submission(predictions_test_probability)

In [547]:
coef = pd.Series(classifier.coef_[0])
columns = pd.Series(train_data_to_be_used.drop(columns=['id', 'redemption_status']).columns)
feature_importance = pd.DataFrame()
feature_importance['column'] = columns
feature_importance['coef'] = coef
feature_importance.sort_values(['coef'])

Unnamed: 0,column,coef
15,Nov,-0.938946
18,"(Y,)",-0.900358
7,Mar,-0.613077
9,May,-0.606725
12,Aug,-0.432949
1,same_type_overlapping_campaigns,-0.427315
17,"(X,)",-0.313662
2,other_type_overlapping_campaigns,-0.258188
13,Sep,-0.204838
8,Apr,-0.135344


In [548]:
print_evaluation_matrix(train_data, predictions_train, predictions_train_probability)

[[77640     0]
 [  729     0]]
0.6267155522057062


In [480]:
# abc = pd.concat([xyz, xyz.loc[xyz.redemption_status == 1], xyz.loc[xyz.redemption_status == 1], xyz.loc[xyz.redemption_status == 1]])
