In [59]:
# !pip install --upgrade pip --quiet
# !pip install -r requirements.txt --quiet

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score as roc_auc_score


In [60]:
def load_data():
#     current_file = os.path.abspath(os.path.dirname())
    current_file = ""

    csv_filename = os.path.join(current_file, './cleaned_data/campaign_data.csv')
    campaign_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/coupon_item_mapping_data.csv')
    coupon_item_mapping_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/customer_demographics_data.csv')
    customer_demographics_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/customer_transaction_data.csv')
    customer_transaction_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/item_data.csv')
    item_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, './cleaned_data/train_data.csv')
    train_data = pd.read_csv(csv_filename)

    csv_filename = os.path.join(current_file, '../test_data/test_QyjYwdj.csv')
    test_data = pd.read_csv(csv_filename)
    
    return campaign_data, coupon_item_mapping_data, customer_demographics_data, \
           customer_transaction_data, item_data, train_data, test_data

In [61]:
campaign_data, coupon_item_mapping_data, customer_demographics_data, \
customer_transaction_data, item_data, train_data, test_data = load_data()


In [62]:
!mkdir submissions

def save_submission(predictions_probability):
    predictions_probability = pd.DataFrame(predictions_probability)[1]
    submission_df = pd.concat([test_data['id'], pd.Series(predictions_probability, name="redemption_status")], axis=1)

    submission_file_name = "submissions/" + str(int(time.time())) + ".csv"

    submission_df.to_csv(submission_file_name, index=False)

mkdir: submissions: File exists


In [63]:
def print_evaluation_matrix(train_data, predictions, predictions_probability):
    print(confusion_matrix(train_data['redemption_status'], predictions))
    print(roc_auc_score(train_data['redemption_status'], pd.DataFrame(predictions_probability)[1]))


In [64]:
# Baseline model
from sklearn.linear_model import LogisticRegression

classifier = LogisticRegression(solver='liblinear')
classifier.fit(train_data.drop(columns=['id', 'redemption_status']), train_data['redemption_status'])

predictions_test = classifier.predict(test_data.drop(columns=['id']))
predictions_test_probability = classifier.predict_proba(test_data.drop(columns=['id']))

predictions_train = classifier.predict(train_data.drop(columns=['id', 'redemption_status']))
predictions_train_probability = classifier.predict_proba(train_data.drop(columns=['id', 'redemption_status']))

save_submission(predictions_test_probability)

In [65]:
print_evaluation_matrix(train_data, predictions_train, predictions_train_probability)

[[77640     0]
 [  729     0]]
0.5746672942333827


In [89]:
campaign_data.sort_values(['start_date', 'end_date'])

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,duration_in_days
27,26,X,2012-08-12,2012-09-21,40 days
26,27,Y,2012-08-25,2012-10-27,63 days
25,28,Y,2012-09-16,2012-11-16,61 days
24,29,Y,2012-10-08,2012-11-30,53 days
23,30,X,2012-11-19,2013-01-04,46 days
21,1,Y,2012-12-12,2013-01-18,37 days
22,2,Y,2012-12-17,2013-01-18,32 days
18,3,Y,2012-12-22,2013-02-16,56 days
20,4,Y,2013-01-07,2013-02-08,32 days
19,5,Y,2013-01-12,2013-02-15,34 days


In [85]:
campaign_data.start_date = pd.to_datetime(campaign_data.start_date)
campaign_data.end_date = pd.to_datetime(campaign_data.end_date)

campaign_data['duration_in_days'] = campaign_data.end_date - campaign_data.start_date

In [94]:
overlapping_campaigns = []

for i in campaign_data.index: 
    start_date_in_question = campaign_data.start_date[i]
    end_date_in_question = campaign_data.end_date[i]
    
    cond_1 = (campaign_data.start_date < end_date_in_question) & (campaign_data.end_date > end_date_in_question) 
    cond_2 = (campaign_data.start_date < start_date_in_question) & (campaign_data.end_date > start_date_in_question) 
    cond = (cond_1) | (cond_2)

    overlapping_campaigns_count = campaign_data.loc[cond].shape[0]
    overlapping_campaigns.append(overlapping_campaigns_count)
    
campaign_data['overlapping_campaigns'] = pd.Series(overlapping_campaigns, name='overlapping_campaigns')

In [95]:
campaign_data

Unnamed: 0,campaign_id,campaign_type,start_date,end_date,duration_in_days,overlapping_campaigns
0,24,Y,2013-10-21,2013-12-20,60 days,2
1,25,Y,2013-10-21,2013-11-22,32 days,3
2,20,Y,2013-09-07,2013-11-16,70 days,4
3,23,Y,2013-10-08,2013-11-15,38 days,5
4,21,Y,2013-09-16,2013-10-18,32 days,4
5,22,X,2013-09-16,2013-10-18,32 days,4
6,18,X,2013-08-10,2013-10-04,55 days,5
7,19,Y,2013-08-26,2013-09-27,32 days,5
8,17,Y,2013-07-29,2013-08-30,32 days,3
9,16,Y,2013-07-15,2013-08-16,32 days,2
