In [1]:
import pandas as pd

In [2]:
def create_dummy_columns(df, column_name):
    """One-hot encode a single column of a DataFrame.

    Args:
        df: Source DataFrame; it is not modified.
        column_name: Name of the column to encode. It is also used as the
            prefix of the generated indicator columns
            (``<column_name>_<value>``).

    Returns:
        A new DataFrame in which ``column_name`` has been replaced by one
        indicator column per distinct value.
    """
    columns_to_encode = [column_name]
    encoded = pd.get_dummies(data=df, prefix=column_name, columns=columns_to_encode)
    return encoded

## Profile

In [3]:
# Load the customer profiles (line-delimited JSON, one record per line) and preview them.
profile = pd.read_json(
    '../data/profile.json',
    lines=True,
    orient='records',
)
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [4]:
# One-hot encode `gender`: the column is replaced by gender_* indicator columns.
# Rows with a missing gender end up with every indicator set to False.
profile = create_dummy_columns(df=profile, column_name="gender")
profile.head()

Unnamed: 0,age,id,became_member_on,income,gender_F,gender_M,gender_O
0,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,,False,False,False
1,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0,True,False,False
2,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,,False,False,False
3,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0,True,False,False
4,118,a03223e636434f42ac4c3df47e8bac43,20170804,,False,False,False


In [5]:
# Parse the YYYYMMDD-encoded `became_member_on` values into proper datetimes,
# stored in a new column (the original column is kept).
# NOTE(review): the new column is spelled `become_...` while the source column is
# `became_...`; later cells rely on this exact name.
profile['become_member_on_date'] = pd.to_datetime(
    profile['became_member_on'],
    format='%Y%m%d',
)
profile.head()

Unnamed: 0,age,id,became_member_on,income,gender_F,gender_M,gender_O,become_member_on_date
0,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,,False,False,False,2017-02-12
1,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0,True,False,False,2017-07-15
2,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,,False,False,False,2018-07-12
3,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0,True,False,False,2017-05-09
4,118,a03223e636434f42ac4c3df47e8bac43,20170804,,False,False,False,2017-08-04


In [6]:
# Most recent signup date in the profile data; used as the reference point for
# membership-age calculations.
latest_date = profile.loc[:, 'become_member_on_date'].max()
print(latest_date)

2018-07-26 00:00:00


In [7]:
# Time elapsed between the most recent signup in the data (latest_date) and each
# member's own signup date. The result is a timedelta column (displayed as "N days"),
# not an integer day count.
profile['days_since_last_member'] = latest_date - profile['become_member_on_date']
profile.head()

Unnamed: 0,age,id,became_member_on,income,gender_F,gender_M,gender_O,become_member_on_date,days_since_last_member
0,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,,False,False,False,2017-02-12,529 days
1,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0,True,False,False,2017-07-15,376 days
2,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,,False,False,False,2018-07-12,14 days
3,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0,True,False,False,2017-05-09,443 days
4,118,a03223e636434f42ac4c3df47e8bac43,20170804,,False,False,False,2017-08-04,356 days


## Portfolio

In [8]:
# Load the offer portfolio (line-delimited JSON, one record per line) and preview it.
portfolio = pd.read_json(
    '../data/portfolio.json',
    lines=True,
    orient='records',
)
portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [9]:
# One-hot encode `offer_type`: the column is replaced by offer_type_* indicator columns.
portfolio = create_dummy_columns(df=portfolio, column_name="offer_type")
portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,id,offer_type_bogo,offer_type_discount,offer_type_informational
0,10,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,True,False,False
1,10,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,True,False,False
2,0,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,False,False,True
3,5,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,True,False,False
4,5,"[web, email]",20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,False,True,False


In [10]:
# One-hot encode the list-valued `channels` column:
#   1. explode() yields one row per (offer, channel) pair, keeping the offer's index;
#   2. get_dummies() turns each channel into a channel_* indicator;
#   3. groupby(level=0).max() collapses the pairs back to one row per offer,
#      so an indicator is set when the offer uses that channel at least once.
channels_dummies = (
    pd.get_dummies(portfolio['channels'].explode(), prefix='channel')
    .groupby(level=0)
    .max()
)

# Attach the indicators to the offers by index; the original `channels` column is kept.
portfolio = portfolio.join(channels_dummies)

portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,id,offer_type_bogo,offer_type_discount,offer_type_informational,channel_email,channel_mobile,channel_social,channel_web
0,10,"[email, mobile, social]",10,7,ae264e3637204a6fb9bb56bc8210ddfd,True,False,False,True,True,True,False
1,10,"[web, email, mobile, social]",10,5,4d5c57ea9a6940dd891ad53e9dbe8da0,True,False,False,True,True,True,True
2,0,"[web, email, mobile]",0,4,3f207df678b143eea3cee63160fa8bed,False,False,True,True,True,False,True
3,5,"[web, email, mobile]",5,7,9b98b8c7a33c4b65b9aebfe6a799e6d9,True,False,False,True,True,False,True
4,5,"[web, email]",20,10,0b1e1539f2cc45b7b9fa7c272da2e1d7,False,True,False,True,False,False,True


## Transcript

In [12]:
# Load the event transcript (line-delimited JSON, one record per line) and preview it.
transcript = pd.read_json(
    '../data/transcript.json',
    lines=True,
    orient='records',
)
transcript.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [13]:
# Flatten the nested `value` dict of a transcript row into separate fields.
def extract_value_fields(row):
    """Extract the offer id, amount and reward from a row's ``value`` payload.

    Args:
        row: A mapping-like row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a ``'value'`` entry.

    Returns:
        A ``pd.Series`` with the keys ``offer_id``, ``amount`` and ``reward``.
        A field is ``None`` when the payload does not contain it, and all three
        are ``None`` when ``row['value']`` is not a dict.
    """
    payload = row['value']
    if not isinstance(payload, dict):
        return pd.Series({'offer_id': None, 'amount': None, 'reward': None})

    # The offer id is looked up under 'offer id' first, then under 'offer_id'.
    # NOTE(review): presumably the records that carry a 'reward' use the
    # underscore spelling -- confirm against the raw data. Reading only
    # 'offer id' would silently lose the offer id for those records.
    offer_id = payload.get('offer id')
    if offer_id is None:
        offer_id = payload.get('offer_id')

    return pd.Series({
        'offer_id': offer_id,
        'amount': payload.get('amount'),
        'reward': payload.get('reward'),
    })

# Build the extracted columns by applying the function to every row.
value_columns = transcript.apply(extract_value_fields, axis=1)

# Drop any columns left over from a previous run of this cell before attaching the
# fresh ones; otherwise re-running the cell would append duplicate
# offer_id / amount / reward columns. On the first run the drop is a no-op.
transcript = pd.concat(
    [transcript.drop(columns=value_columns.columns, errors='ignore'), value_columns],
    axis=1,
)

# Display the first few rows of the transformed dataset
transcript.head()

Unnamed: 0,person,event,time,offer_id,amount,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,,


## User Behaviour

In [11]:
# Load the user-behaviour feature table from CSV and preview it.
user_behaviour = pd.read_csv(
    '../data/features/user_behaviour.csv',
)
user_behaviour.head()

Unnamed: 0,profile_id,event,time,portfolio_id,amount,reward_x,reward_y,channels,difficulty,duration,offer_type,gender,age,became_member_on,income
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,,5.0,"['web', 'email', 'mobile']",5.0,7.0,bogo,F,75,20170509,100000.0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,5.0,"['web', 'email']",20.0,10.0,discount,,118,20170804,
2,e2127556f4f64592b11af22de27a7932,offer received,0,2906b810c7d4411798c6938adc9daaa5,,,2.0,"['web', 'email', 'mobile']",10.0,7.0,discount,M,68,20180426,70000.0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,fafdcd668e3743c1bb461111dcafc2a4,,,2.0,"['web', 'email', 'mobile', 'social']",10.0,10.0,discount,,118,20170925,
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,4d5c57ea9a6940dd891ad53e9dbe8da0,,,10.0,"['web', 'email', 'mobile', 'social']",10.0,5.0,bogo,,118,20171002,
