## Modules

In [1]:
import pandas as pd
import numpy as np
import warnings
import datetime
warnings.filterwarnings("ignore")

## Fonctions

In [2]:
def calculate_age(start):
    
    """ Calcul de l'age du client grace à l'information sur la date de naissance """
    
    start = datetime.datetime.strptime(start, "%d-%m-%Y").date()
    today = datetime.date.today()
    return today.year - start.year - ((today.month, today.day) < (start.month, start.day))

In [3]:
def calculate_life(start):
    
    """Calcul de nombre de mois de survie de chaque clients  """

    REF = datetime.datetime.strptime("28-02-2014", "%d-%m-%Y").date()
    return (REF.year - start.year)*12 - (REF.month < start.month)

## Load Data

In [4]:
sales_df = pd.read_pickle('intermed/sales')
sales_df['tran_date'] = pd.to_datetime(sales_df["tran_date"].str.replace('/','-'), format = '%d-%m-%Y')

# supprimer des lignes avec des prix negatifs
sales_df= sales_df[sales_df ['Qty'] >= 0]
sales_df = sales_df[sales_df['total_amt'] > 0]

sales_df.head()

Unnamed: 0,customer_Id,DOB,Gender,city_code,prod_cat_code,prod_cat,prod_sub_cat_code,prod_subcat,transaction_id,cust_id,tran_date,prod_subcat_code,Qty,Rate,Tax,total_amt,Store_type
0,268408,02-01-1970,M,4.0,1,Clothing,1,Women,63314547725,268408,2012-04-26,1,1,806,84.63,890.63,Flagship store
1,268408,02-01-1970,M,4.0,2,Footwear,1,Mens,56844530655,268408,2012-10-14,1,5,1175,616.875,6491.875,MBR
2,268408,02-01-1970,M,4.0,2,Footwear,4,Kids,64633435931,268408,2013-05-06,4,3,312,98.28,1034.28,Flagship store
3,268408,02-01-1970,M,4.0,3,Electronics,9,Cameras,56902862040,268408,2013-01-30,9,5,868,455.7,4795.7,e-Shop
4,268408,02-01-1970,M,4.0,3,Electronics,10,Audio and video,19516063887,268408,2012-09-07,10,4,650,273.0,2873.0,e-Shop


In [5]:
rfm = pd.read_pickle('intermed/rfm.sav')
rfm

Unnamed: 0,customer_Id,frequency,recency,T,monetary_value,R,F,M,RFM_Segment,Score,RFM_Score,Segment
0,266783,3.0,516.0,881.0,1037.963333,2,2,1,221,Green,5,Hibernating
1,266784,2.0,103.0,546.0,707.200000,1,2,1,121,Green,4,Hibernating
2,266785,6.0,870.0,1073.0,3588.487500,4,4,4,444,Platinum,12,Champions
3,266788,3.0,611.0,984.0,1574.993333,2,2,2,222,Bronze,6,Hibernating
4,266794,9.0,1062.0,1070.0,2839.113333,4,4,3,443,Platinum,11,Champions
...,...,...,...,...,...,...,...,...,...,...,...,...
5501,275257,3.0,637.0,808.0,4097.340000,3,2,4,324,Silver,9,Potential loyalists
5502,275261,2.0,740.0,878.0,1649.765000,3,2,2,322,Bronze,7,Potential loyalists
5503,275262,1.0,200.0,922.0,3328.260000,1,1,4,114,Bronze,6,Hibernating
5504,275264,1.0,64.0,930.0,2594.540000,1,1,3,113,Green,5,Hibernating


## Featues engineeing

### Dummify prod cat

In [6]:
sales_df.groupby('customer_Id').\
    agg({
        "total_amt": "mean",
        "prod_cat": lambda x: ','.join(set(x))
    })

Unnamed: 0_level_0,total_amt,prod_cat
customer_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
266783,2238.177500,"Footwear,Books,Clothing"
266784,1898.021667,"Books,Electronics"
266785,4063.242857,"Books,Home and kitchen,Bags,Footwear"
266788,1523.242500,"Books,Bags,Footwear"
266794,2556.166364,"Footwear,Electronics,Books,Bags,Clothing"
...,...,...
275257,3431.301250,"Books,Home and kitchen,Electronics,Footwear"
275261,1235.021667,"Books,Bags"
275262,2539.290000,"Electronics,Clothing"
275264,1907.782500,"Books,Home and kitchen"


In [7]:
DF=sales_df.groupby('customer_Id').\
    agg({
        "total_amt" : "mean",
        "prod_cat": lambda x: ','.join(set(x))
    })

In [8]:
DF

Unnamed: 0_level_0,total_amt,prod_cat
customer_Id,Unnamed: 1_level_1,Unnamed: 2_level_1
266783,2238.177500,"Footwear,Books,Clothing"
266784,1898.021667,"Books,Electronics"
266785,4063.242857,"Books,Home and kitchen,Bags,Footwear"
266788,1523.242500,"Books,Bags,Footwear"
266794,2556.166364,"Footwear,Electronics,Books,Bags,Clothing"
...,...,...
275257,3431.301250,"Books,Home and kitchen,Electronics,Footwear"
275261,1235.021667,"Books,Bags"
275262,2539.290000,"Electronics,Clothing"
275264,1907.782500,"Books,Home and kitchen"


In [9]:
# Hhere tran_date  is the  date of last transaction

colonnes_max=['DOB','Gender', 'tran_date', 'category_Bags', 'category_Books', 'category_Clothing', 'category_Electronics', 'category_Footwear', 'category_Home and kitchen']
# aggregation with max. Foe example :tran_date  is the  date of last transaction
sales_full=sales_df.join(pd.get_dummies(sales_df.prod_cat, prefix='category')).\
    groupby('customer_Id').\
    max()[colonnes_max].\
    reset_index().\
    merge(rfm, on='customer_Id')

In [10]:
sales_full['age'] = sales_full['DOB'].apply(calculate_age)    
sales_full['life'] = sales_full['tran_date'].apply(calculate_life)

### Déduire la variable attrition

Considerer comme un cas d'attrition y=1

+  tous les clients n'ont fait qu'une seule commande  et dont leur dernière achat est inférieur à  juin 2013-06 
+  tous les clients ayant effectuer au moins deux achats mais qui n'ont pas achété depuis une année

In [11]:
#conditions d'attritions
condition1 = (sales_full.frequency < 1) &  ( sales_full.tran_date <= '30-06-2013')  
condition2 = (sales_full.frequency > 0) &  ( sales_full.tran_date <= '01-02-2013')
#création de la variable attrition
sales_full['attrition'] = np.where(condition1 | condition2  , 1 , 0)
sales_full['attrition'].value_counts()

0    4144
1    1362
Name: attrition, dtype: int64

## Export Data

In [12]:
sales_full

Unnamed: 0,customer_Id,DOB,Gender,tran_date,category_Bags,category_Books,category_Clothing,category_Electronics,category_Footwear,category_Home and kitchen,...,R,F,M,RFM_Segment,Score,RFM_Score,Segment,age,life,attrition
0,266783,01-05-1974,M,2013-02-20,0,1,1,0,1,0,...,2,2,1,221,Green,5,Hibernating,47,12,0
1,266784,13-12-1991,F,2012-12-04,0,1,0,1,0,0,...,1,2,1,121,Green,4,Hibernating,30,23,1
2,266785,29-06-1985,F,2013-08-01,1,1,0,0,1,1,...,4,4,4,444,Platinum,12,Champions,36,11,0
3,266788,20-03-1972,F,2013-02-12,1,1,0,0,1,0,...,2,2,2,222,Bronze,6,Hibernating,49,12,0
4,266794,28-02-1971,F,2014-02-12,1,1,1,1,1,0,...,4,4,3,443,Platinum,11,Champions,50,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5501,275257,25-10-1987,M,2013-09-02,0,1,0,1,1,1,...,3,2,4,324,Silver,9,Potential loyalists,34,11,0
5502,275261,21-03-1973,F,2013-10-05,1,1,0,0,0,0,...,3,2,2,322,Bronze,7,Potential loyalists,48,11,0
5503,275262,06-06-1973,M,2012-02-29,0,0,1,1,0,0,...,1,1,4,114,Bronze,6,Hibernating,48,24,1
5504,275264,24-10-1991,M,2011-10-08,0,1,0,0,0,1,...,1,1,3,113,Green,5,Hibernating,30,35,1


sales_full.to_pickle('intermed/table_fo_lifetime.sav')