In [2]:
import pandas as pd
from scipy import stats
import datetime
import rpy2
import seaborn as sns
import numpy as np
import xarray as xr
from kshape.core import kshape, zscore
from tslearn.clustering import KShape
import matplotlib.pyplot as plt

## 0. Preprocessing Alibaba data

In [58]:
# Load the raw Alibaba ECS series. Each row carries a timestamp string in
# the "ds" column formatted like "9/27/18 16:00".
df = pd.read_csv("ecs_series_1_uid_encoded(1).csv")

# Parse all timestamps in one vectorised call instead of a per-row strptime.
df['Date'] = pd.to_datetime(df["ds"], format="%m/%d/%y %H:%M")

# Drop the raw timestamp string (superseded by 'Date') and the *_factor
# columns, which the later pivot on 'vcpu_net_delead' does not use.
df = df.drop(
    columns=[
        'ds',
        'region_no_factor',
        'iz_no_factor',
        'product_factor',
        'instance_type_name_factor',
        'instance_type_family_factor',
    ]
)

In [59]:
# Collapse duplicate (uid, Date) rows by summing them, then reshape to wide
# form: one row per uid, one column per timestamp, cells = vcpu_net_delead.
df = (
    df.groupby(by=['uid', 'Date'])
    .sum()
    .reset_index()
    .pivot(index='uid', columns='Date', values='vcpu_net_delead')
)

## 0.1 Preprocessing CDNOW data

In [68]:
# CDNOW master file: whitespace-separated, no header row.
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(
    "CDNOW_master.txt",
    header=None,
    names=['id', 'Date', 'disks', 'price_purchase'],
    sep=r"\s+",
)

In [69]:
# Eyeball five randomly chosen raw purchase records.
df.sample(n=5)

Unnamed: 0,id,Date,disks,price_purchase
40415,13287,19970325,1,15.96
30799,9920,19970507,1,14.96
53124,17585,19980507,1,12.49
55813,18614,19980409,20,257.3
2094,616,19980604,1,14.99


In [70]:
# Purchase dates are stored as yyyymmdd numbers (e.g. 19970325); convert them
# to datetimes in one vectorised call instead of a per-row strptime.
df['Date'] = pd.to_datetime(df["Date"].astype(str), format="%Y%m%d")

# Only the amount spent is analysed below, so the disk count is dropped.
df = df.drop(columns=['disks'])

# Sum the spend per (customer, day), then reshape to wide form: one row per
# customer id, one column per day; days without a purchase are NaN.
df = df.groupby(by=['id', 'Date']).sum().reset_index()
df = df.pivot(index='id', columns='Date', values='price_purchase')
df.sample(2)

Date,1997-01-01 00:00:00,1997-01-02 00:00:00,1997-01-03 00:00:00,1997-01-04 00:00:00,1997-01-05 00:00:00,1997-01-06 00:00:00,1997-01-07 00:00:00,1997-01-08 00:00:00,1997-01-09 00:00:00,1997-01-10 00:00:00,...,1998-06-21 00:00:00,1998-06-22 00:00:00,1998-06-23 00:00:00,1998-06-24 00:00:00,1998-06-25 00:00:00,1998-06-26 00:00:00,1998-06-27 00:00:00,1998-06-28 00:00:00,1998-06-29 00:00:00,1998-06-30 00:00:00
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19522,,,,,,,,,,,...,,,,,,,,,,
953,,,,15.36,,,,,,,...,,,,,,,,,,
15594,,,,,,,,,,,...,,,,,,,,,,
17811,,,,,,,,,,,...,,,,,,,,,,
1447,,,,,,30.72,,,,,...,,,,,,,,,,
7442,,,,,,,,,,,...,,,,,,,,,,


## 0.9. Taking a random sample of 2000 users

In [72]:
# Work on a random subset of at most 2000 customers. The seed is fixed so that
# Restart & Run All reproduces the same subset, and the size is capped at the
# number of available rows so that smaller inputs do not raise.
df_test = df.sample(n=min(2000, len(df)), random_state=42)
df_test.to_csv('test.csv')

## 1. RFM-analysis

In [73]:
# Three independent, empty frames sharing the column layout of `df`;
# one each for the recency, monetary and frequency series.
df_recency, df_monetary, df_frequency = (
    pd.DataFrame(columns=df.columns) for _ in range(3)
)

In [74]:
"""
The dataframe should have such columns: id of consumer, other columns as a time series
"""
for index, row in df_test.iterrows():
    arr = []
    arr = list(row)
    rec_arr = []
    mon_arr = []
    freq_arr = []
    start_date = row.index[0]
    last_purchase = np.nan
    first_purchase = 0 #flag
    total_purchases = 0
    #RECENCY Customer did a explicit activity, such as renting cores or releasing cores.
    for j, entry in enumerate(arr):
        if (np.isnan(entry)):            
            if (first_purchase==0):
                rec_arr.append(row.index[j]-start_date)
            else:
                rec_arr.append(row.index[j]-last_purchase)
        else:
            #not nan
            total_purchases+=1
            first_purchase = 1
            rec_arr.append(0)
            last_purchase = row.index[j]
        #FREQUENCY it can be a cumulative average, the number of events divided by the number of days that have passed
        freq_arr.append(total_purchases/(j+1))
    df_recency.loc[index] = rec_arr
    df_frequency.loc[index]=freq_arr
    #MONETARY  Moving average monetary value
    current_servers = 0
    total_days_of_purchases = 0 
    for j,entry in enumerate(arr):
        if (np.isnan(entry)):
            total_days_of_purchases+=current_servers
        else:
            current_servers+=entry
            total_days_of_purchases+=current_servers
        mon_arr.append(total_days_of_purchases/(j+1))
    df_monetary.loc[index] = mon_arr
df_recency=df_recency.applymap( lambda x:0 if (type(x)==int) else x.days)

## 2. Clustering

In [75]:
def _series_matrix(frame):
    """Return the time-series columns of ``frame`` as a float array.

    A 'label' column left over from a previous run of this cell is excluded,
    so cluster assignments never leak into the features on re-execution.
    """
    is_series_column = [
        not (isinstance(column, str) and column == 'label')
        for column in frame.columns
    ]
    return frame.loc[:, is_series_column].to_numpy(dtype=float)


# One 4-cluster k-Shape model per RFM dimension. fit_predict returns the
# cluster index of every row; plain fit() returns the estimator itself, which
# would fill 'label' with the model object. random_state makes the
# initialisation reproducible across runs.
ksh_monetary = KShape(n_clusters=4, random_state=42)
df_monetary['label'] = ksh_monetary.fit_predict(_series_matrix(df_monetary))

ksh_frequency = KShape(n_clusters=4, random_state=42)
df_frequency['label'] = ksh_frequency.fit_predict(_series_matrix(df_frequency))

ksh_recency = KShape(n_clusters=4, random_state=42)
df_recency['label'] = ksh_recency.fit_predict(_series_matrix(df_recency))

Resumed because of empty cluster
Resumed because of empty cluster
Resumed because of empty cluster
Resumed because of empty cluster
Resumed because of empty cluster
Resumed because of empty cluster
0.276 --> 0.264 --> 0.262 --> 0.257 --> 0.256 --> 0.256 --> 0.256 --> 0.256 --> 0.256 --> 0.256 --> 0.256 --> 
0.181 --> 0.171 --> 0.168 --> 0.169 --> 
0.196 --> 0.195 --> 0.193 --> 0.191 --> 0.190 --> 0.190 --> 


In [76]:
# Store each fitted model's per-row cluster assignment in the 'label' column
# of the matching RFM frame.
for rfm_frame, fitted_model in (
    (df_monetary, ksh_monetary),
    (df_frequency, ksh_frequency),
    (df_recency, ksh_recency),
):
    rfm_frame['label'] = fitted_model.labels_

In [None]:
# One row per (metric, cluster) centroid, laid out over the same period
# columns as the RFM frames (without their 'label' column).
period_columns = df_recency.drop(columns = 'label').columns
df_centroids = pd.DataFrame(columns = period_columns)

# Row-name prefix and fitted model for each metric, in insertion order.
centroid_sources = (
    ('Centroid_for_recency_', ksh_recency),
    ('Centroid_for_frequency_', ksh_frequency),
    ('Centroid_for_monetary_', ksh_monetary),
)
cluster_count = ksh_recency.cluster_centers_.shape[0]
for cluster_idx in range(cluster_count):
    for row_prefix, fitted_model in centroid_sources:
        # ravel() flattens the centroid array for this cluster into one row.
        centroid_row = fitted_model.cluster_centers_[cluster_idx].ravel()
        df_centroids.loc[row_prefix + str(cluster_idx)] = centroid_row

In [159]:
# Write the labelled RFM frames and the centroid table to CSV files.
for result_frame, csv_name in (
    (df_monetary, "monetary_CDNOW.csv"),
    (df_frequency, "frequency_CDNOW.csv"),
    (df_recency, "recency_CDNOW.csv"),
    (df_centroids, "cluster_centroids_CDNOW.csv"),
):
    result_frame.to_csv(csv_name)