## Importing all the Data Files

In [2]:
import pandas as pd
import numpy as np

### User Transactions Data 

In [3]:
transactions = pd.read_csv('Dataset/transactions_v2.csv')

In [4]:
transactions.head()

Unnamed: 0,msno,payment_method_id,payment_plan_days,plan_list_price,actual_amount_paid,is_auto_renew,transaction_date,membership_expire_date,is_cancel
0,++6eU4LsQ3UQ20ILS7d99XK8WbiVgbyYL4FUgzZR134=,32,90,298,298,0,20170131,20170504,0
1,++lvGPJOinuin/8esghpnqdljm6NXS8m8Zwchc7gOeA=,41,30,149,149,1,20150809,20190412,0
2,+/GXNtXWQVfKrEDqYAzcSw2xSPYMKWNj22m+5XkVQZc=,36,30,180,180,1,20170303,20170422,0
3,+/w1UrZwyka4C9oNH3+Q8fUf3fD8R3EwWrx57ODIsqk=,36,30,180,180,1,20170329,20170331,1
4,+00PGzKTYqtnb65mPKPyeHXcZEwqiEzktpQksaaSC3c=,41,30,99,99,1,20170323,20170423,0


In [5]:
transactions.shape

(1431009, 9)

In [6]:
transactions['msno'].unique().shape

(1197050,)

### User Churn Data 

In [7]:
train = pd.read_csv('Dataset/train_v2.csv')

In [8]:
train.shape

(970960, 2)

In [9]:
train.head()

Unnamed: 0,msno,is_churn
0,ugx0CjOMzazClkFzU2xasmDZaoIqOUAZPsH1q0teWCg=,1
1,f/NmvEzHfhINFEYZTR05prUdr+E+3+oewvweYz9cCQE=,1
2,zLo9f73nGGT1p21ltZC3ChiRnAVvgibMyazbCxvWPcg=,1
3,8iF/+8HY8lJKFrTc7iR9ZYGCG2Ecrogbc2Vy5YhsfhQ=,1
4,K6fja4+jmoZ5xG6BypqX80Uw/XKpMgrEMdG2edFOxnA=,1


In [10]:
train['is_churn'].value_counts()

0    883630
1     87330
Name: is_churn, dtype: int64

In [11]:
from  sklearn.model_selection  import  train_test_split 
X = train.drop('is_churn',axis=1)
y= train['is_churn']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0,stratify=y)

In [13]:
y_test.value_counts()

0    88363
1     8733
Name: is_churn, dtype: int64

In [14]:
train = X_test

In [15]:
train['is_churn']=y_test

In [16]:
train

Unnamed: 0,msno,is_churn
659299,PR5OUJD4mPxUtFbxFafhbsfEFtx3yyLJvILMFdAQBDY=,0
159297,zcWIOjtZewyhPBiEBH/Y14cJHf6Z0pVtHmNxE5RdTWg=,0
116089,OH1thT0rHD9tuOTZpjTf+/V9u+PTLLByBZgLlDKTuxI=,0
79017,ZI0vEMRlsRYpIAfpN5hxfQ5iI7ildveQ/YE0LeXHWOs=,0
342223,UJ7jSk2WwdGroS+c9BjalClWG8eoi3QluastOanZLCQ=,0
...,...,...
16015,pJGX4+E0QBsEjd7F4saUgIo4AHEyubK4ioyjNehzVBA=,1
160,04Mn5oun4VJEKGLCfYpCThMsAK7rgr6vqdaVHjWIc3o=,1
486427,efyRqPeug9RsS3D7KnUpHLj5Fb2XZppycF1PEn0t2I0=,0
251314,94ML690IJssyphxrJx9aoSt8l+jR7vmutbWzMqk04SY=,0


In [17]:
train.shape

(97096, 2)

In [18]:
train['is_churn'].value_counts()

0    88363
1     8733
Name: is_churn, dtype: int64

### User Logs Data

In [19]:
logs = pd.read_csv('Dataset/user_logs_v2.csv')

In [20]:
logs.shape

(18396362, 9)

### Merging User Churn and User Transactions Data 

In [21]:
from sklearn.preprocessing import LabelEncoder #encoder package of sklearn
le = LabelEncoder() #le variable has been assigned a labelencoder function

transactions['msno'] = le.fit_transform(transactions['msno'].astype(str))
train['msno'] = le.fit_transform(train['msno'].astype(str))
logs['msno'] = le.fit_transform(logs['msno'].astype(str))

In [22]:
trial_join = pd.merge(train, transactions, on='msno', how='inner')

In [23]:
trial_join.shape

(115391, 10)

In [24]:
len(trial_join['msno'].unique())

97096

trial_join.to_csv('transformed_user_summary_ltv.csv',index=False)

### Merging User Churn and User Logs Data

In [25]:
logs_join = pd.merge(train, logs, on='msno', how='inner')

In [26]:
logs_join

Unnamed: 0,msno,is_churn,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,41633,0,20170319,0,0,0,0,105,32,26355.191
1,41633,0,20170314,22,22,5,5,9,55,5849.439
2,41633,0,20170320,13,1,0,5,68,44,18241.627
3,41633,0,20170326,17,1,0,3,16,23,4947.604
4,41633,0,20170328,8,1,0,1,23,18,6405.278
...,...,...,...,...,...,...,...,...,...,...
1620180,16742,0,20170313,1,0,0,0,21,20,7293.067
1620181,16742,0,20170327,2,0,1,1,4,7,1172.677
1620182,16742,0,20170308,2,0,1,2,20,23,9140.741
1620183,16742,0,20170312,7,1,2,1,15,19,6975.761


In [27]:
len(logs_join['msno'].unique())

97096

## Creating a new Dataframe with Summarized Information of each User 

In [28]:
df = trial_join

In [30]:
from datetime import datetime, date

In [31]:
df['transaction_date'] = df['transaction_date'].astype(str)        

In [32]:
df['day'] = df['transaction_date'].str.slice(6,8,1)
df['month'] = df['transaction_date'].str.slice(4,6,1)
df['year'] = df['transaction_date'].str.slice(0,4,1)
df['trans_date'] = pd.to_datetime(df[['day', 'month', 'year']])

In [34]:
df=df.drop(['day','year','month'],axis=1)

In [43]:
import datetime as dft
df_6m = df[(df.trans_date >= dft.datetime(2017,1,1)) & (df.trans_date < dft.datetime(2017,3,31))].reset_index(drop=True)

In [44]:
df_6m.shape

(84076, 11)

In [45]:
df_3m = df[(df.trans_date >= dft.datetime(2016,7,1)) & (df.trans_date < dft.datetime(2016,12,31))].reset_index(drop=True)

In [46]:
df_3m.shape

(6791, 11)

### Creating Monetary Column 

In [None]:
df_6m

In [32]:
just_revenue = df.groupby("msno")["actual_amount_paid"].sum()

In [33]:
just_revenue.unique()

array([1599,   99,  298,  149, 1788,  894,  129,  180,  100,  198, 2988,
        150,  536, 2400, 1380,  258,  480,  398,  477, 5511, 3326,  699,
       1200, 1937, 1299,  119,  360,    0,  447,  685, 2831,  930,  300,
        397, 2235,  495,  745, 1043, 3930,  799,  540,  387,  199, 3138,
        600, 2931,  645, 2384,  596,  297, 1860, 1490,  929, 1399, 1485,
        248,  594, 2549, 1072,  329,  834, 1693, 2499, 1639, 1341,  278,
        725, 4188,  358, 3576,  229, 1000,  500, 3078, 1192,  134, 1985,
       1984,  720, 4895,  585,  427,  200, 2595,  299, 1887, 5165, 1470,
       2859, 2265,  249, 2493, 1023, 2598,  228, 1487, 2334, 2631, 2036,
        629,  450,  954,  279, 6057,  396,  848,  665, 1500,  595, 1112,
       2999,  847, 1260,  357, 2682,  327, 3951, 1142,  874,  478,  378,
       2980, 4865, 3129,  695,  546,  792, 3387,  347, 2323, 1638, 1254,
        844, 1908, 1619,  496, 1431, 4141,  993,  547, 2847,  120,  517,
       5243, 2000, 1448, 2484, 1897, 2086, 2088,  6

In [34]:
just_revenue

msno
0        1599
1          99
2         298
3         149
4         149
         ... 
97091     149
97092      99
97093      99
97094     298
97095     149
Name: actual_amount_paid, Length: 97096, dtype: int64

### Creating Frequency Column 

In [35]:
just_count_subs = df.groupby("msno")["payment_plan_days"].sum()

In [36]:
just_count_subs

msno
0        395
1         30
2         60
3         30
4         30
        ... 
97091     30
97092     30
97093     30
97094     60
97095     30
Name: payment_plan_days, Length: 97096, dtype: int64

### Creating Recency Column 

In [40]:
import datetime as dft
a = dft.datetime(2017,4,1)

In [41]:
df['Days_Diff'] = ( a - df['trans_date']).dt.days

In [42]:
df['trans_date'].max()

Timestamp('2017-03-31 00:00:00')

In [43]:
df['trans_date'].min()

Timestamp('2015-01-01 00:00:00')

### Creating a new User RFM DataFrame and adding 3 columns - Recency, Frequency and Monetary 

In [44]:
just_recency = df.groupby("msno")["Days_Diff"].min()

In [45]:
user_rfm_1 = pd.merge(just_revenue, just_recency, on='msno', how='inner')

In [46]:
user_rfm = pd.merge(user_rfm_1, just_count_subs, on='msno', how='inner')

In [47]:
user_rfm.rename(columns = {'actual_amount_paid':'Monetary', 'Days_Diff':'Recency','payment_plan_days':'Frequency'}, inplace = True)

In [48]:
user_rfm

Unnamed: 0_level_0,Monetary,Recency,Frequency
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1599,160,395
1,99,17,30
2,298,1,60
3,149,6,30
4,149,17,30
...,...,...,...
97091,149,1,30
97092,99,28,30
97093,99,22,30
97094,298,1,60


### Creating is_autoRenew column 

In [49]:
auto_renew = df.groupby("msno")["is_auto_renew"].max()

In [50]:
auto_renew.value_counts()

1    75030
0    22066
Name: is_auto_renew, dtype: int64

In [51]:
user_summary = pd.merge(user_rfm, auto_renew, on='msno', how='inner')

In [52]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1599,160,395,0
1,99,17,30,1
2,298,1,60,1
3,149,6,30,1
4,149,17,30,1
...,...,...,...,...
97091,149,1,30,1
97092,99,28,30,1
97093,99,22,30,1
97094,298,1,60,1


### Creating is_discount column

In [53]:
df['discount']=df['plan_list_price']-df['actual_amount_paid']

In [54]:
is_dis = df.groupby("msno")["discount"].sum()

In [55]:
user_summary = pd.merge(user_summary, is_dis, on='msno', how='inner')

In [56]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,discount
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1599,160,395,0,0
1,99,17,30,1,0
2,298,1,60,1,0
3,149,6,30,1,0
4,149,17,30,1,0
...,...,...,...,...,...
97091,149,1,30,1,0
97092,99,28,30,1,0
97093,99,22,30,1,0
97094,298,1,60,1,0


In [57]:
user_summary['is_discount'] = 0
user_summary.loc[user_summary['discount']>0,'is_discount'] = 1

In [58]:
user_summary.drop(['discount'], axis=1)

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1599,160,395,0,0
1,99,17,30,1,0
2,298,1,60,1,0
3,149,6,30,1,0
4,149,17,30,1,0
...,...,...,...,...,...
97091,149,1,30,1,0
97092,99,28,30,1,0
97093,99,22,30,1,0
97094,298,1,60,1,0


In [59]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,discount,is_discount
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1599,160,395,0,0,0
1,99,17,30,1,0,0
2,298,1,60,1,0,0
3,149,6,30,1,0,0
4,149,17,30,1,0,0
...,...,...,...,...,...,...
97091,149,1,30,1,0,0
97092,99,28,30,1,0,0
97093,99,22,30,1,0,0
97094,298,1,60,1,0,0


### Creating Preffered Plan Column 

In [60]:
best_plan = df.groupby("msno")["payment_plan_days"].agg(lambda x:x.value_counts().index[0])

In [61]:
best_plan.value_counts()

30     82032
410     6457
195     2181
180     1782
90      1306
395      812
7        360
100      344
415      282
365      272
200      238
360      209
120      186
240      139
60       136
450      135
400      126
270       60
1         25
10         6
14         4
80         2
0          1
45         1
Name: payment_plan_days, dtype: int64

In [62]:
user_summary = pd.merge(user_summary, best_plan, on='msno', how='inner')

In [63]:
user_summary.rename(columns={'payment_plan_days':'BestPlan'}, inplace = True)

In [64]:
user_summary = user_summary.drop(['discount'], axis=1)

In [65]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1599,160,395,0,0,395
1,99,17,30,1,0,30
2,298,1,60,1,0,30
3,149,6,30,1,0,30
4,149,17,30,1,0,30
...,...,...,...,...,...,...
97091,149,1,30,1,0,30
97092,99,28,30,1,0,30
97093,99,22,30,1,0,30
97094,298,1,60,1,0,30


### Creating Total Active Days Per Month Column

In [66]:
df2 = logs_join

In [67]:
total_active_days_per_month = df2.groupby("msno")["total_secs"].count()

In [68]:
total_active_days_per_month.value_counts()

1     7656
31    7422
30    5084
2     4263
29    4220
28    3865
27    3540
3     3505
4     3356
26    3287
25    3055
24    2896
23    2763
5     2697
22    2681
21    2578
19    2544
20    2535
18    2367
17    2329
15    2274
9     2270
6     2267
7     2244
8     2226
13    2224
14    2217
12    2214
11    2210
16    2166
10    2141
Name: total_secs, dtype: int64

In [69]:
user_summary = pd.merge(user_summary, total_active_days_per_month, on='msno', how='inner')

In [70]:
user_summary.rename(columns={'total_secs':'active_days_per_month'}, inplace = True)

In [71]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1599,160,395,0,0,395,26
1,99,17,30,1,0,30,31
2,298,1,60,1,0,30,28
3,149,6,30,1,0,30,21
4,149,17,30,1,0,30,29
...,...,...,...,...,...,...,...
97091,149,1,30,1,0,30,10
97092,99,28,30,1,0,30,1
97093,99,22,30,1,0,30,12
97094,298,1,60,1,0,30,20


### Creating Total_Usage_Time_Per_Month Column

In [72]:
total_usage_time_per_month = df2.groupby("msno")["total_secs"].sum()

In [73]:
total_usage_time_per_month

msno
0        117907.425
1        192527.892
2        115411.260
3        149896.558
4        116433.247
            ...    
97091     75487.962
97092       602.097
97093     51808.482
97094    162554.908
97095    344383.640
Name: total_secs, Length: 97096, dtype: float64

In [74]:
user_summary = pd.merge(user_summary, total_usage_time_per_month, on='msno', how='inner')

In [75]:
user_summary.rename(columns={'total_secs':'total_usage_time_per_month'}, inplace = True)

In [76]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1599,160,395,0,0,395,26,117907.425
1,99,17,30,1,0,30,31,192527.892
2,298,1,60,1,0,30,28,115411.260
3,149,6,30,1,0,30,21,149896.558
4,149,17,30,1,0,30,29,116433.247
...,...,...,...,...,...,...,...,...
97091,149,1,30,1,0,30,10,75487.962
97092,99,28,30,1,0,30,1,602.097
97093,99,22,30,1,0,30,12,51808.482
97094,298,1,60,1,0,30,20,162554.908


### Creating Total Unique Songs Played Per Month Column

In [77]:
total_unique_songs_played_per_month = df2.groupby("msno")["num_unq"].sum()

In [78]:
total_unique_songs_played_per_month

msno
0         530
1         885
2         468
3         828
4         230
         ... 
97091     356
97092       5
97093     123
97094     640
97095    1252
Name: num_unq, Length: 97096, dtype: int64

In [79]:
user_summary = pd.merge(user_summary, total_unique_songs_played_per_month, on='msno', how='inner')

In [80]:
user_summary.rename(columns={'num_unq':'total_unique_songs_played_per_month'}, inplace = True)

In [81]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1599,160,395,0,0,395,26,117907.425,530
1,99,17,30,1,0,30,31,192527.892,885
2,298,1,60,1,0,30,28,115411.260,468
3,149,6,30,1,0,30,21,149896.558,828
4,149,17,30,1,0,30,29,116433.247,230
...,...,...,...,...,...,...,...,...,...
97091,149,1,30,1,0,30,10,75487.962,356
97092,99,28,30,1,0,30,1,602.097,5
97093,99,22,30,1,0,30,12,51808.482,123
97094,298,1,60,1,0,30,20,162554.908,640


### Creating Columns related to length of songs played

In [82]:
df2['percent_25'] = df2['num_25']/(df2['num_25']+df2['num_50']+df2['num_75']+df2['num_985']+df2['num_100'])
df2['percent_50'] = df2['num_50']/(df2['num_25']+df2['num_50']+df2['num_75']+df2['num_985']+df2['num_100'])
df2['percent_100'] = (df2['num_985']+df2['num_100'])/(df2['num_25']+df2['num_50']+df2['num_75']+df2['num_985']+df2['num_100'])

In [83]:
df2.head()

Unnamed: 0,msno,is_churn,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs,percent_25,percent_50,percent_100
0,41633,0,20170319,0,0,0,0,105,32,26355.191,0.0,0.0,1.0
1,41633,0,20170314,22,22,5,5,9,55,5849.439,0.349206,0.349206,0.222222
2,41633,0,20170320,13,1,0,5,68,44,18241.627,0.149425,0.011494,0.83908
3,41633,0,20170326,17,1,0,3,16,23,4947.604,0.459459,0.027027,0.513514
4,41633,0,20170328,8,1,0,1,23,18,6405.278,0.242424,0.030303,0.727273


In [84]:
Percent_Songs_Played_25 = df2.groupby("msno")["percent_25"].mean()*100
Percent_Songs_Played_50 = df2.groupby("msno")["percent_50"].mean()*100
Percent_Songs_Played_100 = df2.groupby("msno")["percent_100"].mean()*100

In [85]:
user_summary = pd.merge(user_summary, Percent_Songs_Played_25, on='msno', how='inner')
user_summary = pd.merge(user_summary, Percent_Songs_Played_50, on='msno', how='inner')
user_summary = pd.merge(user_summary, Percent_Songs_Played_100, on='msno', how='inner')

In [86]:
user_summary.rename(columns={'percent_25':'Percent_Songs_Played_25', 'percent_50':'Percent_Songs_Played_50', 'percent_100':'Percent_Songs_Played_100'}, inplace = True)


In [87]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.482550,74.636630
1,99,17,30,1,0,30,31,192527.892,885,17.864410,8.453724,67.017440
2,298,1,60,1,0,30,28,115411.260,468,7.021077,3.338308,87.340876
3,149,6,30,1,0,30,21,149896.558,828,23.449066,12.609433,54.993834
4,149,17,30,1,0,30,29,116433.247,230,14.380372,3.943355,77.144280
...,...,...,...,...,...,...,...,...,...,...,...,...
97091,149,1,30,1,0,30,10,75487.962,356,28.317247,8.417237,60.541827
97092,99,28,30,1,0,30,1,602.097,5,20.000000,20.000000,20.000000
97093,99,22,30,1,0,30,12,51808.482,123,10.449385,4.037356,83.239187
97094,298,1,60,1,0,30,20,162554.908,640,6.277635,1.676004,91.130939


In [88]:
user_summary.shape

(97096, 12)

In [89]:
user_summary.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97096 entries, 0 to 97095
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Monetary                             97096 non-null  int64  
 1   Recency                              97096 non-null  int64  
 2   Frequency                            97096 non-null  int64  
 3   is_auto_renew                        97096 non-null  int64  
 4   is_discount                          97096 non-null  int64  
 5   BestPlan                             97096 non-null  int64  
 6   active_days_per_month                97096 non-null  int64  
 7   total_usage_time_per_month           97096 non-null  float64
 8   total_unique_songs_played_per_month  97096 non-null  int64  
 9   Percent_Songs_Played_25              97096 non-null  float64
 10  Percent_Songs_Played_50              97096 non-null  float64
 11  Percent_Songs_Played_100    

### Adding is_churn column 

In [90]:
user_summary['is_churn'] = df2['is_churn']

In [91]:
user_summary

Unnamed: 0_level_0,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.482550,74.636630,0
1,99,17,30,1,0,30,31,192527.892,885,17.864410,8.453724,67.017440,0
2,298,1,60,1,0,30,28,115411.260,468,7.021077,3.338308,87.340876,0
3,149,6,30,1,0,30,21,149896.558,828,23.449066,12.609433,54.993834,0
4,149,17,30,1,0,30,29,116433.247,230,14.380372,3.943355,77.144280,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97091,149,1,30,1,0,30,10,75487.962,356,28.317247,8.417237,60.541827,0
97092,99,28,30,1,0,30,1,602.097,5,20.000000,20.000000,20.000000,0
97093,99,22,30,1,0,30,12,51808.482,123,10.449385,4.037356,83.239187,0
97094,298,1,60,1,0,30,20,162554.908,640,6.277635,1.676004,91.130939,0


In [92]:
user_summary.reset_index(level=0, inplace=True)

In [93]:
user_summary

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.482550,74.636630,0
1,1,99,17,30,1,0,30,31,192527.892,885,17.864410,8.453724,67.017440,0
2,2,298,1,60,1,0,30,28,115411.260,468,7.021077,3.338308,87.340876,0
3,3,149,6,30,1,0,30,21,149896.558,828,23.449066,12.609433,54.993834,0
4,4,149,17,30,1,0,30,29,116433.247,230,14.380372,3.943355,77.144280,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97091,97091,149,1,30,1,0,30,10,75487.962,356,28.317247,8.417237,60.541827,0
97092,97092,99,28,30,1,0,30,1,602.097,5,20.000000,20.000000,20.000000,0
97093,97093,99,22,30,1,0,30,12,51808.482,123,10.449385,4.037356,83.239187,0
97094,97094,298,1,60,1,0,30,20,162554.908,640,6.277635,1.676004,91.130939,0


In [94]:
user_summary.shape

(97096, 14)

## Exporting the Cleaned Dataset as CSV for next steps 

In [96]:
# saving the dataframe to csv 
user_summary.to_csv('TransformedDataset/transformed_user_summary.csv',index=False)

In [98]:
# Read saved file and display  top 5 rows 
user_summary_file_saved = pd.read_csv('TransformedDataset/transformed_user_summary.csv')
user_summary_file_saved.head(n=5)

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0
1,1,99,17,30,1,0,30,31,192527.892,885,17.86441,8.453724,67.01744,0
2,2,298,1,60,1,0,30,28,115411.26,468,7.021077,3.338308,87.340876,0
3,3,149,6,30,1,0,30,21,149896.558,828,23.449066,12.609433,54.993834,0
4,4,149,17,30,1,0,30,29,116433.247,230,14.380372,3.943355,77.14428,0


In [99]:
user_summary_file_saved.shape

(97096, 14)

# FOR LTV 

In [107]:
user_summary_ltv = pd.merge(user_summary, df_6m, on='msno', how='inner')

NameError: name 'df_6m' is not defined

In [None]:
user_summary_ltv.head()

In [None]:
revenue_6m = df_6m.groupby("msno")["actual_amount_paid"].sum()

In [None]:
revenue_6m.shape

In [None]:
user_summary_ltv = pd.merge(user_summary, revenue_6m, on='msno', how='inner')

In [None]:
user_summary_ltv.shape