## Importing the Transformed Data 

In [1]:
# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division

In [2]:
data = pd.read_csv('TransformedDataset/transformed_user_summary.csv')

In [3]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0
1,1,99,17,30,1,0,30,31,192527.892,885,17.86441,8.453724,67.01744,0
2,2,298,1,60,1,0,30,28,115411.26,468,7.021077,3.338308,87.340876,0
3,3,149,6,30,1,0,30,21,149896.558,828,23.449066,12.609433,54.993834,0
4,4,149,17,30,1,0,30,29,116433.247,230,14.380372,3.943355,77.14428,0


In [4]:
data.isna().sum()

msno                                   0
Monetary                               0
Recency                                0
Frequency                              0
is_auto_renew                          0
is_discount                            0
BestPlan                               0
active_days_per_month                  0
total_usage_time_per_month             0
total_unique_songs_played_per_month    0
Percent_Songs_Played_25                0
Percent_Songs_Played_50                0
Percent_Songs_Played_100               0
is_churn                               0
dtype: int64

## Creating Cluster Columns for RFM Values 

### Recency Clustering 

In [5]:
#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

In [6]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['Recency']])
data['RecencyCluster'] = kmeans.predict(data[['Recency']])

data = order_cluster('RecencyCluster', 'Recency',data,False)



In [7]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1
2,27,894,186,195,0,0,195,23,230427.576,1043,6.345546,0.880708,91.709406,0,1
3,119,1788,169,410,0,0,410,29,143942.283,258,14.305981,6.913267,71.491911,1,1
4,124,536,178,180,0,0,180,12,51199.588,238,24.029165,5.266275,58.750749,0,1


In [8]:
data['RecencyCluster'].value_counts()

3    86161
2     5092
1     3066
0     2777
Name: RecencyCluster, dtype: int64

In [9]:
data.groupby('RecencyCluster')['Recency'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
RecencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2777.0,361.447245,50.866337,287.0,324.0,358.0,392.0,811.0
1,3066.0,210.830724,38.050345,159.0,177.0,202.0,243.0,286.0
2,5092.0,105.996072,28.175496,61.0,81.0,104.0,130.0,158.0
3,86161.0,15.55011,11.025457,1.0,6.0,15.0,24.0,60.0


### Subscription Frequency Clustering

In [10]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['Frequency']])
data['FrequencyCluster'] = kmeans.predict(data[['Frequency']])

data = order_cluster('FrequencyCluster', 'Frequency',data,True)



In [11]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1,2
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1,2
2,119,1788,169,410,0,0,410,29,143942.283,258,14.305981,6.913267,71.491911,1,1,2
3,145,1788,229,410,0,0,410,31,191626.664,643,15.398857,3.980268,76.323401,0,1,2
4,160,1788,208,410,0,0,410,1,790.83,5,50.0,0.0,50.0,0,1,2


In [12]:
data['FrequencyCluster'].value_counts()

0    82950
2     8556
1     5119
3      471
Name: FrequencyCluster, dtype: int64

In [13]:
data.groupby('FrequencyCluster')['Frequency'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
FrequencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,82950.0,34.787233,13.292939,0.0,30.0,30.0,30.0,107.0
1,5119.0,186.223481,23.96876,120.0,180.0,195.0,195.0,292.0
2,8556.0,406.243922,19.963473,300.0,410.0,410.0,410.0,540.0
3,471.0,712.350318,151.696275,560.0,600.0,630.0,820.0,1680.0


### Monetary Clustering

In [14]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['Monetary']])
data['MonetaryCluster'] = kmeans.predict(data[['Monetary']])

data = order_cluster('MonetaryCluster', 'Monetary',data,True)



In [15]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster,MonetaryCluster
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1,2,2
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1,2,2
2,119,1788,169,410,0,0,410,29,143942.283,258,14.305981,6.913267,71.491911,1,1,2,2
3,145,1788,229,410,0,0,410,31,191626.664,643,15.398857,3.980268,76.323401,0,1,2,2
4,160,1788,208,410,0,0,410,1,790.83,5,50.0,0.0,50.0,0,1,2,2


In [16]:
data['MonetaryCluster'].value_counts()

0    81734
2     8523
1     6601
3      238
Name: MonetaryCluster, dtype: int64

In [17]:
data.groupby('MonetaryCluster')['Monetary'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
MonetaryCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,81734.0,146.865931,62.089107,0.0,99.0,149.0,149.0,427.0
1,6601.0,715.02439,216.084533,446.0,536.0,699.0,894.0,1211.0
2,8523.0,1765.667371,155.873563,1251.0,1788.0,1788.0,1788.0,2651.0
3,238.0,3541.789916,812.036201,2671.0,2991.5,3387.0,3576.0,8850.0


### Songs Played Frequency Clustering

In [18]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4)
kmeans.fit(data[['total_usage_time_per_month']])
data['SongsPlayedFrequencyCluster'] = kmeans.predict(data[['total_usage_time_per_month']])

data = order_cluster('SongsPlayedFrequencyCluster', 'total_usage_time_per_month',data,True)



In [19]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster,MonetaryCluster,SongsPlayedFrequencyCluster
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1,2,2,0
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1,2,2,0
2,160,1788,208,410,0,0,410,1,790.83,5,50.0,0.0,50.0,0,1,2,2,0
3,187,1788,219,410,0,0,410,12,48210.151,364,38.219879,32.594632,23.739586,1,1,2,2,0
4,300,1788,166,410,0,0,410,28,88209.978,461,17.378106,2.824991,77.837649,0,1,2,2,0


In [20]:
data['SongsPlayedFrequencyCluster'].value_counts()

0    62463
1    26163
2     6880
3     1590
Name: SongsPlayedFrequencyCluster, dtype: int64

In [21]:
data.groupby('SongsPlayedFrequencyCluster')['total_usage_time_per_month'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
SongsPlayedFrequencyCluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,62463.0,41798.44,35301.474116,0.131,9212.0505,34010.877,69724.53,118782.059
1,26163.0,196176.6,58379.002143,118790.527,146778.989,182905.1,237188.0,337135.926
2,6880.0,478917.6,113476.531659,337165.287,382567.65325,449000.398,554587.8,768355.858
3,1590.0,1058967.0,305011.488944,768545.191,852823.62125,962767.2725,1161660.0,3028970.526


## Calculating the Overall Score and Creating Segment Column

### Overall Score 

In [22]:
data['OverallScore'] = data['FrequencyCluster']+data['SongsPlayedFrequencyCluster']+data['RecencyCluster']+data['MonetaryCluster']

In [23]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster,MonetaryCluster,SongsPlayedFrequencyCluster,OverallScore
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1,2,2,0,5
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1,2,2,0,5
2,160,1788,208,410,0,0,410,1,790.83,5,50.0,0.0,50.0,0,1,2,2,0,5
3,187,1788,219,410,0,0,410,12,48210.151,364,38.219879,32.594632,23.739586,1,1,2,2,0,5
4,300,1788,166,410,0,0,410,28,88209.978,461,17.378106,2.824991,77.837649,0,1,2,2,0,5


In [24]:
data['OverallScore'].value_counts()

3     53135
4     26056
5     10148
6      4436
7      1960
8       729
9       286
2       195
10       68
0        37
1        26
11       16
12        4
Name: OverallScore, dtype: int64

### Segmentation 

In [25]:
data.groupby('OverallScore')['Recency','Frequency','Monetary','total_usage_time_per_month'].mean()

  data.groupby('OverallScore')['Recency','Frequency','Monetary','total_usage_time_per_month'].mean()


Unnamed: 0_level_0,Recency,Frequency,Monetary,total_usage_time_per_month
OverallScore,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,436.081081,43.378378,206.459459,38113.01
1,353.192308,51.0,202.653846,122856.3
2,161.615385,98.876923,396.34359,46822.04
3,16.908723,35.517117,153.933358,42068.17
4,43.964231,69.999616,299.221715,172935.2
5,76.886973,144.109283,611.009361,318839.3
6,96.522317,253.864968,1071.383003,436063.3
7,83.029082,391.40051,1658.008163,217347.7
8,60.012346,457.847737,1881.611797,303701.6
9,40.618881,593.045455,2534.013986,354710.3


In [26]:
data['Segment'] = 'Low-Value'
data.loc[data['OverallScore']>3,'Segment'] = 'Mid-Value' 
data.loc[data['OverallScore']>5,'Segment'] = 'High-Value' 

In [27]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,Percent_Songs_Played_25,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster,MonetaryCluster,SongsPlayedFrequencyCluster,OverallScore,Segment
0,0,1599,160,395,0,0,395,26,117907.425,530,20.177022,3.48255,74.63663,0,1,2,2,0,5,Mid-Value
1,7,1788,271,410,0,0,410,25,90177.554,582,28.582253,4.37489,64.295471,0,1,2,2,0,5,Mid-Value
2,160,1788,208,410,0,0,410,1,790.83,5,50.0,0.0,50.0,0,1,2,2,0,5,Mid-Value
3,187,1788,219,410,0,0,410,12,48210.151,364,38.219879,32.594632,23.739586,1,1,2,2,0,5,Mid-Value
4,300,1788,166,410,0,0,410,28,88209.978,461,17.378106,2.824991,77.837649,0,1,2,2,0,5,Mid-Value


In [28]:
data['BestPlan'].value_counts()

30     82032
410     6457
195     2181
180     1782
90      1306
395      812
7        360
100      344
415      282
365      272
200      238
360      209
120      186
240      139
60       136
450      135
400      126
270       60
1         25
10         6
14         4
80         2
45         1
0          1
Name: BestPlan, dtype: int64

In [29]:
data['Plan'] = 'Yearly Plan'
data.loc[(data['BestPlan']<7) & (data['BestPlan']>=0),'Plan'] = 'Daily'
data.loc[(data['BestPlan']<30) & (data['BestPlan']>=7),'Plan'] = 'Weekly'
data.loc[(data['BestPlan']<90) & (data['BestPlan']>=30), 'Plan'] = 'Monthly'
data.loc[(data['BestPlan']<180) & (data['BestPlan']>=90), 'Plan'] = 'Three Months'
data.loc[(data['BestPlan']<300) & (data['BestPlan']>=180), 'Plan'] = 'Six Months'
data.loc[data['BestPlan']>=300,'Plan'] = 'Yearly'

In [30]:
data['Plan'].value_counts()

Monthly         82171
Yearly           8293
Six Months       4400
Three Months     1836
Weekly            370
Daily              26
Name: Plan, dtype: int64

In [31]:
data['Segment'].value_counts()

Low-Value     53393
Mid-Value     36204
High-Value     7499
Name: Segment, dtype: int64

In [32]:
data.head()

Unnamed: 0,msno,Monetary,Recency,Frequency,is_auto_renew,is_discount,BestPlan,active_days_per_month,total_usage_time_per_month,total_unique_songs_played_per_month,...,Percent_Songs_Played_50,Percent_Songs_Played_100,is_churn,RecencyCluster,FrequencyCluster,MonetaryCluster,SongsPlayedFrequencyCluster,OverallScore,Segment,Plan
0,0,1599,160,395,0,0,395,26,117907.425,530,...,3.48255,74.63663,0,1,2,2,0,5,Mid-Value,Yearly
1,7,1788,271,410,0,0,410,25,90177.554,582,...,4.37489,64.295471,0,1,2,2,0,5,Mid-Value,Yearly
2,160,1788,208,410,0,0,410,1,790.83,5,...,0.0,50.0,0,1,2,2,0,5,Mid-Value,Yearly
3,187,1788,219,410,0,0,410,12,48210.151,364,...,32.594632,23.739586,1,1,2,2,0,5,Mid-Value,Yearly
4,300,1788,166,410,0,0,410,28,88209.978,461,...,2.824991,77.837649,0,1,2,2,0,5,Mid-Value,Yearly


In [33]:
data.to_csv('TransformedDataset/transformed_user_summary_segemented.csv',index=False)