# Mock Data Creation

In [1]:
import pandas as pd
import mock_data_script as mds

In [2]:
def mock_data(inactive_tail_weeks=4):
    """Generate one mock weekly usage series using the ``mds`` helper module.

    One list of weeks and one IMEI value are drawn first; then, for every
    week, a credit load, hours-on-net, mobile-data usage, promo count and
    active-time value are drawn.  Finally the last ``inactive_tail_weeks``
    weeks are overwritten with zeros so the series always ends in a run of
    inactivity.

    Flag polarity: ``active`` holds ``1`` for a week whose metrics sum to
    zero (no activity) and ``0`` for a week with any activity.

    Parameters
    ----------
    inactive_tail_weeks : int, default 4
        How many trailing weeks to force to zero activity.  If the series is
        shorter than this, every week is zeroed.  ``0`` leaves the generated
        values untouched.

    Returns
    -------
    tuple
        ``(weeks, IMEIs, credits_loaded, num_hrs, mdu, promos, times, active)``
        where every element after ``weeks`` is a list with ``len(weeks)``
        entries.

    Raises
    ------
    ValueError
        If ``inactive_tail_weeks`` is negative.
    """
    if inactive_tail_weeks < 0:
        raise ValueError("inactive_tail_weeks must be >= 0")

    weeks = mds.generate_weeks()
    ids = mds.generate_IMEI()
    n_weeks = len(weeks)

    # The same IMEI value is repeated on every weekly row.
    IMEIs = [ids] * n_weeks
    credits_loaded = []
    num_hrs = []
    mdu = []
    promos = []
    times = []
    active = []

    for _ in range(n_weeks):
        # Keep this call order: if the generators share random state,
        # reordering them would change the produced series.
        credit = mds.generate_credit_load()
        hrs = mds.generate_num_of_hrs()
        md_usage = mds.generate_md_usage()
        promo = mds.generate_promos()
        active_time = mds.generate_active_time()

        credits_loaded.append(credit)
        num_hrs.append(hrs)
        mdu.append(md_usage)
        promos.append(promo)
        times.append(active_time)

        # 1 marks a week with no activity at all, 0 a week with activity.
        activity = credit + hrs + md_usage + promo + active_time
        active.append(1 if activity == 0 else 0)

    # Force the trailing weeks to zero activity and flag them.
    for i in range(max(0, n_weeks - inactive_tail_weeks), n_weeks):
        credits_loaded[i] = 0
        num_hrs[i] = 0
        mdu[i] = 0
        promos[i] = 0
        times[i] = 0
        active[i] = 1

    return weeks, IMEIs, credits_loaded, num_hrs, mdu, promos, times, active

In [3]:
# Collect ten mock frames, one per mock_data() call.
d = []

for _ in range(10):
    (weeks, IMEIs, credits_loaded, num_hrs,
     mdu, promos, times, active) = mock_data()

    # Pair each output column name with its generated list, in column order.
    d.append(
        pd.DataFrame(
            dict(
                zip(
                    (
                        'weeks',
                        'IMEI',
                        'credits_loaded_per_week',
                        'num_hrs_spend_on_net',
                        'mobile_data_usage(gb)',
                        'promos_used',
                        'active_times',
                        'activity_status',
                    ),
                    (
                        weeks,
                        IMEIs,
                        credits_loaded,
                        num_hrs,
                        mdu,
                        promos,
                        times,
                        active,
                    ),
                )
            )
        )
    )

In [4]:
# Preview the first generated mock frame.
d[0]

Unnamed: 0,weeks,IMEI,credits_loaded_per_week,num_hrs_spend_on_net,mobile_data_usage(gb),promos_used,active_times,activity_status
0,2019-02-27,156044289179301,0,3.0,6.01,0,1,0
1,2019-03-06,156044289179301,300,2.4,8.78,0,1,0
2,2019-03-13,156044289179301,0,21.8,5.1,0,2,0
3,2019-03-20,156044289179301,90,6.0,6.2,0,0,0
4,2019-03-27,156044289179301,500,12.2,6.29,3,2,0
5,2019-04-03,156044289179301,0,18.4,5.89,0,0,0
6,2019-04-10,156044289179301,0,19.2,6.34,0,1,0
7,2019-04-17,156044289179301,0,21.2,6.67,4,1,0
8,2019-04-24,156044289179301,70,13.7,7.13,4,1,0
9,2019-05-01,156044289179301,0,9.9,8.12,4,0,0


In [5]:
# Row count of the first mock frame (one row per generated week).
len(d[0])

41

In [6]:
# Number of mock frames held in d.
len(d)

10

In [7]:
# Write every mock frame into one workbook, one sheet per frame (d0, d1, ...),
# without the DataFrame index column.
with pd.ExcelWriter('data/mock/sample_mock_rc_data.xlsx') as writer:
    for i, frame in enumerate(d):
        frame.to_excel(writer, sheet_name='d{}'.format(i), index=False)
#_.to_csv('data/mock/sample_mock_rc_data.csv','possible_rc')

# Analyzing Data for Rotational Churners

In [8]:
import numpy as np
import pandas as pd

# Reload the mock workbook: one DataFrame per sheet, in sheet order.
# The context manager closes the workbook handle once parsing is done, and
# iterating over sheet_names avoids assuming a fixed number of sheets.
with pd.ExcelFile('data/mock/sample_mock_rc_data.xlsx') as dfs:
    sheets = dfs.sheet_names
    d = [dfs.parse(sheet_name=sheet) for sheet in sheets]

In [9]:
def Euclidean_Dist(df1, df2, cols=None):
    """Return the row-wise Euclidean distance between two frames.

    Rows are paired by position (the underlying arrays are subtracted), not
    by index label, so both frames are expected to have the same number of
    rows.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both must contain every column in ``cols``.
    cols : sequence of column labels, optional
        Columns to include in the distance.  When omitted, every column of
        ``df1`` from the third one onward is used.

    Returns
    -------
    numpy.ndarray
        One distance per row.
    """
    if cols is None:
        # Resolve the default at call time from the frame actually passed in,
        # instead of freezing it at definition time from a global variable.
        cols = df1.columns[2:]
    difference = df1[cols].values - df2[cols].values
    return np.linalg.norm(difference, axis=1)

In [10]:
# Inspect the first frame as reloaded from the workbook.
d[0]

Unnamed: 0,weeks,IMEI,credits_loaded_per_week,num_hrs_spend_on_net,mobile_data_usage(gb),promos_used,active_times,activity_status
0,2019-02-27,156044289179301,0,3.0,6.01,0,1,0
1,2019-03-06,156044289179301,300,2.4,8.78,0,1,0
2,2019-03-13,156044289179301,0,21.8,5.1,0,2,0
3,2019-03-20,156044289179301,90,6.0,6.2,0,0,0
4,2019-03-27,156044289179301,500,12.2,6.29,3,2,0
5,2019-04-03,156044289179301,0,18.4,5.89,0,0,0
6,2019-04-10,156044289179301,0,19.2,6.34,0,1,0
7,2019-04-17,156044289179301,0,21.2,6.67,4,1,0
8,2019-04-24,156044289179301,70,13.7,7.13,4,1,0
9,2019-05-01,156044289179301,0,9.9,8.12,4,0,0


In [11]:
# Drop the last four rows of frame 0, then trim frame 4 to the same number of
# rows so the two can be compared position by position.
df1 = d[0].iloc[:-4]
df2 = d[4].head(len(df1))
print(len(df1), ' ', len(df2))

37   37


In [12]:
# Group the credits column by IMEI. No aggregation is applied, so displaying
# x only shows the SeriesGroupBy object itself.
x = df1.groupby('IMEI')['credits_loaded_per_week']
x

<pandas.core.groupby.generic.SeriesGroupBy object at 0x000001A7FB958B80>

In [13]:
# Per-row distance between df1 and df2 over Euclidean_Dist's default columns.
distances = Euclidean_Dist(df1, df2)
distances

array([   2.59584283,  700.04752989,   17.65903735,  710.00808087,
        300.01070464, 1000.00817302,  100.03780086,  300.166128  ,
         70.30931731,    7.81368031,   10.0648696 ,    5.6800088 ,
          9.86026876,   90.02148021,  200.10298374,  800.05053972,
         81.28937261,  200.09266253,    4.38698074,  100.49566956,
         20.50819592,   31.82238363,  800.05470694,   31.81057214,
        800.07399664,  270.03258859, 1000.02149102,    5.21080608,
          6.11657584,  300.06475651,  500.04546043,   71.96832637,
        910.02089872,  300.0969385 ,  100.53783566,   50.31254714,
         18.42504003])

In [14]:
# Re-display the per-row distances held in `distances`.
distances

array([   2.59584283,  700.04752989,   17.65903735,  710.00808087,
        300.01070464, 1000.00817302,  100.03780086,  300.166128  ,
         70.30931731,    7.81368031,   10.0648696 ,    5.6800088 ,
          9.86026876,   90.02148021,  200.10298374,  800.05053972,
         81.28937261,  200.09266253,    4.38698074,  100.49566956,
         20.50819592,   31.82238363,  800.05470694,   31.81057214,
        800.07399664,  270.03258859, 1000.02149102,    5.21080608,
          6.11657584,  300.06475651,  500.04546043,   71.96832637,
        910.02089872,  300.0969385 ,  100.53783566,   50.31254714,
         18.42504003])

In [15]:
# Mean of the per-row distances.
distances.mean()

268.31957436507156

In [16]:
# Share of cells (all columns, compared by position) that are exactly equal in
# df1 and df2, printed as a whole-number percentage.
print('%.f%%' % (100 * (df1.values == df2.values).sum() / df1.size))

23%
