# Create Churn Dataset
This notebook generates a dataset consisting of `num_persons = 100000` customers and exports it to a Python pickle file. In `.assets/data/churn` there is a precomputed, compressed pickle file which can be read in directly with `pd.read_pickle('../.assets/data/churn/churn_persona.pkl.zip')`. (Note: the configuration cell further below sets `num_persons = 1000`; raise it to generate a full-size dataset.)

All features are generated by a simulation from random distributions with some underlying assumption of how people (could) behave.
In the end, all customers have two possible states: either they churn or they do not churn. Under the hood we implemented three types of customers. These types determine how customers behave if they churn and how the features for our simulation are set. For example, an angry customer has a higher churn probability than a standard customer. But still, these standard customers will have churn rates greater than zero. In addition, we have "sleepy" customers, which behave like standard customers but have a higher churn rate if woken up (by a call or an e-mail). Sleepy customers should not be woken up in most churn scenarios. They make it more difficult to build an efficient model to detect churn.

#### Some Words About Toy Data
High-quality datasets are hard to find in reality. As a matter of fact, in many cases the preparations for high-quality data taking take a lot more time than large parts of the actual data analysis. However, starting early with analysis projects ensures that you know at least some of the traps before you start data taking.

Thus, building simplified models to generate datasets from first principles is a common way to get around this. With such models you can learn to set up the machinery and start data taking at the same time. Our dataset is such a toy set, so in several aspects it might not reflect reality 100%. But it still holds some key features of real data.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import scipy
import scipy.stats as stats

import time
from tqdm import tqdm_notebook as tqdm


In [None]:
class ChurnParams():
    """Churn-type configuration shared by the simulation.

    Churn types are numbered from 1:
        1. angry
        2. sleep
        3. dynamic/standard
    """

    def __init__(self):
        # Base churn probability per type; position k holds the rate of type k+1.
        # NOTE(review): in this order the "angry" type (0.03) has a lower base
        # rate than the standard type (0.14), which contradicts the notebook
        # introduction - confirm the list order matches the type numbering.
        self.rates = [0.03,0.01,0.14]
        # Cumulative thresholds for a uniform draw in [0, 1): type 1 is chosen
        # for draws up to 0.5, type 2 up to 0.6, type 3 otherwise.
        self.type_probs = [0.5,0.6,1]

    def get_rate(self, i):
        """Return the churn rate stored for the 1-based churn type ``i``."""
        position = i - 1
        return self.rates[position]

    def get_type(self):
        """Draw a 1-based churn type from the cumulative ``type_probs``.

        One uniform number is drawn from ``np.random``; the first threshold
        that is greater than or equal to it determines the type. Returns
        ``None`` if the draw exceeds every threshold.
        """
        draw = np.random.rand()
        matches = (
            type_id
            for type_id, threshold in enumerate(self.type_probs, start=1)
            if draw <= threshold
        )
        return next(matches, None)
            

In [None]:
class Person():
    """A single simulated customer.

    Constructing an instance draws every feature from the global NumPy random
    state (``np.random``): first the age-driven profile, then the churn type
    and churn flag, then the features that depend on the churn outcome.
    ``get_data`` returns the exported features as a flat dict.
    """

    def __init__(self):
        self.params = ChurnParams()

        # Profile features. Age comes first because bank, mail, size and year
        # are all drawn conditional on it; amount depends on size.
        self.age = self.create_age()
        self.bank = self.create_bank()
        self.mail = self.create_mail()
        self.size = self.create_size()
        self.year = self.create_year()
        self.amount = self.create_amount()

        self.type = self.create_churn_type()

        # Develop the person: decide on churn, then derive the features that
        # depend on the churn outcome.
        self.churn = self.create_churn()

        self.contacts = self.create_contacts()
        self.delta_amount = self.create_delta_amount()
        self.pay = self.create_pay()
        self.delta_pay = self.create_delta_pay()

        # Finally shift the profile of churners (no-op for everyone else).
        self.change_attributes()

    @staticmethod
    def _draw_by_age(age, options, components):
        """Pick one entry of ``options`` with age-dependent odds.

        ``components`` holds one ``(loc, scale, weight)`` tuple per option.
        Each option is scored with a fresh uniform draw multiplied by
        ``weight * normal_pdf(age; loc, scale)``; the option with the highest
        score wins (the first one in case of a tie).
        """
        scores = []
        for loc, scale, weight in components:
            density = stats.norm.pdf(age, loc=loc, scale=scale) * weight
            scores.append(np.random.rand() * density)
        return options[scores.index(max(scores))]

    def create_age(self):
        """Draw an age from N(50, 15), redrawing until it lies in [16, 110]."""
        while True:
            candidate = np.random.normal(loc=50, scale=15)
            if not (candidate < 16 or candidate > 110):
                return candidate

    def create_bank(self):
        """Draw the bank name; each bank's odds follow a normal curve in age."""
        banks = ['Interbank', 'Solidbank', 'Stadtbank', 'Volkskasse', 'Sparbank']
        curves = [
            (25, 10, 1),
            (35, 20, 1),
            (50, 25, 1),
            (65, 40, 1),
            (40, 50, 1),
        ]
        return self._draw_by_age(self.age, banks, curves)

    def create_mail(self):
        """Draw the mail provider (or the string 'None'), conditional on age."""
        mails = ['nice.mail', 'e-mail.com', 'mail.de', 'brief.de', 'None']
        curves = [
            (20, 5, 1),
            (25, 10, 1),
            (40, 25, 1),
            (65, 20, 1),
            (80, 15, 1),
        ]
        return self._draw_by_age(self.age, mails, curves)

    def create_size(self):
        """Draw the size feature (1-5); the curves carry an extra weight each."""
        sizes = [1, 2, 3, 4, 5]
        curves = [
            (40, 150, 5),
            (50, 100, 3),
            (40, 15, 0.4),
            (45, 20, 0.3),
            (45, 20, 0.2),
        ]
        return self._draw_by_age(self.age, sizes, curves)

    def create_year(self):
        """Return 2018 minus a uniform whole number of years below ``age - 15``.

        NOTE(review): presumably the year the customer relationship started -
        confirm.
        """
        elapsed = int(np.random.rand() * (self.age - 15))
        return 2018 - elapsed

    def create_amount(self):
        """Return 700 plus ``size`` times a N(300, 300) draw shifted by 500."""
        per_unit = np.random.normal(loc=300, scale=300) + 500
        return 700 + per_unit * self.size

    def get_data(self):
        """Return the exported features of this person as a dict."""
        return {
            'age'    : self.age,
            'size'   : self.size,
            'bank'   : self.bank,
            'mail'  : self.mail,
            'year'  : self.year,
            'amount' : self.amount,
            'd_amount' : self.delta_amount,
            'contacts' : self.contacts,
            'pay' : self.pay,
            'd_pay' : self.delta_pay,
            'churn' : self.churn,
        }

    def create_churn_type(self):
        """Draw the churn type from the shared parameters."""
        return self.params.get_type()

    def create_churn(self):
        """Decide whether the person churns, using the rate of their type."""
        if not self.type > 0:
            return False
        return np.random.rand() < self.params.get_rate(self.type)

    def create_contacts(self):
        """Return the contact count (an integer of at least 1).

        Churners of type 1 and 2 receive an additional normal offset before
        the common N(0, 3) noise is added.
        """
        total = 1
        if self.churn and self.type == 1:
            total += np.random.normal(loc=3, scale=2)
        elif self.churn and self.type == 2:
            total += np.random.normal(loc=0, scale=1)
        total += np.random.normal(loc=0, scale=3)

        return int(abs(total)) + 1

    def create_delta_amount(self):
        """Return the change of ``amount`` as ``amount`` times a random factor.

        Every person gets a N(0.02, 0.03) factor; churners add a type-specific
        normal offset, and for type 1 the absolute value of the sum is used.
        """
        churn_offsets = {
            1: (0.05, 0.03),
            2: (0.02, 0.03),
            3: (0.03, 0.02),
        }

        factor = np.random.normal(loc=0.02, scale=0.03)
        if self.churn and self.type in churn_offsets:
            loc, spread = churn_offsets[self.type]
            factor += np.random.normal(loc=loc, scale=spread)
            if self.type == 1:
                factor = np.abs(factor)

        return self.amount * factor

    def create_pay(self):
        """Return ``amount`` times a share of 0.22, 0.25 or 0.28 (uniform)."""
        steps = np.floor(np.random.rand() * 3)
        share = 0.22 + steps * 0.03
        return self.amount * share

    def create_delta_pay(self):
        """Scale ``delta_amount`` by the pay-to-amount ratio."""
        ratio = self.pay / self.amount
        return ratio * self.delta_amount

    def change_attributes(self):
        """Shift bank, mail, age and year of churners; others stay untouched."""
        if not self.churn:
            return

        # Per-type switching probabilities and mean age shift.
        if self.type == 1:
            bank_p, mail_p, age_shift = 0, 0, 3
        elif self.type == 2:
            bank_p, mail_p, age_shift = 0.2, 0.3, 5
        elif self.type == 3:
            bank_p, mail_p, age_shift = 0.2, 0.3, -7

        # Bank: the uniform number is always drawn, even when mail is 'None'.
        bank_roll = np.random.rand()
        if bank_roll < bank_p and self.mail != 'None':
            self.bank = 'Interbank' if np.random.rand() < 0.5 else 'Solidbank'
        elif self.mail == 'None':
            # Persons without mail may get a redrawn mail and, independently,
            # one of the two banks.
            if np.random.rand() < 0.6:
                self.mail = self.create_mail()
            if np.random.rand() < 0.7:
                self.bank = 'Interbank' if np.random.rand() < 0.5 else 'Solidbank'

        # Mail: redraw for persons (still) without mail, otherwise switch to
        # one of the first two providers with half the probability.
        mail_roll = np.random.rand()
        if mail_roll < mail_p and self.mail == 'None':
            self.mail = self.create_mail()
        elif np.random.rand() < mail_p / 2:
            self.mail = 'nice.mail' if np.random.rand() < 0.5 else 'e-mail.com'

        # Age: accept the shifted age only if it stays above 16; the year is
        # then redrawn from the new age.
        shifted_age = self.age + np.random.normal(loc = age_shift, scale = 10)
        if shifted_age > 16:
            self.age = shifted_age
            self.year = self.create_year()

## Create dataset

In [None]:
# set dataset size (number of Person objects simulated by the next cell)
# NOTE(review): the notebook introduction mentions 100000 customers, whereas
# this cell sets 1000 - confirm which size is intended before exporting.
num_persons = 1000

In [None]:
# Simulate `num_persons` customers and time the run. Every Person draws from
# the global np.random state and no seed is set in this notebook, so each
# execution yields a different dataset.
t_start = time.time()
# One feature dict per simulated person; tqdm only renders the progress bar.
ps = [Person().get_data() for i in tqdm(range(num_persons))]
duration_creation = time.time() - t_start
print(f'Creation took {duration_creation:.2f} s ({num_persons/duration_creation:.2f} it/s)')
# Raw dataset: one row per person, one column per key of Person.get_data().
df = pd.DataFrame(ps)

## Add Features

Add deduced quantities of one column to the dataset
- ratio r: Ratio between customers who churn and customers who do not churn
- error s: Describes the corresponding uncertainty of this ratio r (highly dependent on sample size)
- index n: ID of each unique entry in this column

In [None]:
def add_ratios(df, column):
    """Join per-category churn statistics of ``column`` onto ``df``.

    For every distinct value of ``column`` three quantities are computed and
    added as new columns (all float64):

    - ``<column>_r``: number of churners divided by number of non-churners
    - ``<column>_s``: ``sqrt(r * (1 - r) / n)`` with ``n`` the group size
    - ``<column>_n``: 1-based running index of the category (sorted order)

    Categories without any churner get ``r = 0`` and ``s = 0``; previously
    they ended up as NaN because they were missing from the churner counts.
    A category without any non-churner yields ``r = inf`` (and ``s = NaN``).

    NOTE(review): the error term applies the binomial formula to the
    churn/non-churn ratio rather than to the churn fraction, so it turns NaN
    whenever ``r > 1`` - confirm this is the intended uncertainty estimate.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a boolean ``'churn'`` column and ``column``.
    column : str
        Name of the categorical column to summarise.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` joined with the three statistic columns. ``df``
        itself is not modified. The per-category table is also printed.
    """
    # All categories present in the data, in sorted order.
    groups = df.groupby(column)[column].count().index

    # Churner / non-churner counts per category. Reindexing over all groups
    # makes categories that are absent from one subset count as 0, not NaN.
    n1 = (df[df['churn'] == True].groupby(column)[column].count()
          .reindex(groups, fill_value=0))
    n2 = (df[df['churn'] == False].groupby(column)[column].count()
          .reindex(groups, fill_value=0))

    # ratio and error
    r = n1 / n2
    n = n1 + n2
    s = np.sqrt(r * (1 - r) / n)

    dtest = pd.DataFrame(
        {
            column + '_r': r,
            column + '_s': s,
            # kept as float to match the dtype of the other two columns
            column + '_n': np.arange(1, len(groups) + 1, dtype=float),
        },
        index=groups,
    )

    print(dtest)
    return df.join(dtest, on=column)

In [None]:
## add ratio features to dataset
# Each call returns a new frame with three extra columns for the given
# column, so the calls are chained through `dn`; `df` stays the raw dataset.
dn = df
for column in ['bank', 'mail', 'contacts']:
    dn = add_ratios(dn, column)

## Save dataset to pickle file

In [None]:
# Write the feature-enriched frame to the current working directory as an
# uncompressed pickle (the precomputed file mentioned in the introduction is
# a compressed `.pkl.zip`).
dn.to_pickle('churn_persona.pkl')

In [None]:
# Read the file written in the previous cell back in and rebind `dn` to it
# (presumably a round-trip check - confirm).
# Only unpickle files you created yourself: loading a pickle can run code.
dn = pd.read_pickle('churn_persona.pkl')

## Plot some statistics of generated data

### How many customers churn?

In [None]:
# Number of rows whose 'churn' flag is True (boolean mask on the raw frame).
len(df[df['churn']])

### All features

In [None]:
# Columns of the raw frame `df`; the ratio features added by add_ratios live
# only in `dn`.
df.columns

In [None]:
plt.figure(figsize=(15, 15))

# One panel per feature on a 3x2 grid: histograms (21 bins) for the continuous
# features, bar charts of the per-value counts for the discrete ones.
panels = [
    ('age', 'hist'),
    ('bank', 'bar'),
    ('mail', 'bar'),
    ('size', 'bar'),
    ('year', 'bar'),
    ('amount', 'hist'),
]

for position, (feature, kind) in enumerate(panels, start=1):
    ax = plt.subplot(3, 2, position)
    if kind == 'hist':
        df[feature].plot(kind='hist', ax=ax, bins=21)
        ax.set_xlabel(feature)
    else:
        df.groupby(feature)[feature].count().plot(kind='bar', ax=ax)

plt.tight_layout();

In [None]:
plt.figure(figsize=(15, 5))
feature = 'age'
# Shared bin edges (21 edges) so the two normalised histograms are comparable.
bins = np.linspace(df[feature].min(), df[feature].max(), 21)
# Non-churners are drawn first, churners on top; both semi-transparent.
for churned in (False, True):
    df[df['churn'] == churned][feature].plot(kind='hist', density=True, alpha=0.5, bins=bins)

In [None]:
plt.figure(figsize=(15, 5))
# Churner count divided by non-churner count for every mail provider.
mail_churned = df[df['churn'] == True].groupby('mail')['mail'].count()
mail_stayed = df[df['churn'] == False].groupby('mail')['mail'].count()
(mail_churned / mail_stayed).plot(kind='bar', alpha=0.5)
plt.tight_layout();

In [None]:
plt.figure(figsize=(15, 5))
# Churner count divided by non-churner count for every bank.
bank_churned = df[df['churn'] == True].groupby('bank')['bank'].count()
bank_stayed = df[df['churn'] == False].groupby('bank')['bank'].count()
(bank_churned / bank_stayed).plot(kind='bar', alpha=0.5)
plt.tight_layout();

In [None]:
plt.figure(figsize=(15, 5))
# Churner count divided by non-churner count for every number of contacts.
# Contact counts that occur in only one of the two groups produce NaN here.
contacts_churned = df[df['churn'] == True].groupby('contacts')['contacts'].count()
contacts_stayed = df[df['churn'] == False].groupby('contacts')['contacts'].count()
(contacts_churned / contacts_stayed).plot(kind='bar', alpha=0.5)
plt.tight_layout();

In [None]:
plt.figure(figsize=(15, 5))
feature = 'contacts'
# Shared bin edges (20 edges) so the two normalised histograms are comparable.
bins = np.linspace(df[feature].min(), df[feature].max(), 20)
# Non-churners are drawn first, churners on top; both semi-transparent.
for churned in (False, True):
    df[df['churn'] == churned][feature].plot(kind='hist', density=True, alpha=0.5, bins=bins)

---
_This notebook is licensed under a [Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/). Copyright © 2018-2025 [Point 8 GmbH](https://point-8.de)_