## Naive Bayes Classifier

In [12]:
import pandas as pd

# import data
df = pd.read_csv('../data/churn.txt', delimiter=',')

In [13]:
df.head()

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False.
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False.


For the sake of this demonstration, we will only be using Int'l Plan and VMail Plan.

In [17]:
df2 = df[['Int\'l Plan', 'VMail Plan', 'Churn?']]
df2.head()

Unnamed: 0,Int'l Plan,VMail Plan,Churn?
0,no,yes,False.
1,no,yes,False.
2,no,no,False.
3,yes,no,False.
4,yes,no,False.


In [18]:
# rename columns
df2.rename(columns= 
           {'Int\'l Plan': 'International Plan',
            'VMail Plan': 'Voicemail Plan',
            'Churn?': 'Churn'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.rename(columns=


In [19]:
df2.head()

Unnamed: 0,International Plan,Voicemail Plan,Churn
0,no,yes,False.
1,no,yes,False.
2,no,no,False.
3,yes,no,False.
4,yes,no,False.


In [20]:
df2['Churn'] = df2['Churn'].map({'False.': 0,
                                 'True.': 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Churn'] = df2['Churn'].map({'False.': 0,


In [21]:
df2

Unnamed: 0,International Plan,Voicemail Plan,Churn
0,no,yes,0
1,no,yes,0
2,no,no,0
3,yes,no,0
4,yes,no,0
...,...,...,...
3328,no,yes,0
3329,no,no,0
3330,no,no,0
3331,yes,no,0


In [22]:
df2['Churn'].value_counts()

Churn
0    2850
1     483
Name: count, dtype: int64

In [23]:
df2['International Plan'].value_counts()

International Plan
no     3010
yes     323
Name: count, dtype: int64

In [24]:
df2['Voicemail Plan'].value_counts()

Voicemail Plan
no     2411
yes     922
Name: count, dtype: int64

In [50]:
p_churn = 483 / (3333)
p_churn

0.14491449144914492

In [29]:
p_vm = 922 / 3333
p_vm

0.27662766276627665

In [30]:
p_ip = 323 / 3333
p_ip

0.0969096909690969

In [51]:
priors = pd.DataFrame({
    'Churn Prob': p_churn,
    'Voicemail Prob': p_vm,
    'International Prob': p_ip
}, index = [0])

priors

Unnamed: 0,Churn Prob,Voicemail Prob,International Prob
0,0.144914,0.276628,0.09691


Now we will find conditional probabilities starting with the international plan given Churn = False and the voicemail plan given churn = False. The first step is to filter down to churn = False, then from there count the number of individuals who have the international plan.

In [48]:
# international plan yes, churn 0
IP_and_churn_false = df2.loc[(df2['International Plan'] == 'yes') & (df2['Churn'] == 0)].shape[0]
i_c_no = round(IP_and_churn_false / 2850, 4)
print(f'The probability of someone who has churned and has the international plan is {i_c_no}.')

The probability of someone who has churned and has the international plan is 0.0653.


In [64]:
# voicemail plan yes, churn 0
v_and_churn_false = df2.loc[(df2['Voicemail Plan'] == 'yes') & (df2['Churn'] == 0)].shape[0]
v_c_no = round(v_and_churn_false / 2850, 4)
print(f'The probability of someone who has not churned and has the international plan is {v_c_no}.')

The probability of someone who has not churned and has the international plan is 0.2954.


In [90]:
# international plan yes, churn yes
ip_churn_true = df2[(df2['International Plan'] == 'yes') & (df2['Churn'] == 1)].shape[0]
ip_churn_true
IP_churn = ip_churn_true/483

# 483 people have churned and 137 have the plan
print(f'The probability that someone that churned also had the international plan is {IP_churn}')

The probability that someone that churned also had the international plan is 0.2836438923395445


In [95]:
# voicemail yes, churn yes
vm_churn_true = df2[(df2['Voicemail Plan'] == 'yes') & (df2['Churn'] == 1)].shape[0]
vm_churn_true
VM_churn= vm_churn_true/483

# 80 people have out of the 438 that churned
print(f' The probability that someone who churned had the voicemail plan is {VM_churn}')

 The probability that someone who churned had the voicemail plan is 0.16563146997929606


In [96]:
IP_churn

0.2836438923395445

In [98]:
IP_churn * VM_churn * p_churn

0.006808134229572026

##### Bayes Theorem

Now we can find the probability that someone churned given they had the voicemail plan, and the probability that someone churned given they had the international plan.

In [73]:
churned_given_vm = (round(vm_churn_true/483, 4) * p_churn) / (p_vm)
churned_given_vm

0.08675140997830802

In [76]:
churned_given_ip = round((ip_churn_true/483) * p_churn / (p_ip), 4)
churned_given_ip

0.4241

probability that a new customer with both plans will churn

In [79]:
(churned_given_ip * churned_given_vm) 

0.03679127297180043

In [80]:
0.2836 * 0.1656

0.046964160000000005

In [86]:
36 / (36 + 447)

0.07453416149068323

In [84]:
0.03681188 * 0.1449

0.0053340414119999995