In [2]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn records; the first CSV row supplies the column names.
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces -> underscores, apostrophes removed,
# leading/trailing question marks stripped, then lower-cased.
churn.columns = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", '')
    .str.strip('?')
    .str.lower()
)

# Target: 1 when the raw label is exactly 'True.', otherwise 0.
churn['churn'] = (churn['churn'] == 'True.').astype(int)

# Derived feature: sum of the four per-period charges.
# It is computed here but is not part of this cell's predictor list.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)

# Plan flags: 1 for 'yes', 0 for anything else.
for plan_column in ('intl_plan', 'vmail_plan'):
    churn[plan_column] = np.where(churn[plan_column] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# Predictor set recommended by R
# (presumably the result of a variable-selection run in R — confirm).
predictor_columns = [
    'intl_plan', 'vmail_plan', 'vmail_message', 'day_charge', 'eve_mins',
    'night_charge', 'intl_calls', 'intl_charge', 'custserv_calls',
]
independent_variables = churn[predictor_columns]

# Logit does not add an intercept by itself, so prepend a constant column.
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_spec = sm.Logit(dependent_variable, independent_variables_with_constant)
logit_model = logit_spec.fit()

# Score the fitted model on the very rows it was trained on, so the accuracy
# printed below is an in-sample figure, not a hold-out estimate.
new_observations = churn.loc[:, independent_variables.columns]
new_observations_with_constant = sm.add_constant(new_observations, prepend=True)
y_predicted = logit_model.predict(new_observations_with_constant)

# Turn probabilities into 0.0 / 1.0 labels. np.round rounds halves to even,
# exactly like the built-in round(), so a probability of 0.5 maps to 0.0.
predicted_labels = np.round(np.asarray(y_predicted, dtype=float), 0)
actual_labels = np.asarray(dependent_variable)
y_predicted_rounded = predicted_labels.tolist()

# Compare by position. Plain arrays avoid label-based Series lookups, which
# would only line up with list positions while the index is 0..n-1.
total_number = len(y_predicted_rounded)
total_correct = int((predicted_labels == actual_labels).sum())

# Preview: row number | predicted label | actual label
PREVIEW_ROWS = 20
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:PREVIEW_ROWS], actual_labels[:PREVIEW_ROWS]), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary lines: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')



Optimization terminated successfully.
         Current function value: 0.324276
         Iterations 7
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|1.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2877
정답률: 86.31863186318633 %


In [3]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn records; the first CSV row supplies the column names.
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces -> underscores, apostrophes removed,
# leading/trailing question marks stripped, then lower-cased.
churn.columns = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", '')
    .str.strip('?')
    .str.lower()
)

# Target: 1 when the raw label is exactly 'True.', otherwise 0.
churn['churn'] = (churn['churn'] == 'True.').astype(int)

# Derived feature: sum of the four per-period charges.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)

# Plan flags: 1 for 'yes', 0 for anything else.
for plan_column in ('intl_plan', 'vmail_plan'):
    churn[plan_column] = np.where(churn[plan_column] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# Predictor set recommended by R, plus the derived total_charges column.
# NOTE(review): total_charges is the sum of day_charge, eve_charge,
# night_charge and intl_charge, and three of those are already predictors
# here, so the added column largely repeats information the model has.
predictor_columns = [
    'intl_plan', 'vmail_plan', 'vmail_message', 'day_charge', 'eve_mins',
    'night_charge', 'intl_calls', 'intl_charge', 'custserv_calls',
    'total_charges',
]
independent_variables = churn[predictor_columns]

# Logit does not add an intercept by itself, so prepend a constant column.
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_spec = sm.Logit(dependent_variable, independent_variables_with_constant)
logit_model = logit_spec.fit()

# Score the fitted model on the very rows it was trained on, so the accuracy
# printed below is an in-sample figure, not a hold-out estimate.
new_observations = churn.loc[:, independent_variables.columns]
new_observations_with_constant = sm.add_constant(new_observations, prepend=True)
y_predicted = logit_model.predict(new_observations_with_constant)

# Turn probabilities into 0.0 / 1.0 labels. np.round rounds halves to even,
# exactly like the built-in round(), so a probability of 0.5 maps to 0.0.
predicted_labels = np.round(np.asarray(y_predicted, dtype=float), 0)
actual_labels = np.asarray(dependent_variable)
y_predicted_rounded = predicted_labels.tolist()

# Compare by position. Plain arrays avoid label-based Series lookups, which
# would only line up with list positions while the index is 0..n-1.
total_number = len(y_predicted_rounded)
total_correct = int((predicted_labels == actual_labels).sum())

# Preview: row number | predicted label | actual label
PREVIEW_ROWS = 20
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:PREVIEW_ROWS], actual_labels[:PREVIEW_ROWS]), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary lines: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')


Optimization terminated successfully.
         Current function value: 0.324244
         Iterations 8
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|1.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2876
정답률: 86.28862886288628 %


* A/B 테스트 결과  
intl_plan, vmail_plan은 모델 성능을 높이는 데 기여했으나, total_charges는 기여하지 않은 것으로 판단 (total_charges 추가 시 정답률 86.32% → 86.29%).

---

In [5]:
# Preview the first five rows of the prepared frame. This cell defines nothing
# itself: it shows whatever `churn` the most recently executed cell left behind.
# NOTE(review): the recorded output has an 'Unnamed: 0' column, which suggests
# churn.csv carries a saved index column; reading it with index_col=0 would
# drop it — confirm against the file.
churn.head(n=5)

Unnamed: 0,state,account_length,area_code,phone,intl_plan,vmail_plan,vmail_message,day_mins,day_calls,day_charge,...,eve_charge,night_mins,night_calls,night_charge,intl_mins,intl_calls,intl_charge,custserv_calls,churn,total_charges
0,KS,128,415,382-4657,0,1,25,265.1,110,45.07,...,16.78,244.7,91,11.01,10.0,3,2.7,1,0,75.56
1,OH,107,415,371-7191,0,1,26,161.6,123,27.47,...,16.62,254.4,103,11.45,13.7,3,3.7,1,0,59.24
2,NJ,137,415,358-1921,0,0,0,243.4,114,41.38,...,10.3,162.6,104,7.32,12.2,5,3.29,0,0,62.29
3,OH,84,408,375-9999,1,0,0,299.4,71,50.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,0,66.8
4,OK,75,415,330-6626,1,0,0,166.7,113,28.34,...,12.61,186.9,121,8.41,10.1,3,2.73,3,0,52.09


In [15]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn records; the first CSV row supplies the column names.
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces -> underscores, apostrophes removed,
# leading/trailing question marks stripped, then lower-cased.
churn.columns = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", '')
    .str.strip('?')
    .str.lower()
)

# Target: 1 when the raw label is exactly 'True.', otherwise 0.
churn['churn'] = (churn['churn'] == 'True.').astype(int)

# Derived feature: sum of the four per-period charges.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)

# Plan flags: 1 for 'yes', 0 for anything else.
for plan_column in ('intl_plan', 'vmail_plan'):
    churn[plan_column] = np.where(churn[plan_column] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# Experiment label from the author: "R recommendation - charge series + total_charges".
# Author's scratch list of column names (kept verbatim):
# account_length, area_code, phone, day_mins, day_calls, day_charge, Eve Calls, Eve Charge, eve_charge, night_mins, night_calls, night_charge, intl_mins, intl_charge,
#
# NOTE(review): the list below still contains day_charge, eve_charge,
# night_charge and intl_charge, and total_charges is exactly their sum, so the
# predictor columns are linearly dependent. The output recorded with this cell
# shows 35 iterations and no "Optimization terminated successfully." line,
# which looks like the optimizer hitting its iteration cap — confirm, and
# drop either total_charges or the individual charge columns.
predictor_columns = [
    'intl_plan', 'vmail_plan', 'day_mins', 'day_charge', 'eve_mins',
    'eve_charge', 'night_mins', 'night_charge', 'intl_mins', 'intl_calls',
    'intl_charge', 'custserv_calls', 'total_charges',
]
independent_variables = churn[predictor_columns]

# Logit does not add an intercept by itself, so prepend a constant column.
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_spec = sm.Logit(dependent_variable, independent_variables_with_constant)
logit_model = logit_spec.fit()

# Score the fitted model on the very rows it was trained on, so the accuracy
# printed below is an in-sample figure, not a hold-out estimate.
new_observations = churn.loc[:, independent_variables.columns]
new_observations_with_constant = sm.add_constant(new_observations, prepend=True)
y_predicted = logit_model.predict(new_observations_with_constant)

# Turn probabilities into 0.0 / 1.0 labels. np.round rounds halves to even,
# exactly like the built-in round(), so a probability of 0.5 maps to 0.0.
predicted_labels = np.round(np.asarray(y_predicted, dtype=float), 0)
actual_labels = np.asarray(dependent_variable)
y_predicted_rounded = predicted_labels.tolist()

# Compare by position. Plain arrays avoid label-based Series lookups, which
# would only line up with list positions while the index is 0..n-1.
total_number = len(y_predicted_rounded)
total_correct = int((predicted_labels == actual_labels).sum())

# Preview: row number | predicted label | actual label
PREVIEW_ROWS = 20
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:PREVIEW_ROWS], actual_labels[:PREVIEW_ROWS]), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary lines: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')

         Current function value: 0.324734
         Iterations: 35
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|0.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2879
정답률: 86.37863786378638 %




In [36]:
# 목적: 로지스틱 모델을 통해 이탈 고객 예측하기
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Load the churn records; the first CSV row supplies the column names.
churn = pd.read_csv('churn.csv', sep=',', header=0)

# Normalise the headers: spaces -> underscores, apostrophes removed,
# leading/trailing question marks stripped, then lower-cased.
churn.columns = (
    churn.columns
    .str.replace(' ', '_')
    .str.replace("'", '')
    .str.strip('?')
    .str.lower()
)

# Target: 1 when the raw label is exactly 'True.', otherwise 0.
churn['churn'] = (churn['churn'] == 'True.').astype(int)

# Derived feature: sum of the four per-period charges.
churn['total_charges'] = (
    churn['day_charge']
    + churn['eve_charge']
    + churn['night_charge']
    + churn['intl_charge']
)

# Plan flags: 1 for 'yes', 0 for anything else.
for plan_column in ('intl_plan', 'vmail_plan'):
    churn[plan_column] = np.where(churn[plan_column] == 'yes', 1, 0)

# Fit a logistic regression model
dependent_variable = churn['churn']

# Experiment label from the author: "R recommendation - charge series + total_charges".
# Author's scratch list of column names (kept verbatim):
# account_length, area_code, phone, , 'vmail_message', day_mins,  day_calls, eve_calls, , eve_charge, night_mins, night_calls, night_charge, intl_mins, intl_charge,
#
# NOTE(review): night_charge is not in the list below, so total_charges is no
# longer an exact sum of listed predictors. day_charge, eve_charge and
# intl_charge are still listed next to total_charges, and the *_mins columns
# presumably track the matching *_charge columns closely — verify before
# reading anything into individual coefficients.
predictor_columns = [
    'intl_plan', 'vmail_plan', 'day_mins', 'day_charge',
    'eve_charge', 'night_mins', 'intl_mins',
    'intl_calls', 'intl_charge', 'custserv_calls', 'total_charges',
]
independent_variables = churn[predictor_columns]

# Logit does not add an intercept by itself, so prepend a constant column.
independent_variables_with_constant = sm.add_constant(independent_variables, prepend=True)
logit_spec = sm.Logit(dependent_variable, independent_variables_with_constant)
logit_model = logit_spec.fit()

# Score the fitted model on the very rows it was trained on, so the accuracy
# printed below is an in-sample figure, not a hold-out estimate.
new_observations = churn.loc[:, independent_variables.columns]
new_observations_with_constant = sm.add_constant(new_observations, prepend=True)
y_predicted = logit_model.predict(new_observations_with_constant)

# Turn probabilities into 0.0 / 1.0 labels. np.round rounds halves to even,
# exactly like the built-in round(), so a probability of 0.5 maps to 0.0.
predicted_labels = np.round(np.asarray(y_predicted, dtype=float), 0)
actual_labels = np.asarray(dependent_variable)
y_predicted_rounded = predicted_labels.tolist()

# Compare by position. Plain arrays avoid label-based Series lookups, which
# would only line up with list positions while the index is 0..n-1.
total_number = len(y_predicted_rounded)
total_correct = int((predicted_labels == actual_labels).sum())

# Preview: row number | predicted label | actual label
PREVIEW_ROWS = 20
for row_number, (predicted, actual) in enumerate(
        zip(y_predicted_rounded[:PREVIEW_ROWS], actual_labels[:PREVIEW_ROWS]), start=1):
    print(f'{row_number}\t|{predicted}\t|{actual}')

# Summary lines: number of observations, number correct, accuracy in percent.
print(f'\n전체 관찰 계수: {total_number}')
print(f'정답수: {total_correct}')
print(f'정답률: {(total_correct/total_number)*100} %')

Optimization terminated successfully.
         Current function value: 0.324759
         Iterations 7
1	|0.0	|0
2	|0.0	|0
3	|0.0	|0
4	|0.0	|0
5	|1.0	|0
6	|0.0	|0
7	|0.0	|0
8	|0.0	|0
9	|0.0	|0
10	|0.0	|0
11	|0.0	|1
12	|0.0	|0
13	|0.0	|0
14	|0.0	|0
15	|0.0	|0
16	|1.0	|1
17	|0.0	|0
18	|0.0	|0
19	|0.0	|0
20	|0.0	|0

전체 관찰 계수: 3333
정답수: 2878
정답률: 86.34863486348635 %
