# Baseline Logistic Regression
ref: https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as pyplot 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib import pyplot


# Ask IPython to display the value of every top-level expression in a cell,
# not just the last one (so a cell ending in `df.head()` followed by
# `df.shape` shows both results).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.preprocessing import OrdinalEncoder

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere under the Kaggle input
# directory, in the order os.walk visits them.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1. Import Data

In [None]:
# Load the Telco customer-churn CSV from the Kaggle input directory.
churn_csv_path = '/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
data_df = pd.read_csv(churn_csv_path)

# Quick sanity check: preview the first rows, then the (rows, columns) shape.
data_df.head()
data_df.shape

In [None]:
# Preview the first rows of the loaded DataFrame.
data_df.head()

In [None]:
# Feature engineering: ordinal-encode every column that is not listed below.
# The listed columns (numeric values and the row ID) are left untouched.
excluded_cols = {'MonthlyCharges', 'SeniorCitizen', 'tenure', 'TotalCharges', 'customerID'}
categorical_col = [name for name in data_df.columns if name not in excluded_cols]

ord_enc = OrdinalEncoder()

# The encoder is re-fitted for each column, so every column gets its own
# category-to-number mapping; the column name is printed as progress output.
for col in categorical_col:
    print(col)
    single_column_frame = data_df[[col]]  # 2-D input, as the encoder expects
    data_df[col] = ord_enc.fit_transform(single_column_frame)

In [None]:
# Preview the first rows again to inspect the columns after ordinal encoding.
data_df.head()

## 2. Train/Test Split 

In [None]:
# Start from every column, then drop the ones that must not be model inputs.
feature_cols = list(data_df.columns)

non_feature_cols = (
    'customerID',    # unique ID so not needed
    'Churn',         # will be used as y label
    'TotalCharges',  # roughly proportional to MonthlyCharges and tenure
)
for dropped_col in non_feature_cols:
    # list.remove raises ValueError if the column is unexpectedly absent
    feature_cols.remove(dropped_col)

In [None]:
# Design matrix (selected feature columns) and target (the encoded Churn label).
X, y = data_df[feature_cols], data_df['Churn']

In [None]:
# Hold out 25% of the rows for testing, stratifying on the Churn label so the
# train and test sets keep the same class proportions.
# random_state is fixed so the split -- and every score and curve computed
# from it -- is reproducible from run to run.
trainX, testX, trainy, testy = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# "No skill" baseline: the same constant score (0) for every test row,
# i.e. always predicting the majority class.
ns_probs = [0] * len(testy)

## 3. Logistic Regression (Benchmark) Model

In [None]:
# Fit the benchmark logistic-regression model on the training split.
# max_iter is raised above scikit-learn's default of 100: the features are
# not scaled (e.g. tenure and MonthlyCharges span much larger ranges than the
# encoded categoricals), which can keep lbfgs from converging within the
# default iteration budget.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(trainX, trainy)

# predict_proba returns one column per class, ordered as in model.classes_;
# keep only column 1, the probability of the positive outcome.
lr_probs = model.predict_proba(testX)[:, 1]

# ROC AUC for the constant baseline scores and for the model's probabilities
ns_auc = roc_auc_score(y_true=testy, y_score=ns_probs)
lr_auc = roc_auc_score(y_true=testy, y_score=lr_probs)

# report both scores to three decimals
print('No Skill: ROC AUC=%.3f' % ns_auc)
print('Logistic: ROC AUC=%.3f' % lr_auc)

# ROC curve points (false-positive rate, true-positive rate) for the baseline
# and for the model; the third return value (thresholds) is not needed here.
ns_fpr, ns_tpr, _ = roc_curve(testy, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(testy, lr_probs)

# Draw both curves on the same axes: dashed line for the baseline,
# dot markers for the logistic-regression model.
baseline_label = 'Predicting ALL as Not Churned (majority class)'
pyplot.plot(ns_fpr, ns_tpr, label=baseline_label, linestyle='--')
pyplot.plot(lr_fpr, lr_tpr, label='Logistic', marker='.')

# Name the axes.
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# Add the legend built from the labels above.
pyplot.legend()

# Render the figure.
pyplot.show()