In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the customer-churn CSV from its remote location into a DataFrame
CHURN_CSV_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m13/lesson_1/datasets/customer-churn.csv'
df = pd.read_csv(CHURN_CSV_URL)
df

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.640,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.520,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.020,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.980,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.210,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.320,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.640,0


In [3]:
# Build the feature set X: every column of df except the "Churn" label
X = df.drop(columns='Churn')
X.head()

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.64
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.52
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.02
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805


In [5]:
# Select the "Churn" column as the target y
y = df.loc[:, 'Churn']
y

0       0
1       0
2       0
3       0
4       0
       ..
3145    0
3146    0
3147    0
3148    0
3149    1
Name: Churn, Length: 3150, dtype: int64

In [18]:
# Split the data into training and testing sets using random_state=1.
# The result is unpacked in the order: X train, X test, y train, y test.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [19]:
# Standardize the features with StandardScaler().
# The scaler is fitted on the training split only and then applied to it.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-1.0715273 ,  3.41042755, -2.78076233, ..., -0.57572071,
        -0.10596967, -0.83679706],
       [-0.0998478 , -0.29321837,  0.63628839, ..., -0.57572071,
        -0.10596967, -0.53841325],
       [-0.65509323, -0.29321837, -2.30944499, ..., -0.57572071,
        -0.10596967, -0.73208354],
       ...,
       [-0.0998478 , -0.29321837, -2.30944499, ..., -0.57572071,
         1.6084163 , -0.16953522],
       [-0.93271594, -0.29321837, -0.07068762, ..., -0.57572071,
        -0.67743166, -0.58547315],
       [-0.51628187, -0.29321837,  0.40062972, ...,  1.73695331,
        -0.10596967, -0.74623402]])

In [20]:
# Transform the test dataset based on the fit from the training dataset.
# Only transform() is called here: `scaler` was fitted on X_train, so the
# test split is scaled with the training split's statistics and is never
# used for fitting.
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-1.0715273 , -0.29321837,  0.51845905, ...,  1.73695331,
        -0.67743166, -0.90540775],
       [-1.0715273 , -0.29321837, -1.0133223 , ..., -0.57572071,
        -0.10596967, -0.70921332],
       [-0.51628187, -0.29321837,  0.51845905, ...,  1.73695331,
        -0.10596967, -0.49221386],
       ...,
       [-0.37747051, -0.29321837, -0.18851696, ..., -0.57572071,
        -0.67743166,  0.964721  ],
       [-0.23865915, -0.29321837,  0.63628839, ..., -0.57572071,
        -0.67743166,  1.75732929],
       [-0.23865915, -0.29321837,  1.22543506, ..., -0.57572071,
         1.6084163 ,  1.59138628]])

In [21]:
# Create a `LogisticRegression` model instance (default settings) and
# assign it to the variable `lr`.
lr = LogisticRegression()

# Fit the model on the scaled training features and the training labels.
# NOTE: `X_train_scaled` is rebound by the MinMaxScaler cell further down, so
# run the cells top to bottom for `lr` to be trained on the StandardScaler output.
lr.fit(X_train_scaled, y_train)

In [22]:
# Score the model on the training and testing splits, then report both values
lr_train_accuracy = lr.score(X_train_scaled, y_train)
lr_test_accuracy = lr.score(X_test_scaled, y_test)
print(f"Training Data Score: {lr_train_accuracy}")
print(f"Testing Data Score: {lr_test_accuracy}")

Training Data Score: 0.8966977138018628
Testing Data Score: 0.8934010152284264


In [23]:
# Alternative scaling: MinMaxScaler(), fitted on the training split only.
# NOTE: this rebinds `X_train_scaled`, replacing the StandardScaler result
# computed earlier in the notebook.
minmax_scaler = MinMaxScaler()
minmax_scaler.fit(X_train)
X_train_scaled = minmax_scaler.transform(X_train)
X_train_scaled

array([[0.        , 1.        , 0.13636364, ..., 0.        , 0.375     ,
        0.01657061],
       [0.19444444, 0.        , 0.79545455, ..., 0.        , 0.375     ,
        0.08863519],
       [0.08333333, 0.        , 0.22727273, ..., 0.        , 0.375     ,
        0.04186064],
       ...,
       [0.19444444, 0.        , 0.22727273, ..., 0.        , 0.75      ,
        0.17772528],
       [0.02777778, 0.        , 0.65909091, ..., 0.        , 0.25      ,
        0.07726945],
       [0.11111111, 0.        , 0.75      , ..., 1.        , 0.375     ,
        0.03844307]])

In [24]:
# Transform the test dataset based on the fit from the training dataset.
# `minmax_scaler` was fitted on X_train, so only transform() is applied here.
# NOTE: this rebinds `X_test_scaled`, replacing the StandardScaler version.
X_test_scaled = minmax_scaler.transform(X_test)
X_test_scaled

array([[0.        , 0.        , 0.77272727, ..., 1.        , 0.25      ,
        0.        ],
       [0.        , 0.        , 0.47727273, ..., 0.        , 0.375     ,
        0.04738417],
       [0.11111111, 0.        , 0.77272727, ..., 1.        , 0.375     ,
        0.0997931 ],
       ...,
       [0.13888889, 0.        , 0.63636364, ..., 0.        , 0.25      ,
        0.45166676],
       [0.16666667, 0.        , 0.79545455, ..., 0.        , 0.25      ,
        0.64309466],
       [0.16666667, 0.        , 0.90909091, ..., 0.        , 0.75      ,
        0.6030167 ]])

In [25]:
# Create a second `LogisticRegression` model instance (default settings) and
# assign it to the variable `lr2`, separate from the earlier `lr` model.
lr2 = LogisticRegression()

# Fit the model on `X_train_scaled`, which in a top-to-bottom run holds the
# MinMaxScaler output at this point.
lr2.fit(X_train_scaled, y_train)

In [26]:
# Score the second model on the training and testing splits, then report both values
lr2_train_accuracy = lr2.score(X_train_scaled, y_train)
lr2_test_accuracy = lr2.score(X_test_scaled, y_test)
print(f"Training Data Score: {lr2_train_accuracy}")
print(f"Testing Data Score: {lr2_test_accuracy}")

Training Data Score: 0.8924640135478408
Testing Data Score: 0.8972081218274112


**Which scaler instance produces a better accuracy score on the scaled testing data?**

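Answer: Both scalers give nearly the same result. `MinMaxScaler` is marginally higher on the testing data (about 0.8972 versus about 0.8934 for `StandardScaler`), but the difference is too small to call either scaler clearly better.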