In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from imblearn.pipeline import Pipeline, make_pipeline

from imblearn.over_sampling import SMOTE

In [2]:
# load data
df = pd.read_csv('../data/churn_data.csv')
df.shape

(3333, 21)

In [3]:
df.head()

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.9,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False


In [4]:
# remove unique identifiers 
df.pop('phone number')
df.pop('area code')
df.columns

Index(['state', 'account length', 'international plan', 'voice mail plan',
       'number vmail messages', 'total day minutes', 'total day calls',
       'total day charge', 'total eve minutes', 'total eve calls',
       'total eve charge', 'total night minutes', 'total night calls',
       'total night charge', 'total intl minutes', 'total intl calls',
       'total intl charge', 'customer service calls', 'churn'],
      dtype='object')

In [5]:
# check column types
df.dtypes

state                      object
account length              int64
international plan         object
voice mail plan            object
number vmail messages       int64
total day minutes         float64
total day calls             int64
total day charge          float64
total eve minutes         float64
total eve calls             int64
total eve charge          float64
total night minutes       float64
total night calls           int64
total night charge        float64
total intl minutes        float64
total intl calls            int64
total intl charge         float64
customer service calls      int64
churn                        bool
dtype: object

In [6]:
df['international plan'].unique()

array(['no', 'yes'], dtype=object)

In [7]:
# this function takes a dataframe and a column name within that dataframe and converts values to int's 0 and 1
# inputs: data (dataframe), column (string)
# output: data[column] as column of 0's and 1's
# note, operations NOT done inplace

def convert_to_binary(data, column, yes_list=['yes', 'Yes', 'YES', '1']):
    
    if len(data[column].unique()) <= 2:
        
        if data[column].dtype == bool:
            data[column] = data[column].astype(int)
            return data[column]
        
        elif data[column].dtype == object or data[column].dtype == str:
            yes_list = ['yes', 'Yes', 'YES', '1']

            for yes in yes_list:
                if yes in data[column].unique():
                    data[column] = data[column].eq(yes).mul(1)
                    return data[column]
                
                else:
                    return print('Entries could not be converted binary. Check entries or try custom list for keyword argument "yes_list".')
                
        else:
            return print("Entries could not be converted binary, confirm check for NA's or consider OneHotEncoding.")
        
    else:
        return print("Entries could not be converted binary, more than 2 entry types.")

binary_column_list = ['international plan', 'voice mail plan', 'churn']

for column in binary_column_list:
    print(column)
    df[column] = convert_to_binary(df, column)

international plan
voice mail plan
churn


In [8]:
df['churn'].head()

0    0
1    0
2    0
3    0
4    0
Name: churn, dtype: int32

In [9]:
df['international plan'].head()

0    0
1    0
2    0
3    1
4    1
Name: international plan, dtype: int32

In [10]:
# Code below replaced with function "convert_to_binary"
# # clean data
# # convert binaries
# churn_dict = {False: 0, True: 1}
# yes_no_dict = {'no': 0, 'yes': 1}
# df['churn'].replace(churn_dict, inplace=True)
# df['international plan'].replace(yes_no_dict, inplace=True)
# df['voice mail plan'].replace(yes_no_dict, inplace=True)
# df.head()

In [11]:
# split data into target and predictors
y = df.churn
X = df.drop('churn', axis=1)
X.head()

Unnamed: 0,state,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
0,KS,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,OH,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,NJ,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,OH,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,OK,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [12]:
len(X.state.unique().sum())/2

51.0

In [35]:
# from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2021)

In [14]:
# ohe_column_list = ['state']
# X_train_cat = X[ohe_column_list]
# X_train_cat.head()

In [15]:
# X_train_num = X.drop(ohe_column_list, axis=1)
# X_train_num.head()

In [16]:
# This code will onehotencode the categorical columns and return the dataframe
# inputs: X (dataframe), column_list (list of categorical columns)
# returns: X_ohe (one-hot-encoded dataframe)
# note, operations NOT done inplace

def ohe2021(X, column_list):  
    # One hot encode
    valid_columns = []
    
    # remove columns not in X dataframe
    for column in column_list:
        if column in X.columns:
            valid_columns.append(column)
        
        else:
            print(f'Column "{column}" not found in DataFrame')
            
    # get dataframe of categroical columns to oneHotEncode
    X_cat = X[valid_columns]
    X_cat.head()

    # get dataframe of numeric columns to oneHotEncode
    X_num = X.drop(valid_columns, axis=1)
    X_num.head()

    # Finally, One Hot Encode
    # from sklearn.preprocessing import OneHotEncoder
    ohe = OneHotEncoder(sparse=False, drop='first')
    X_cat_oh = pd.DataFrame(ohe.fit_transform(X_cat), index= X_cat.index, columns=ohe.get_feature_names(X_cat.columns))
    X_cat_oh.head()

    # sandwich OHE df with numerical dataframes
    X_ohe = X_num.join(X_cat_oh)
    return X_ohe

ohe_column_list = ['state', 'area code']
X_train = ohe2021(X_train, ohe_column_list)
X_train.head()

Column "area code" not found in DataFrame


Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,53,0,1,18,146.8,107,24.96,310.0,84,26.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1764,77,0,0,0,245.2,87,41.68,254.1,83,21.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1733,130,0,0,0,124.3,70,21.13,270.7,99,23.01,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3168,201,0,0,0,225.9,110,38.4,299.1,86,25.42,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
562,99,0,1,28,200.7,88,34.12,264.2,116,22.46,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [17]:
X_train.shape

(2666, 67)

In [18]:
y_train.shape

(2666,)

In [19]:
# scale data
# function scales dataframe using StandardScaler
# inputs: X (dataframe) 
# returns: X_scaled (dataframe) 
# note, operations NOT done inplace

def scale2021(X):
    ss = StandardScaler()
    X_scaled = pd.DataFrame(ss.fit_transform(X))
    X_scaled.index = X.index
    X_scaled.columns = X.columns
    return X_scaled

X_train_scaled = scale2021(X_train)
X_train_scaled.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
561,-1.215926,-0.328029,1.620867,0.71995,-0.594966,0.32627,-0.594598,2.150149,-0.806009,2.150108,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1764,-0.613634,-0.328029,-0.616954,-0.591007,1.193361,-0.660331,1.192879,1.045323,-0.856365,1.04563,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
1733,0.716426,-0.328029,-0.616954,-0.591007,-1.003882,-1.498942,-1.00405,1.373411,-0.050659,1.373485,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,7.464811,-0.151736,-0.185839,-0.153025
3168,2.498205,-0.328029,-0.616954,-0.591007,0.842602,0.474261,0.842226,1.934718,-0.705295,1.933863,...,-0.131031,-0.126515,6.765496,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,-0.153025
562,-0.061534,-0.328029,1.620867,1.448259,0.384616,-0.611001,0.384666,1.244943,0.805404,1.245598,...,-0.131031,-0.126515,-0.147809,-0.160559,-0.156833,-0.151736,-0.133962,-0.151736,-0.185839,6.5349


In [20]:
X_train_resamp, y_train_resamp = SMOTE().fit_resample(X_train_scaled, y_train)

In [21]:
X_train_resamp.shape

(4540, 67)

In [22]:
y_train_resamp.sum()

2270

In [23]:
# FSM
logreg = LogisticRegression()
logreg.fit(X_train_resamp, y_train_resamp)
logreg.score(X_train_scaled, y_train)

0.7895723930982745

In [24]:
X_test = ohe2021(X_test, ohe_column_list)

Column "area code" not found in DataFrame


In [25]:
# scale data
X_test_scaled = scale2021(X_test)
X_test_scaled
X_test_scaled.head()

Unnamed: 0,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,...,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY
1396,1.020963,3.069507,1.602138,1.403874,0.156787,1.215621,0.156387,-0.765845,0.072073,-0.765108,...,-0.151678,-0.129492,-0.151678,-0.086907,-0.140988,-0.140988,-0.171234,-0.166538,-0.161722,-0.156772
286,0.945304,-0.325785,-0.624166,-0.59489,0.492474,1.11157,0.492637,-1.409263,0.072073,-1.409332,...,-0.151678,-0.129492,-0.151678,-0.086907,-0.140988,-0.140988,-0.171234,-0.166538,-0.161722,-0.156772
1277,-0.794856,-0.325785,1.602138,1.181789,-0.090662,0.903467,-0.090724,1.522296,0.766623,1.521888,...,-0.151678,-0.129492,-0.151678,-0.086907,-0.140988,-0.140988,-0.171234,-0.166538,-0.161722,-0.156772
3001,0.44091,-0.325785,-0.624166,-0.59489,-1.184042,-0.969455,-1.184101,-0.245635,0.369738,-0.245127,...,-0.151678,-0.129492,-0.151678,-0.086907,-0.140988,-0.140988,-0.171234,-0.166538,-0.161722,-0.156772
13,-0.113924,-0.325785,-0.624166,-0.59489,-0.462795,-0.657301,-0.463081,0.929726,-1.267417,0.930582,...,-0.151678,-0.129492,-0.151678,-0.086907,-0.140988,-0.140988,-0.171234,-0.166538,-0.161722,-0.156772


In [26]:
logreg.score(X_test_scaled, y_test)

0.7721139430284858

In [28]:
smoted_scaled_pipeline = Pipeline([('ohe', OneHotEncoder(dtype=object)),
                                   ('ss', StandardScaler()),
                                   ('smote', SMOTE()),
                                   ('logreg', LogisticRegression())])

In [47]:
scaled_pipeline = Pipeline([('ss', StandardScaler()),
                         ('logreg', LogisticRegression())])

In [46]:
cross_val_score(scaled_pipeline, X_train, y_train) # works when onehotencoder excluded from pipeline

NameError: name 'scaled_pipeline' is not defined

In [28]:
columnTransform = ColumnTransformer([('encoder', OneHotEncoder(), ['state'])], remainder='passthrough')

In [33]:
X_train = columnTransform.fit_transform(X_train)

In [36]:
X_train.head()

Unnamed: 0,state,account length,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls
561,RI,53,0,1,18,146.8,107,24.96,310.0,84,26.35,178.7,130,8.04,7.2,7,1.94,0
1764,CT,77,0,0,0,245.2,87,41.68,254.1,83,21.6,239.4,91,10.77,7.5,4,2.03,0
1733,WA,130,0,0,0,124.3,70,21.13,270.7,99,23.01,239.5,83,10.78,3.5,6,0.95,0
3168,TX,201,0,0,0,225.9,110,38.4,299.1,86,25.42,251.3,81,11.31,11.2,4,3.02,1
562,WY,99,0,1,28,200.7,88,34.12,264.2,116,22.46,172.7,102,7.77,9.1,5,2.46,1


In [44]:
my_column_trans = make_column_transformer((OneHotEncoder(), ['state']))

In [45]:
smoted_scaled_pipeline = Pipeline([('col_transformer', my_column_trans),
                                   ('ss', StandardScaler()),
                                   ('smote', SMOTE()),
                                   ('logreg', LogisticRegression())])

In [43]:
cross_val_score(smoted_scaled_pipeline, X_train, y_train) # works when onehotencoder excluded from pipeline

Traceback (most recent call last):
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\imblearn\pipeline.py", line 277, in fit
    Xt, yt, fit_params = self._fit(X, y, **fit_params)
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\imblearn\pipeline.py", line 229, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\joblib\memory.py", line 352, in __call__
    return self.func(*args, **kwargs)
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "C:\Users\samjd\anaconda3\envs\oy-env\lib\site-packages\sklearn\base.py", line 693, in fit_transform
    return self.fit(X, y, **fit_params).tran

array([nan, nan, nan, nan, nan])

In [48]:
preprocessing = make_column_transformer((OneHotEncoder(), make_column_selector(dtype_include=object)),
                                       (StandardScaler(), make_column_selector(dtype_include=np.number),
                                        SMOTE()))
#preprocessing = make_column_transformer((OneHotEncoder(), make_column_selector(dtype_include=object)),
                                                # StandardScaler(),
                                                # SMOTE(),
                                                # LogisticRegression())
preprocessing

ColumnTransformer(transformers=[('onehotencoder', OneHotEncoder(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001E87D5CEAC0>),
                                ('standardscaler', StandardScaler(),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x000001E87D5CE070>)])

In [49]:
preprocessing.fit_transform(X_train)

<2666x68 sparse matrix of type '<class 'numpy.float64'>'
	with 47988 stored elements in Compressed Sparse Row format>

In [57]:
dt_pipeline = make_pipeline(preprocessing, DecisionTreeClassifier(random_state=2021))
rf_pipeline = make_pipeline(preprocessing, RandomForestClassifier(random_state=2021))

In [58]:
cross_val_score(dt_pipeline, X_train, y_train)

array([0.90449438, 0.89681051, 0.91744841, 0.92120075, 0.93058161])