In [1]:
# test harness and baseline model evaluation for the german credit dataset
from collections import Counter
from numpy import mean
from numpy import std
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.dummy import DummyClassifier

# load the dataset
def load_dataset(full_path):
	# load the dataset as a numpy array
	dataframe = read_csv(full_path, header=None)
	# split into inputs and outputs
	last_ix = len(dataframe.columns) - 1
	X, y = dataframe.drop(last_ix, axis=1), dataframe[last_ix]
	# select categorical features
	cat_ix = X.select_dtypes(include=['object', 'bool']).columns
	# one hot encode cat features only
	ct = ColumnTransformer([('o',OneHotEncoder(),cat_ix)], remainder='passthrough')
	X = ct.fit_transform(X)
	# label encode the target variable to have the classes 0 and 1
	y = LabelEncoder().fit_transform(y)
	return X, y

# calculate f2 score
def f2(y_true, y_pred):
	return fbeta_score(y_true, y_pred, beta=2)

# evaluate a model
def evaluate_model(X, y, model):
	# define evaluation procedure
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	# define the model evaluation metric
	metric = make_scorer(f2)
	# evaluate model
	scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-1)
	return scores

# define the location of the dataset
full_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# load the dataset
X, y = load_dataset(full_path)
# summarize the loaded dataset
print(X.shape, y.shape, Counter(y))
# define the reference model
model = DummyClassifier(strategy='constant', constant=1)
# evaluate the model
scores = evaluate_model(X, y, model)
# summarize performance
print('Mean F2: %.3f (%.3f)' % (mean(scores), std(scores)))

(7044, 15295) (7044,) Counter({1: 5174, 2: 1869, 0: 1})




Mean F2: nan (nan)


Traceback (most recent call last):
  File "/Users/yanhanjun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/Users/yanhanjun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/var/folders/8t/3xrvdszj1g75llc2jb40dnvw0000gn/T/ipykernel_10918/985962909.py", line 33, in f2
  File "/Users/yanhanjun/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/Users/yanhanjun/Library/Python/3.9/lib/python/site-packages/sklearn/metrics/_classification.py", line 1411, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/Users/yanhanjun/Library/Python/3.9/lib/python/site-packages/sklearn/utils/_param_validation.py", line 184, in wrapper
    return func(*args, **kwargs)
  File "/Users/yanha

In [2]:
# load and summarize the dataset
from pandas import read_csv
from collections import Counter
# define the dataset location
filename = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# load the csv file as a data frame
dataframe = read_csv(filename, header=None, na_values='?')
# drop rows with missing
dataframe = dataframe.dropna()
# summarize the shape of the dataset
print(dataframe.shape)
# summarize the class distribution
target = dataframe.values[:,-1]
counter = Counter(target)
for k,v in counter.items():
	per = v / len(target) * 100
	print('Class=%s, Count=%d, Percentage=%.3f%%' % (k, v, per))

(7044, 21)
Class=Churn, Count=1, Percentage=0.014%
Class=No, Count=5174, Percentage=73.453%
Class=Yes, Count=1869, Percentage=26.533%


In [5]:
# create histograms of numeric input variables
from pandas import read_csv
from matplotlib import pyplot
# define the dataset location
filename = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
# load the csv file as a data frame
df = read_csv(filename, header=None, na_values='?')
# drop rows with missing
df = df.dropna()
# select columns with numerical data types
num_ix = df.select_dtypes(include=['int64', 'float64']).columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7044 entries, 0 to 7043
Data columns (total 21 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7044 non-null   object
 1   1       7044 non-null   object
 2   2       7044 non-null   object
 3   3       7044 non-null   object
 4   4       7044 non-null   object
 5   5       7044 non-null   object
 6   6       7044 non-null   object
 7   7       7044 non-null   object
 8   8       7044 non-null   object
 9   9       7044 non-null   object
 10  10      7044 non-null   object
 11  11      7044 non-null   object
 12  12      7044 non-null   object
 13  13      7044 non-null   object
 14  14      7044 non-null   object
 15  15      7044 non-null   object
 16  16      7044 non-null   object
 17  17      7044 non-null   object
 18  18      7044 non-null   object
 19  19      7044 non-null   object
 20  20      7044 non-null   object
dtypes: object(21)
memory usage: 1.1+ MB
