In [1]:
# Import refill data
import numpy as np
import pandas as pd
refill_data = pd.read_csv('refill_data.csv')
refill_data.describe()

Unnamed: 0,RefillNumber,RefillAmount,AdvancedNotified,KitPurchasePrice,CustomerIsCorporate,PreviousRefillsAmount,DaysSinceLastRefill,DaysSinceKitPurchase
count,47755.0,47755.0,47755.0,35350.0,47755.0,47755.0,44667.0,47755.0
mean,8.749702,223.675246,0.461397,376.732818,0.133808,1684.394852,120.098731,1914.410931
std,5.658008,164.792918,0.498513,315.807762,0.340449,1131.699327,106.832913,1807.472788
min,1.0,29.45,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,75.5,0.0,120.8,0.0,703.44,63.0,943.0
50%,8.0,141.7,0.0,333.35,0.0,1686.75,99.0,1412.0
75%,12.0,322.61,1.0,533.49,0.0,2351.93,154.0,1947.0
max,31.0,1194.06,1.0,3144.95,1.0,7272.91,2872.0,42464.0


In [2]:
# Separate data into features (X) and target (y)
X_raw = refill_data[list(refill_data.columns[:-1])]
y_raw = refill_data[refill_data.columns[-1]]
print X_raw.shape
print y_raw.shape

(47755, 10)
(47755,)


In [3]:
# Investigate problem severity
from IPython.display import display
print "Total refills: {}".format(y_raw.count())
print "Service level changes within 45 days of refill:"
display(100*y_raw.value_counts()/y_raw.count())

Total refills: 47755
Service level changes within 45 days of refill:


No change      97.742645
Downgraded      1.972568
Deactivated     0.284787
Name: ServiceLevelChange, dtype: float64

In [4]:
# Preprocess features
# Split categorical features into multiple binary columns
X_all = pd.DataFrame(index=X_raw.index)

for col, col_data in X_raw.iteritems():
    if col_data.dtype == object:
        col_data = pd.get_dummies(col_data, prefix=col)
    X_all = X_all.join(col_data)

# TODO: Load the resulting data into a sparse matrix
X_all.head()

Unnamed: 0,RefillNumber,RefillAmount,AdvancedNotified,KitType_A,KitType_B,KitType_C,KitType_D,KitType_E,KitType_F,KitType_G,...,KitType_S,KitType_T,KitPurchasePrice,KitPurchaseSalesPerson_DEALER,KitPurchaseSalesPerson_DIRECT,KitPurchaseSalesPerson_WEB,CustomerIsCorporate,PreviousRefillsAmount,DaysSinceLastRefill,DaysSinceKitPurchase
0,18,255.5,1,0,0,0,0,0,0,1,...,0,0,,0,0,0,0,3789.89,250,3217
1,23,323.4,1,0,0,0,0,0,0,0,...,0,0,,0,0,0,0,5302.89,94,3163
2,18,336.8,0,0,0,0,0,0,0,1,...,0,0,,0,0,0,0,3477.8,188,3121
3,19,75.33,1,0,0,0,0,0,0,1,...,0,0,,0,0,0,0,3814.6,71,3192
4,4,49.45,0,0,0,0,0,0,0,0,...,0,0,,0,0,0,1,819.58,123,3154


In [5]:
# Encode target labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_all = le.fit_transform(y_raw)
results = le.classes_
print results
print le.transform(results)

['Deactivated' 'Downgraded' 'No change']
[0 1 2]


In [7]:
# Convert data to sparse matrix
X_all = X_all.to_sparse(fill_value=-1)


In [20]:
# Train classifier
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

results = []
classifiers = [
    LinearSVC(dual=False),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    AdaBoostClassifier(),
    RandomForestClassifier()
]
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.2, stratify=y_all)
for clf in classifiers:
    for i in range(10):
        classifier = clf.__class__.__name__
        clf.fit(X_train, y_train)
        train_f1 = clf.score(X_train, y_train)
        test_f1 = clf.score(X_test, y_test)
        results.append([classifier, train_f1, test_f1])
    
results = pd.DataFrame(
    data = results,
    columns = [
        'classifier',
        'train_f1',
        'test_f1'
    ]
)

display(results.groupby('classifier').mean())

Unnamed: 0_level_0,train_f1,test_f1
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1
AdaBoostClassifier,0.977411,0.977487
DecisionTreeClassifier,0.99665,0.955225
KNeighborsClassifier,0.977804,0.977487
LinearSVC,0.977411,0.977487
RandomForestClassifier,0.994566,0.973005


In [None]:
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
import pydot
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("customer_cancellation.pdf")