In [1]:
# Copyright (C) 2016 Shane Lamont.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#            http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Shane's test of customer machine learning, run against BigQuery. Project details to be confirmed.
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.externals.six import StringIO
# not available in google Datalab
# import pydot
import datalab.bigquery as bq


#import pd.io.gbq
# have a wider console
pd.set_option('display.width', 200)

# load the file & get features
cols = ['CustName','Segment','Month','TxType','AvgTxValue','TotTxVol','SumTxValue']
col_types = {'CustName': str, 'Segment': str, 'Month': int, 'TxType': str, 'AvgTxValue': float, 'TotTxVol': int, 'SumTxValue': float}

# in google cloud, query it from the table
query_string = 'SELECT CustName, Segment, MONTH (TxDate) Month, TxType\
  , avg(TxValue) as AvgTxValue\
  , Count(TxValue) as TotTxVol\
  , Sum(TxValue) as SumTxValue\
 FROM [CustomerDatabase_New.CustomerTable_300k]\
 GROUP BY 1,2,3,4\
 order by 1,2,3,4'

# now read it into a dataframe
# use this one for big query
customers = bq.Query(query_string).to_dataframe()

# use this one if not connected to google environment
# customers = pd.io.gbq.read_gbq(query_string, project_id = '713988583739')

# use this one for fully local scikit learn
# customers = pd.read_csv('CustomerTable_300k_ml_results-20160625.csv', header=1, names = cols, dtype=col_types)
# customers['TargetSegment'] = customers['Segment'].map(lambda x: segtonum(x))

# merge and do cross tab
customer_merge=pd.merge(left=customers[customers['TxType'] == 'CR'], right=customers[customers['TxType'] == 'DB']
            , on=['CustName','Segment','Month'], how='outer', indicator=True, suffixes=('CR','DB'))

# field names
all_cols = ['CustName', 'Segment', 'Month', 'TxTypeCR', 'AvgTxValueCR', 'TotTxVolCR', 'SumTxValueCR', 'TxTypeDB', 'AvgTxValueDB', 'TotTxVolDB', 'SumTxValueDB']
training_cols = ['AvgTxValueCR', 'TotTxVolCR', 'SumTxValueCR', 'AvgTxValueDB', 'TotTxVolDB', 'SumTxValueDB']
test_cols = training_cols
target_cols = ['Segment']

# take the last 1000 values as the test data and others data as training data
test_rows = 1000
customer_train_data = customer_merge[:-test_rows][training_cols].fillna(0)
customer_train_target = customer_merge[:-test_rows][target_cols].fillna(0)
customer_test_data = customer_merge[-test_rows:][test_cols].fillna(0)
customer_test_target = customer_merge[-test_rows:][target_cols]

# try just one row
# note that customer_test_data[4:5] has results
# ['AvgTxValueCR', 'TotTxVolCR', 'SumTxValueCR', 'AvgTxValueDB', 'TotTxVolDB', 'SumTxValueDB']
#  122929.891892        37.0     4,548,406.0  120045.083333        12.0     1440541.0

# now train the model
# use a decision tree to test
# try to predict one data point for the pdf
# clf.predict( [19.422961,331.0,6429.0,19.779661,118.0,2334.0])

# [250, 20, 3000, 128, 70, 400]
# display results

# initiate, train and predict from a Tree
clf = tree.DecisionTreeClassifier()
clf.fit(customer_train_data, customer_train_target)
predicted_results = clf.predict(customer_test_data)


# now print the last 100 customers ID, Actual Segment, Predicted Segment, Notification index on customer ID
comparison = 100
c_actual = customer_merge.tail(comparison)
c_test = customer_test_target.tail(comparison)
c_predicted = predicted_results[-comparison:]

for i in range(comparison):
    if c_test.iloc[i]['Segment'] !=  c_predicted[i]:
        print("Match fails: ", c_actual.iloc[i]['CustName'], "KYC Segment says:", c_actual.iloc[i]['Segment'], "Activity Segment suggests:", c_predicted[i])

# Show the fitted tree as evidence of the decision.

# The export below is disabled because pydot is not available in Google Datalab. It is
# parked inside a bare string literal rather than commented out, so it is never run;
# as the last expression of the cell, the string is echoed back as the cell's output.
"""
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data,
                     feature_names=training_cols,
                     class_names=True,
                     filled=True, rounded=True,
                     special_characters=True
                     )
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("customer.pdf")
"""




('Match fails: ', 'C05000000', 'KYC Segment says:', 'Retail', 'Activity Segment suggests:', 'LowVolHighVal')
('Match fails: ', 'C05000001', 'KYC Segment says:', 'Retail', 'Activity Segment suggests:', 'HighVolLowVal')
('Match fails: ', 'C05000002', 'KYC Segment says:', 'Retail', 'Activity Segment suggests:', 'HighVolHighVal')
('Match fails: ', 'C05000003', 'KYC Segment says:', 'Retail', 'Activity Segment suggests:', 'HighVolLowVal')
('Match fails: ', 'C05000004', 'KYC Segment says:', 'Retail', 'Activity Segment suggests:', 'HighVolLowVal')


'\ndot_data = StringIO()\ntree.export_graphviz(clf, out_file=dot_data,\n                     feature_names=training_cols,\n                     class_names=True,\n                     filled=True, rounded=True,\n                     special_characters=True\n                     )\ngraph = pydot.graph_from_dot_data(dot_data.getvalue())\ngraph.write_pdf("customer.pdf")\n'