In [10]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [11]:
# Load the transaction workbook; the relative path is resolved against the
# notebook's working directory.
dataset_path = "ANZ synthesised transaction dataset.xlsx"
df = pd.read_excel(dataset_path)

In [12]:
# Mean of each numeric column over the PAY/SALARY rows, one row per customer_id.
# numeric_only=True keeps the result a DataFrame (so ["amount"] lookups still
# work) while skipping string columns such as gender or movement; without it,
# pandas >= 2.0 raises TypeError when it tries to average them.
df_salaries = (
    df[df["txn_description"] == "PAY/SALARY"]
    .groupby("customer_id")
    .mean(numeric_only=True)
)

In [13]:
# Attach each customer's mean PAY/SALARY amount (truncated to an integer) to
# every one of their transaction rows. Series.map aligns on the customer_id
# index of df_salaries, replacing a per-row Python lookup loop.
# NOTE(review): despite the column name, this is the mean amount of a single
# salary payment, not an annualised figure - confirm before interpreting it.
salary_by_row = df["customer_id"].map(df_salaries["amount"])
if salary_by_row.isna().any():
    # A customer without any PAY/SALARY rows has no salary to look up.
    missing_ids = df.loc[salary_by_row.isna(), "customer_id"].unique().tolist()
    raise KeyError(f"No PAY/SALARY amount available for customers: {missing_ids}")
# Kept as a plain list of ints so `salaries` has the same type as before.
salaries = salary_by_row.astype("int64").tolist()
df["annual_salary"] = salaries

In [14]:
# Per-customer mean of every numeric column (annual_salary included), indexed
# by customer_id. numeric_only=True skips the string columns; pandas >= 2.0
# raises TypeError on them instead of silently dropping them.
df_cus = df.groupby("customer_id").mean(numeric_only=True)
print("Mean annual salary by customer: ")
print(df_cus.head(), "\n")

Mean annual salary by customer: 
                card_present_flag  merchant_code       balance  age  \
customer_id                                                           
CUS-1005756958           0.812500            0.0   2275.852055   53   
CUS-1117979751           0.826923            0.0   9829.929000   21   
CUS-1140341822           0.815385            0.0   5699.212250   28   
CUS-1147642491           0.750000            0.0   9032.841186   34   
CUS-1196156254           0.785276            0.0  22272.433755   34   

                    amount  annual_salary  
customer_id                                
CUS-1005756958  222.862603            970  
CUS-1117979751  339.843700           3578  
CUS-1140341822  212.632500           1916  
CUS-1147642491  245.600169           1711  
CUS-1196156254  147.145796           3903   



In [17]:
# Linear regression
# Fit on the first 80% of customers (in df_cus row order); the remaining rows
# are kept aside as X_test / Y_test.
# NOTE(review): the `amount` feature averages all of a customer's transactions,
# including the PAY/SALARY rows the target is derived from - possible leakage.
print("LINEAR REGRESSION:\n")
N_train = int(len(df_cus) * 0.8)
customer_features = df_cus.drop(columns="annual_salary")
customer_target = df_cus["annual_salary"]
X_train, X_test = customer_features.iloc[:N_train], customer_features.iloc[N_train:]
Y_train, Y_test = customer_target.iloc[:N_train], customer_target.iloc[N_train:]
linear_reg = LinearRegression()
linear_reg.fit(X_train, Y_train)
linear_train_score = linear_reg.score(X_train, Y_train)
print(f"Linear Regression Training Score: {linear_train_score}\n")

LINEAR REGRESSION:

Linear Regression Training Score: 0.23295376366257825



In [18]:
# Show the fitted linear model's predictions for X_test.
print("Predictions using test data:")
linear_test_predictions = linear_reg.predict(X_test)
print(linear_test_predictions, "\n")

Predictions using test data:
[1993.98473311 2867.39066481 1944.95959591 1806.85984885 2226.35045442
 2075.34697175 1813.02987337 5388.67435983 1902.35351608 2191.90445145
 1713.48134178 2854.40519949 2094.77781158 3815.34342881 2249.92922822
 1768.80816189 2095.02988288 1515.18425875 1782.72752537 2481.2898546 ] 



In [19]:
# Decision tree - classification and regression
# Transaction-level feature columns: four string-valued categoricals plus the
# numeric `age` column.
feature_columns = ["txn_description", "gender", "age", "merchant_state", "movement"]
df_cat = df[feature_columns]
# Preview the one-hot encoded features; `age` passes through unchanged.
pd.get_dummies(df_cat).head()

Unnamed: 0,age,txn_description_INTER BANK,txn_description_PAY/SALARY,txn_description_PAYMENT,txn_description_PHONE BANK,txn_description_POS,txn_description_SALES-POS,gender_F,gender_M,merchant_state_ACT,merchant_state_NSW,merchant_state_NT,merchant_state_QLD,merchant_state_SA,merchant_state_TAS,merchant_state_VIC,merchant_state_WA,movement_credit,movement_debit
0,26,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1
1,26,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1
2,38,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1
3,40,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1
4,26,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,1


In [20]:
# Positional 80/20 split of the transaction rows: one-hot encoded features
# against the per-row annual_salary target.
# NOTE(review): the split is per transaction row, so one customer's rows may
# fall on both sides of it - confirm this is intended.
N_train = int(len(df) * 0.8)
encoded_features = pd.get_dummies(df_cat)
row_target = df["annual_salary"]
X_train, X_test = encoded_features.iloc[:N_train], encoded_features.iloc[N_train:]
Y_train, Y_test = row_target.iloc[:N_train], row_target.iloc[N_train:]

In [21]:
# Classification
# Every distinct annual_salary value is treated as its own class label.
# random_state is pinned so that refitting on a fresh kernel builds the same
# tree; scikit-learn permutes features at each split even with the default
# "best" splitter, so an unseeded fit can differ between runs.
print("DECISION TREE - CLASSIFIER:\n")
decision_tree_class = DecisionTreeClassifier(random_state=0)
decision_tree_class.fit(X_train, Y_train)
print(f"Decision Tree Classifier Training Score: {decision_tree_class.score(X_train, Y_train)}\n")

DECISION TREE - CLASSIFIER:

Decision Tree Classifier Training Score: 0.7882499481004774



In [22]:
# Report the classifier's predictions for X_test and its score against Y_test.
print("Predictions using test data:")
class_test_predictions = decision_tree_class.predict(X_test)
print(class_test_predictions, "\n")

class_test_score = decision_tree_class.score(X_test, Y_test)
print(f"Decision Tree Classifier Testing Score: {class_test_score}\n")

Predictions using test data:
[1013 1043 4132 ... 4054 1043  996] 

Decision Tree Classifier Testing Score: 0.7546699875466999



In [23]:
# Regression
# Fits a regression tree on the same one-hot features and annual_salary target.
# random_state is pinned so that refitting on a fresh kernel builds the same
# tree; scikit-learn permutes features at each split even with the default
# "best" splitter, so an unseeded fit can differ between runs.
print("DECISION TREE - REGRESSOR:\n")
decision_tree_reg = DecisionTreeRegressor(random_state=0)
decision_tree_reg.fit(X_train, Y_train)
print(f"Decision Tree Regressor Training Score: {decision_tree_reg.score(X_train, Y_train)}\n")


DECISION TREE - REGRESSOR:

Decision Tree Regressor Training Score: 0.7468978726536879



In [24]:
# Show the regression tree's predictions for X_test.
print("Predictions using test data:")
reg_test_predictions = decision_tree_reg.predict(X_test)
print(reg_test_predictions, "\n")

Predictions using test data:
[1226.42857143 1043.         4132.         ... 3345.04761905 1043.
 1626.        ] 



In [25]:
# Score the regression tree on X_test / Y_test.
reg_test_score = decision_tree_reg.score(X_test, Y_test)
print(f"Decision Tree Regressor Testing Score: {reg_test_score}\n")

Decision Tree Regressor Testing Score: 0.6830121416287859

