In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import pandas as pd
import numpy as np
import seaborn as sns
from datetime import *
import zipfile
import pylab as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from sklearn.tree import DecisionTreeClassifier

from sklearn import tree
from pydot import graph_from_dot_data
from six import StringIO

from statistics import mean, stdev

from IPython.display import Image

from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import GroupShuffleSplit

from sklearn.metrics import accuracy_score

from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    return pd.read_csv(data_path)

## Upload dataSets

### Members dataset cleasing

In [2]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
zf = zipfile.ZipFile('../dataSets/main.zip')

main_data_set_all_fields = pd.read_csv(zf.open('main.csv'))

## Data manipulation and cleansing

In [3]:
# Check the dataSet fields
main_data_set_all_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              32950 non-null  int64  
 1   member_key              32950 non-null  object 
 2   updated_at              32950 non-null  object 
 3   first_name              32950 non-null  object 
 4   last_name               32950 non-null  object 
 5   disabilities            8383 non-null   object 
 6   SA1                     3751 non-null   float64
 7   price_zone_code         32950 non-null  object 
 8   plan_key                32950 non-null  object 
 9   plan_status             32950 non-null  object 
 10  plan_start_date         32950 non-null  object 
 11  plan_end_date           32950 non-null  object 
 12  budget_level3_name      840 non-null    object 
 13  budget_level3_key       840 non-null    object 
 14  budget_level2_key       32950 non-null

In [4]:
main_data_set_all_fields.head(4)

Unnamed: 0.1,Unnamed: 0,member_key,updated_at,first_name,last_name,disabilities,SA1,price_zone_code,plan_key,plan_status,...,budget_level1_key,budget_level1_name,invoice_state,claim_state,allocated_amount,requested_amount,funded_amount,allocated_amount_total,requested_amount_total,funded_amount_total
0,0,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,50000.0,98.8,98.8,272168.34,11316.7,11316.7
1,1,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,222168.34,11217.9,11217.9,272168.34,11316.7,11316.7
2,2,0010f240-3b5d-11eb-aa73-7be7bbd3c758,2021-04,Beryl,McDuff,Asperger syndrome,3138648.0,ACT_NSW_QLD_VIC,ef286fe8-8b36-4d77-97a0-59f9ef3df9b8,PLAN_DELIVERY_ACTIVE,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,9801.6,2559.6384,2559.67,9801.6,2559.6384,2559.67
3,3,00117a50-cbc6-11ea-805a-75fc51d39a77,2021-04,Jethro,Perchard,,,ACT_NSW_QLD_VIC,8aa26238-d922-4963-bcaf-427d24a610bc,COMPLETED,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,64524.0,8503.65,8503.65,64524.0,8503.65,8503.65


In [5]:
main_data_No_NaN = main_data_set_all_fields.replace(np.nan, 'NA')

main_data_No_NaN = main_data_No_NaN[(main_data_No_NaN["plan_status"] == 'PLAN_DELIVERY_ACTIVE')]

In [6]:
# Select just the fields to be used
main_data_set = main_data_No_NaN.groupby([
                                           "member_key"
                                         , "first_name"
                                         , "last_name"
                                         , "budget_level1_key"
                                         , "budget_level1_name"
                                         , "budget_level2_key"
                                         , "budget_level2_name"
                                         , "plan_start_date"
                                         , "plan_end_date"
                                         , "allocated_amount_total"
                                        ]).agg({"funded_amount": "sum", "allocated_amount": "sum"}).reset_index().copy()

In [7]:
months_total = main_data_set[{
                              "member_key"
                            , "plan_start_date"
                            , "plan_end_date"}].copy()

months_total['plan_months_total'] = ((pd.to_datetime(months_total['plan_end_date']).dt.year
                                   -  pd.to_datetime(months_total['plan_start_date']).dt.year) * 12) + (pd.to_datetime(months_total['plan_end_date']).dt.month
                                   -  pd.to_datetime(months_total['plan_start_date']).dt.month)

months_total_merged = months_total[{"member_key", "plan_months_total"}]
                            
months_total_merged = months_total[{
                                    "member_key"
                                  , "plan_months_total"
                            }].groupby([
                                       "member_key"
                                     , "plan_months_total"
                                       ]).count().reset_index()

months_total_merged = pd.merge(main_data_set, months_total_merged, how="left", on=["member_key"])

In [8]:
allocated_amount_month = months_total_merged[{
                                              "member_key"
                                            , "budget_level1_key"
                                            , "budget_level2_key"
                                            , "plan_months_total"
                                            , "allocated_amount"
                                            }].copy()

allocated_amount_month["allocated_amount_month"] = allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]                                                    

allocated_amount_month = allocated_amount_month[{'member_key', 'budget_level1_key', 'budget_level2_key', 'allocated_amount_month'}]

months_total_merged_monthly = pd.merge(months_total_merged, allocated_amount_month, how="left", on=["member_key", "budget_level1_key", "budget_level2_key"])

In [9]:
month_today = main_data_set[{
                           "member_key"
                         , "budget_level1_key"
                         , "budget_level2_key"
                         , "plan_start_date"
                            }].copy()


month_today['month_actual'] = ((pd.to_datetime("today").year -  pd.to_datetime(month_today['plan_start_date']).dt.year) * 12)                                                                                                                     + (pd.to_datetime("today").month - pd.to_datetime(month_today['plan_start_date']).dt.month)

month_today = month_today[{'member_key', 'budget_level1_key', 'budget_level2_key', 'month_actual'}]

month_actual = pd.merge(months_total_merged_monthly, month_today, how="left", on=["member_key", "budget_level1_key", "budget_level2_key"])

month_actual["spent_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

In [10]:
month_actual['funded_amount'] = month_actual['funded_amount'].round(2)
month_actual['allocated_amount'] = month_actual['allocated_amount'].round(2)

month_actual['allocated_amount_month'] = month_actual['allocated_amount_month'].round(2)
month_actual['spent_amount_predicted'] = month_actual['spent_amount_predicted'].round(2)

month_actual['months_left'] = month_actual['plan_months_total'] - month_actual['month_actual']

In [11]:
month_actual['funded_amount_percentage'] = (month_actual['funded_amount'] / month_actual['allocated_amount']) * 100#).round(2)

month_actual['months_left_percentage'] = (month_actual['months_left'] / month_actual['plan_months_total']) * 100#).round(2)

month_actual['funded_amount_months_left'] = month_actual['funded_amount_percentage'] / month_actual['months_left_percentage']#).round(2)

month_actual = month_actual.loc[
                                (month_actual["plan_months_total"] >= 12)
                              & (month_actual["allocated_amount"] > 0)
                              & (month_actual["months_left"] > 0)
                               ]

In [12]:
main_data_set = month_actual[{
                              "member_key"
                            , "first_name" 
                            , "last_name" 
                            , "budget_level1_key"
                            , "budget_level1_name"
                            , "budget_level2_key"
                            , "budget_level2_name"
                            , "plan_months_total"
                            , "allocated_amount"
                            , "funded_amount_months_left"
                            }].copy()

In [13]:
 main_data_set['spending_status'] = np.where(main_data_set['funded_amount_months_left'] < 0.9, 'Overspending', 'Underspending')
 main_data_set['spending_status'] = np.where(((main_data_set['funded_amount_months_left'] >= 0.9) & (main_data_set['funded_amount_months_left'] <= 1.2)), 'On_Track', main_data_set['spending_status'])

In [14]:
#main_data_set = main_data_set[{
#                              "member_key"
#                            , "first_name" 
#                            , "last_name" 
#                            , "budget_level1_key"
#                            , "budget_level1_name"
#                            , "budget_level2_key"
#                            , "budget_level2_name"
#                            , "plan_months_total"
#                            , "allocated_amount"
#                            , "spending_status"
#                            }].copy()

In [15]:
df_model = main_data_set[{
                          'plan_months_total'
                        , 'allocated_amount'
                        , 'spending_status'
                        }].copy()

In [16]:
# Total plan amount 
# Total plan lenght

x = df_model[{
                'plan_months_total'
              , 'allocated_amount'
            }].values

y = df_model['spending_status'].values

normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
x = normalized_range.fit_transform(x)

In [17]:
# TRAINING MODEL
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=10)

In [18]:
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [19]:
# MLPClassifier = MLPClassifier(hidden_layer_sizes=(10, 10, 10), max_iter=500)

model = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg', max_iter = 150, warm_start = False)#.fit(x_train, y_train)

In [20]:
splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.33, random_state=0)
lst_accu_stratified = []
   
for train_index, test_index in splitter.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = MLPClassifier.fit(x_train, y_train)
    lst_accu_stratified.append(model.score(x_test, y_test))
   
# Print the output.
print('List of possible accuracy:', lst_accu_stratified)
print('\nMaximum Accuracy That can be obtained from this model is:',
      max(lst_accu_stratified)*100, '%')
print('\nMinimum Accuracy:',
      min(lst_accu_stratified)*100, '%')
print('\nOverall Accuracy:',
      mean(lst_accu_stratified)*100, '%')
print('\nStandard Deviation is:', stdev(lst_accu_stratified))

List of possible accuracy: [0.7847947534612582, 0.7847947534612582, 0.7847947534612582]

Maximum Accuracy That can be obtained from this model is: 78.47947534612581 %

Minimum Accuracy: 78.47947534612581 %

Overall Accuracy: 78.47947534612581 %

Standard Deviation is: 0.0


In [21]:
train_df, test_df = train_test_split(df_model, test_size=0.2, random_state=11)

print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{df_model["spending_status"].value_counts() / len(df_model)}\n\n'+
      f'PROPORTION OF TARGET IN THE TRAINING SET\n{train_df["spending_status"].value_counts() / len(train_df)}\n\n'+
      f'PROPORTION OF TARGET IN THE TEST SET\n{test_df["spending_status"].value_counts() / len(test_df)}')




PROPORTION OF TARGET IN THE ORIGINAL DATA
Overspending     0.784752
Underspending    0.166907
On_Track         0.048341
Name: spending_status, dtype: float64

PROPORTION OF TARGET IN THE TRAINING SET
Overspending     0.784046
Underspending    0.165648
On_Track         0.050306
Name: spending_status, dtype: float64

PROPORTION OF TARGET IN THE TEST SET
Overspending     0.787575
Underspending    0.171944
On_Track         0.040481
Name: spending_status, dtype: float64


In [22]:
train_df,test_df = train_test_split(df_model,test_size=0.2,stratify=df_model['spending_status'],random_state=11)
print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{df_model["spending_status"].value_counts() / len(df_model)}\n\n'+
      f'PROPORTION OF TARGET IN THE TRAINING SET\n{train_df["spending_status"].value_counts() / len(train_df)}\n\n'+
      f'PROPORTION OF TARGET IN THE TEST SET\n{test_df["spending_status"].value_counts() / len(test_df)}')

PROPORTION OF TARGET IN THE ORIGINAL DATA
Overspending     0.784752
Underspending    0.166907
On_Track         0.048341
Name: spending_status, dtype: float64

PROPORTION OF TARGET IN THE TRAINING SET
Overspending     0.784748
Underspending    0.166951
On_Track         0.048301
Name: spending_status, dtype: float64

PROPORTION OF TARGET IN THE TEST SET
Overspending     0.784770
Underspending    0.166733
On_Track         0.048497
Name: spending_status, dtype: float64


In [23]:
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)
splits = kfold.split(df_model, df_model['spending_status'])

print(f'PROPORTION OF TARGET IN THE ORIGINAL DATA\n{df_model["spending_status"].value_counts() / len(df_model)}\n\n')
for n,(train_index, test_index) in enumerate(splits):
    print(f'SPLIT NO {n+1}\nTRAINING SET SIZE: {np.round(len(train_index) / (len(train_index)+len(test_index)), 2)}' +
          f'\tTEST SET SIZE: {np.round(len(test_index) / (len(train_index)+len(test_index)),2)}\nPROPORTION OF TARGET IN THE TRAINING SET\n'+
          f'{df_model.iloc[test_index,2].value_counts() / len(df_model.iloc[test_index,2])}\nPROPORTION OF TARGET IN THE TEST SET\n'+
          f'{df_model.iloc[train_index,2].value_counts() / len(df_model.iloc[train_index,2])}\n\n'
        )

PROPORTION OF TARGET IN THE ORIGINAL DATA
Overspending     0.784752
Underspending    0.166907
On_Track         0.048341
Name: spending_status, dtype: float64


SPLIT NO 1
TRAINING SET SIZE: 0.67	TEST SET SIZE: 0.33
PROPORTION OF TARGET IN THE TRAINING SET
1500.00      0.033670
100.14       0.011063
3000.00      0.009861
5000.00      0.007456
10000.00     0.005772
               ...   
33513.36     0.000241
457284.00    0.000241
37419.00     0.000241
18712.00     0.000241
256000.00    0.000241
Name: allocated_amount, Length: 2913, dtype: float64
PROPORTION OF TARGET IN THE TEST SET
1500.00      0.029942
3000.00      0.012266
100.14       0.011304
5000.00      0.008658
20000.00     0.005291
               ...   
6961.90      0.000120
35481.00     0.000120
10402.00     0.000120
295944.00    0.000120
16048.74     0.000120
Name: allocated_amount, Length: 5485, dtype: float64


SPLIT NO 2
TRAINING SET SIZE: 0.67	TEST SET SIZE: 0.33
PROPORTION OF TARGET IN THE TRAINING SET
1500.00      0.0317

In [24]:
unique, counts = np.unique(y_train, return_counts=True)
unique1, counts1 = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))
print(dict(zip(unique1, counts1)))

{'On_Track': 404, 'Overspending': 6558, 'Underspending': 1395}
{'On_Track': 199, 'Overspending': 3231, 'Underspending': 687}


In [25]:
splitter = StratifiedShuffleSplit(n_splits=3, test_size=0.33, random_state=0)

In [26]:
scores = []

for train_index, test_index in splitter.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = MLPClassifier.fit(x_train, y_train)
    prediction_score = model.score(x_test, y_test)
    scores.append(prediction_score * 100)

print(scores)

[78.47947534612581, 78.47947534612581, 78.47947534612581]


In [27]:
model = model.fit(x_train, y_train)

In [28]:
unique, counts = np.unique(y_train, return_counts=True)
unique1, counts1 = np.unique(y_test, return_counts=True)
print(dict(zip(unique, counts)))
print(dict(zip(unique1, counts1)))

{'On_Track': 404, 'Overspending': 6558, 'Underspending': 1395}
{'On_Track': 199, 'Overspending': 3231, 'Underspending': 687}


In [29]:
y_pred = model.predict(x_test)
confmtrx = np.array(confusion_matrix(y_test, y_pred))

pd.DataFrame(confmtrx, index=['Underspending','Overspending','On_Track'],
columns=['predicted_Underspending', 'predicted_Overspending', 'predicted_On_Track'])

Unnamed: 0,predicted_Underspending,predicted_Overspending,predicted_On_Track
Underspending,0,199,0
Overspending,0,3231,0
On_Track,0,687,0


In [30]:
prediction_score = model.score(x_test, y_test)
print('Prediction score: ', prediction_score * 100)

Prediction score:  78.47947534612581


In [31]:
model_to_be_predicted = main_data_set[{
                                       'member_key'
                                     , 'first_name'
                                     , 'last_name'
                                     #, 'budget_level1_key'
                                     #, 'budget_level1_name'
                                     , 'budget_level2_key'
                                     , 'budget_level2_name'
                                     , 'plan_months_total'
                                     , 'allocated_amount'
                                     , 'spending_status'
                                    }].copy()

In [32]:
df_prediction = pd.DataFrame(columns = [
                                        'member_key'
                                      , 'first_name'
                                      , 'last_name'
                                      #, 'budget_level1_key'
                                      #, 'budget_level1_name'
                                      , 'budget_level2_key'
                                      , 'budget_level2_name'
                                      , 'prediction'
                                      ])

for index in range(len(model_to_be_predicted)):
    
    '''    
        plan_months_total = model_to_be_predicted['plan_months_total'].iloc[index]
        allocated_amount_total = model_to_be_predicted['allocated_amount'].iloc[index]
    
        spending_status = model_to_be_predicted['spending_status'].iloc[index]

    '''
    
    x = model_to_be_predicted[{
                                'plan_months_total'
                              , 'allocated_amount'
                             }]

    y = model_to_be_predicted['spending_status'].values

    normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
    x = normalized_range.fit_transform(x)

    # test = model.predict(x, y)

    y_pred = model.predict(x)

    print(y_pred)

    '''
    test = model.predict(x, y)

    if test.astype(str) == 'Overspending':  
        status = 'Overspending'
    else:
        if test.astype(str) == 'Underspending':
            status = 'Underspending'
        else:
            status = 'On_Track'

    df = pd.DataFrame([ [
                         model_to_be_predicted['member_key'].iloc[index]
                       , model_to_be_predicted['first_name'].iloc[index]
                       , model_to_be_predicted['last_name'].iloc[index]
                       #, model_to_be_predicted['budget_level1_key'].iloc[index]
                       #, model_to_be_predicted['budget_level1_name'].iloc[index]
                       , model_to_be_predicted['budget_level2_key'].iloc[index]
                       , model_to_be_predicted['budget_level2_name'].iloc[index]
                       , status
                       ] ], columns=list(['member_key'
                                        , 'first_name'
                                        , 'last_name'
                                        #, 'budget_level1_key'
                                        #, 'budget_level1_name'
                                        , 'budget_level2_key'
                                        , 'budget_level2_name'
                                        , 'prediction']))

    
    df_prediction = df_prediction.append(df)
    '''

['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending' 'Overspending' ... 'Overspending'
 'Overspending' 'Overspending']
['Overspending' 'Overspending'

In [None]:
df_prediction['prediction'].unique()