In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import pandas as pd
import numpy as np
import seaborn as sns
from datetime import *
import zipfile
import pylab as plt

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    return pd.read_csv(data_path)

## Upload dataSets

### Members dataset cleasing

In [2]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
zf = zipfile.ZipFile('../dataSets/main.zip')

main_data_set_all_fields = pd.read_csv(zf.open('main.csv'))

## Data manipulation and cleansing

In [3]:
# Check the dataSet fields
main_data_set_all_fields.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32950 entries, 0 to 32949
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              32950 non-null  int64  
 1   member_key              32950 non-null  object 
 2   updated_at              32950 non-null  object 
 3   first_name              32950 non-null  object 
 4   last_name               32950 non-null  object 
 5   disabilities            8383 non-null   object 
 6   SA1                     3751 non-null   float64
 7   price_zone_code         32950 non-null  object 
 8   plan_key                32950 non-null  object 
 9   plan_status             32950 non-null  object 
 10  plan_start_date         32950 non-null  object 
 11  plan_end_date           32950 non-null  object 
 12  budget_level3_name      840 non-null    object 
 13  budget_level3_key       840 non-null    object 
 14  budget_level2_key       32950 non-null

In [4]:
main_data_set_all_fields.head(4)

Unnamed: 0.1,Unnamed: 0,member_key,updated_at,first_name,last_name,disabilities,SA1,price_zone_code,plan_key,plan_status,...,budget_level1_key,budget_level1_name,invoice_state,claim_state,allocated_amount,requested_amount,funded_amount,allocated_amount_total,requested_amount_total,funded_amount_total
0,0,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,50000.0,98.8,98.8,272168.34,11316.7,11316.7
1,1,00109970-7029-11eb-81d6-9d4df94b6224,2021-04,Wynny,Pressman,,,ACT_NSW_QLD_VIC,588427f1-e45d-4a6e-b307-9c68a13849ef,PLAN_DELIVERY_ACTIVE,...,58745466-f597-11e9-bfb4-022d4762bb3c,Core,ALL_PAID,PAID,222168.34,11217.9,11217.9,272168.34,11316.7,11316.7
2,2,0010f240-3b5d-11eb-aa73-7be7bbd3c758,2021-04,Beryl,McDuff,Asperger syndrome,3138648.0,ACT_NSW_QLD_VIC,ef286fe8-8b36-4d77-97a0-59f9ef3df9b8,PLAN_DELIVERY_ACTIVE,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,9801.6,2559.6384,2559.67,9801.6,2559.6384,2559.67
3,3,00117a50-cbc6-11ea-805a-75fc51d39a77,2021-04,Jethro,Perchard,,,ACT_NSW_QLD_VIC,8aa26238-d922-4963-bcaf-427d24a610bc,COMPLETED,...,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,ALL_PAID,PAID,64524.0,8503.65,8503.65,64524.0,8503.65,8503.65


In [5]:
main_data_No_NaN = main_data_set_all_fields.replace(np.nan, 'NA')

main_data_No_NaN = main_data_No_NaN[(main_data_No_NaN["plan_status"] == 'PLAN_DELIVERY_ACTIVE')]

In [6]:
# Select just the fields to be used
main_data_set = main_data_No_NaN.groupby([
                                           "member_key"
                                         , "first_name"
                                         , "last_name"
                                         , "budget_level1_key"
                                         , "budget_level1_name"
                                         , "plan_start_date"
                                         , "plan_end_date"
                                         , "allocated_amount_total"
                                        ]).agg({"funded_amount": "sum", "allocated_amount": "sum"}).reset_index().copy()

In [7]:
months_total = main_data_set[{
                              "member_key"
                            , "plan_start_date"
                            , "plan_end_date"}].copy()

months_total['plan_months_total'] = ((pd.to_datetime(months_total['plan_end_date']).dt.year
                                   -  pd.to_datetime(months_total['plan_start_date']).dt.year) * 12) + (pd.to_datetime(months_total['plan_end_date']).dt.month
                                   -  pd.to_datetime(months_total['plan_start_date']).dt.month)

months_total_merged = months_total[{"member_key", "plan_months_total"}]
                            
months_total_merged = months_total[{
                                    "member_key"
                                  , "plan_months_total"
                            }].groupby([
                                       "member_key"
                                     , "plan_months_total"
                                       ]).count().reset_index()

months_total_merged = pd.merge(main_data_set, months_total_merged, how="left", on=["member_key"])

In [8]:
allocated_amount_month = months_total_merged[{
                                              "member_key"
                                            , "budget_level1_key"
                                            , "plan_months_total"
                                            , "allocated_amount"
                                            }].copy()

allocated_amount_month["allocated_amount_month"] = allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]                                                    

allocated_amount_month = allocated_amount_month[{'member_key', 'budget_level1_key', 'allocated_amount_month'}]

months_total_merged_monthly = pd.merge(months_total_merged, allocated_amount_month, how="left", on=["member_key", "budget_level1_key"])

In [9]:
month_today = main_data_set[{
                           "member_key"
                         , "budget_level1_key"
                         , "plan_start_date"
                            }].copy()


month_today['month_actual'] = ((pd.to_datetime("today").year -  pd.to_datetime(month_today['plan_start_date']).dt.year) * 12)                                                                                                                     + (pd.to_datetime("today").month - pd.to_datetime(month_today['plan_start_date']).dt.month)

month_today = month_today[{'member_key', 'budget_level1_key', 'month_actual'}]

month_actual = pd.merge(months_total_merged_monthly, month_today, how="left", on=["member_key", "budget_level1_key"])

month_actual["spent_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

In [10]:
month_actual['funded_amount'] = month_actual['funded_amount'].round(2)
month_actual['allocated_amount'] = month_actual['allocated_amount'].round(2)

month_actual['allocated_amount_month'] = month_actual['allocated_amount_month'].round(2)
month_actual['spent_amount_predicted'] = month_actual['spent_amount_predicted'].round(2)

month_actual['months_left'] = month_actual['plan_months_total'] - month_actual['month_actual']

In [11]:
month_actual['funded_amount_percentage'] = (month_actual['funded_amount'] / month_actual['allocated_amount']) * 100#).round(2)

month_actual['months_left_percentage'] = (month_actual['months_left'] / month_actual['plan_months_total']) * 100#).round(2)

month_actual['funded_amount_months_left'] = month_actual['funded_amount_percentage'] / month_actual['months_left_percentage']#).round(2)

month_actual = month_actual.loc[
                                (month_actual["plan_months_total"] >= 12)
                              & (month_actual["allocated_amount"] > 0)
                              & (month_actual["months_left"] > 0)
                               ]

In [12]:
month_actual['spending_status'] = np.where(month_actual['funded_amount_months_left'] < 0.9, 'Overspending', 'Underspending')

month_actual['spending_status'] = np.where(((month_actual['funded_amount_months_left'] >= 0.9) & (month_actual['funded_amount_months_left'] <= 1.2)), 'On_Track', month_actual['spending_status'])

In [13]:
main_data_set = month_actual[{
                              "member_key"
                            , "first_name" 
                            , "last_name" 
                            , "budget_level1_key"
                            , "budget_level1_name"
                            , "funded_amount_percentage"
                            , "months_left_percentage"
                            , "funded_amount_months_left"
                            , "spending_status"
                            }].copy()

In [14]:
df_model = main_data_set[{
                          'funded_amount_percentage'
                        , 'months_left_percentage'
                        , 'spending_status'
                        }].copy()

In [15]:
df_model.head()

Unnamed: 0,spending_status,months_left_percentage,funded_amount_percentage
0,Overspending,91.666667,4.157978
1,Overspending,33.333333,26.114818
2,Overspending,91.666667,1.234578
3,Overspending,91.666667,3.83502
4,Overspending,25.0,5.743543


In [16]:
x = df_model[{
                'funded_amount_percentage'
              , 'months_left_percentage'
            }]

y = df_model['spending_status']

normalized_range = sklearn.preprocessing.MinMaxScaler(feature_range=(-1,1))
x = normalized_range.fit_transform(x)

In [17]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

In [18]:
regression = LogisticRegression(random_state=0, multi_class='multinomial', penalty='none', solver='newton-cg', max_iter = 150, warm_start = False).fit(x_train, y_train)

regression.fit(x_train, y_train)

LogisticRegression(max_iter=150, multi_class='multinomial', penalty='none',
                   random_state=0, solver='newton-cg')

In [19]:
prediction_score = regression.score(x_test, y_test)
print('Prediction score: ', prediction_score * 100)

Prediction score:  99.80089596814335


In [20]:
print(regression.intercept_)

[  5653.65225242 -88995.48231651  83341.83002523]


In [21]:
print(regression.coef_)

[[ 8.66685854e+00  5.64443484e+03]
 [ 2.74072721e+02 -8.92937014e+04]
 [-2.82739567e+02  8.36492666e+04]]


In [22]:
y_pred = regression.predict(x_test)

In [23]:
confmtrx = np.array(confusion_matrix(y_test, y_pred))

pd.DataFrame(confmtrx, index=['Underspending','Overspending','On_Track'],
columns=['predicted_Underspending', 'predicted_Overspending', 'predicted_On_Track'])

Unnamed: 0,predicted_Underspending,predicted_Overspending,predicted_On_Track
Underspending,94,1,1
Overspending,0,1619,0
On_Track,2,0,292


In [24]:
model_to_be_predicted = main_data_set[{
                                       'member_key'
                                     , 'first_name'
                                     , 'last_name'
                                     , 'budget_level1_key'
                                     , 'budget_level1_name'
                                     , 'funded_amount_percentage'
                                     , 'months_left_percentage'
                                     , 'spending_status'
                                    }].copy()

In [25]:
df_prediction = pd.DataFrame(columns = ['member_key', 'first_name', 'last_name', 'budget_level1_key', 'budget_level1_name', 'prediction'])

for index in range(len(model_to_be_predicted)):
    funded_amount_percentage = model_to_be_predicted['funded_amount_percentage'].iloc[index]
    months_left_percentage = model_to_be_predicted['months_left_percentage'].iloc[index]
    spending_status = model_to_be_predicted['spending_status'].iloc[index]

    test = regression.predict((np.array([funded_amount_percentage, months_left_percentage]) .reshape(1, -1)))

    if test.astype(str) == 'Underspending':  
        status = 'Underspending'
    else:
        if test.astype(str) == 'Overspending':
            status = 'Overspending'
        else:
            status = 'On_Track'

    df = pd.DataFrame([ [
                         model_to_be_predicted['member_key'].iloc[index]
                       , model_to_be_predicted['first_name'].iloc[index]
                       , model_to_be_predicted['last_name'].iloc[index]
                       , model_to_be_predicted['budget_level1_key'].iloc[index]
                       , model_to_be_predicted['budget_level1_name'].iloc[index]
                       , status
                       ] ], columns=list(['member_key', 'first_name', 'last_name', 'budget_level1_key', 'budget_level1_name', 'prediction']))

    df_prediction = df_prediction.append(df)

df_prediction


In [26]:
#compression_opts = dict(method='zip',
#                        archive_name='prediction.csv')

#df_prediction.to_csv('../dataSets/prediction.zip', compression=compression_opts)

Unnamed: 0,member_key,first_name,last_name,budget_level1_key,budget_level1_name,prediction
0,00109970-7029-11eb-81d6-9d4df94b6224,Wynny,Pressman,58745466-f597-11e9-bfb4-022d4762bb3c,Core,Underspending
0,0010f240-3b5d-11eb-aa73-7be7bbd3c758,Beryl,McDuff,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,Underspending
0,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,58745466-f597-11e9-bfb4-022d4762bb3c,Core,Underspending
0,0011ebe0-7722-11eb-9ade-8f3368e47951,Eamon,Gammage,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,Underspending
0,007c1090-d6ca-11ea-bb8a-a73bf14c75f9,Janean,Potzold,58745466-f597-11e9-bfb4-022d4762bb3c,Core,Underspending
...,...,...,...,...,...,...
0,ff630c80-d094-11ea-847f-196f66acbe45,Mureil,Labb,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,Underspending
0,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,58745466-f597-11e9-bfb4-022d4762bb3c,Core,Underspending
0,ff656e00-7fce-11eb-8e57-d94003868067,Jeramie,Burgot,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,Underspending
0,ffddfbc0-0861-11eb-92a0-2d24c44fa781,Allard,Postle,5874551b-f597-11e9-bfb4-022d4762bb3c,Capacity Building,Underspending
