In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import zipfile
from datetime import *

import numpy as np
import pandas as pd
import pylab as plt
import seaborn as sns

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path) -> pd.DataFrame:
    """Load a CSV source into a DataFrame.

    Parameters
    ----------
    data_path
        Anything accepted by ``pd.read_csv`` (file path, URL or file-like object).

    Returns
    -------
    pd.DataFrame
        The parsed contents of ``data_path``, read with pandas' default options.
    """
    data_set = pd.read_csv(data_path)
    return data_set

## Load datasets

### Members dataset cleansing

In [None]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
# Read main.csv straight out of the zip archive. Both context managers make sure
# the archive and the member handle are closed once the CSV has been parsed
# (they were previously left open for the lifetime of the kernel).
with zipfile.ZipFile('../dataSets/main.zip') as zf:
    with zf.open('main.csv') as main_csv:
        main_data_set_all_fields = pd.read_csv(main_csv)

## Data manipulation and cleansing

In [None]:
# Check the dataSet fields: print the column names, dtypes and non-null counts
# of the raw frame before any cleansing is applied.
main_data_set_all_fields.info()

In [None]:
# Preview the first four rows of the raw data.
# NOTE(review): later cells read first_name / last_name from this data, so the
# rendered output presumably contains personal details — clear it before sharing.
main_data_set_all_fields.head(4)

In [None]:
# Fill every missing value with the literal string 'NA', then keep only the rows
# whose plan status is PLAN_DELIVERY_ACTIVE.
# NOTE(review): presumably the 'NA' fill keeps rows with missing key fields from
# being dropped by the groupby in the next cell — confirm.
main_data_No_NaN = (
    main_data_set_all_fields
    .replace(np.nan, 'NA')
    .loc[lambda frame: frame["plan_status"] == 'PLAN_DELIVERY_ACTIVE']
)

In [None]:
# Select just the fields to be used: collapse the cleaned data to one row per
# combination of the key columns below, summing the funded and allocated amounts.
group_keys = [
    "member_key",
    "first_name",
    "last_name",
    "budget_level1_key",
    "budget_level1_name",
    "plan_start_date",
    "plan_end_date",
    "allocated_amount_total",
]
amount_totals = {"funded_amount": "sum", "allocated_amount": "sum"}

main_data_set = (
    main_data_No_NaN
    .groupby(group_keys)
    .agg(amount_totals)
    .reset_index()
    .copy()
)

In [None]:
# Work out the length of each plan in whole calendar months and attach it to the
# aggregated data.
# Columns are selected with lists: indexing a DataFrame with a set has no defined
# column order and raises a TypeError on pandas >= 2.0.
months_total = main_data_set[["member_key", "plan_start_date", "plan_end_date"]].copy()

# Parse each date column once instead of once per use.
plan_start = pd.to_datetime(months_total['plan_start_date'])
plan_end = pd.to_datetime(months_total['plan_end_date'])

# Calendar-month difference; the day of the month is ignored.
months_total['plan_months_total'] = (
    (plan_end.dt.year - plan_start.dt.year) * 12
    + (plan_end.dt.month - plan_start.dt.month)
)

# One row per distinct (member_key, plan_months_total) pair.
months_total_merged = (
    months_total[["member_key", "plan_months_total"]]
    .groupby(["member_key", "plan_months_total"])
    .count()
    .reset_index()
)

# NOTE(review): the join key is member_key only, so a member whose plans have
# different lengths yields one output row per length — confirm that every member
# has a single active plan.
months_total_merged = pd.merge(main_data_set, months_total_merged, how="left", on=["member_key"])

In [None]:
# Spread the allocated amount evenly over the plan length to get a monthly budget.
# Columns are selected with lists: a set indexer has no defined column order and
# raises a TypeError on pandas >= 2.0.
allocated_amount_month = months_total_merged[[
    "member_key",
    "budget_level1_key",
    "plan_months_total",
    "allocated_amount",
]].copy()

# A plan length of 0 months gives inf here; such rows are removed later by the
# plan_months_total >= 12 filter.
allocated_amount_month["allocated_amount_month"] = (
    allocated_amount_month["allocated_amount"] / allocated_amount_month["plan_months_total"]
)

allocated_amount_month = allocated_amount_month[[
    "member_key",
    "budget_level1_key",
    "allocated_amount_month",
]]

# NOTE(review): this joins the frame back onto itself, so it assumes
# (member_key, budget_level1_key) is unique; duplicated keys would multiply rows.
months_total_merged_monthly = pd.merge(
    months_total_merged,
    allocated_amount_month,
    how="left",
    on=["member_key", "budget_level1_key"],
)

In [None]:
# Number of calendar months elapsed since each plan started, and the amount that
# would have been spent by now at the even monthly rate.
# Columns are selected with lists: a set indexer has no defined column order and
# raises a TypeError on pandas >= 2.0.
month_today = main_data_set[["member_key", "budget_level1_key", "plan_start_date"]].copy()

# Take "today" once so the year and month come from the same instant.
# The result therefore depends on the date the notebook is run.
reference_date = pd.to_datetime("today")
elapsed_plan_start = pd.to_datetime(month_today['plan_start_date'])

month_today['month_actual'] = (
    (reference_date.year - elapsed_plan_start.dt.year) * 12
    + (reference_date.month - elapsed_plan_start.dt.month)
)

month_today = month_today[["member_key", "budget_level1_key", "month_actual"]]

month_actual = pd.merge(
    months_total_merged_monthly,
    month_today,
    how="left",
    on=["member_key", "budget_level1_key"],
)

month_actual["spent_amount_predicted"] = month_actual["allocated_amount_month"] * month_actual["month_actual"]

In [None]:
# Round every monetary column to two decimals.
for amount_column in (
    'funded_amount',
    'allocated_amount',
    'allocated_amount_month',
    'spent_amount_predicted',
):
    month_actual[amount_column] = month_actual[amount_column].round(2)

# Months remaining in the plan = total plan length minus months already elapsed.
month_actual['months_left'] = month_actual['plan_months_total'] - month_actual['month_actual']

In [None]:
# Share of the allocation already funded, in percent.
month_actual['funded_amount_percentage'] = (month_actual['funded_amount'] / month_actual['allocated_amount']) * 100

# Share of the plan duration still remaining, in percent.
month_actual['months_left_percentage'] = (month_actual['months_left'] / month_actual['plan_months_total']) * 100

# Ratio of the two percentages; this is the value the spending status is derived from.
month_actual['funded_amount_months_left'] = month_actual['funded_amount_percentage'] / month_actual['months_left_percentage']

# Keep plans of at least 12 months that have a positive allocation and time left.
# This also removes the rows where the divisions above hit a zero denominator.
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger SettingWithCopyWarning.
month_actual = month_actual.loc[
    (month_actual["plan_months_total"] >= 12)
    & (month_actual["allocated_amount"] > 0)
    & (month_actual["months_left"] > 0)
].copy()

In [None]:
# Label each row from its funded-vs-time-left ratio:
#   ratio < 0.9         -> 'Overspending'
#   0.9 <= ratio <= 1.2 -> 'On_Track'
#   anything else       -> 'Underspending' (this includes NaN ratios)
# NOTE(review): a LOW ratio is labelled 'Overspending' — confirm the labels are
# not inverted relative to what funded_amount represents.
status_ratio = month_actual['funded_amount_months_left']
ratio_below_band = status_ratio < 0.9
ratio_in_band = (status_ratio >= 0.9) & (status_ratio <= 1.2)

month_actual['spending_status'] = np.where(
    ratio_in_band,
    'On_Track',
    np.where(ratio_below_band, 'Overspending', 'Underspending'),
)

In [None]:
# Keep only the identifying columns and the engineered features.
# A list (not a set) is used so the column order is deterministic and the
# selection works on pandas >= 2.0, where set indexers raise a TypeError.
main_data_set = month_actual[[
    "member_key",
    "first_name",
    "last_name",
    "budget_level1_key",
    "budget_level1_name",
    "funded_amount_percentage",
    "months_left_percentage",
    "funded_amount_months_left",
    "spending_status",
]].copy()

In [None]:
# Modelling frame: the two numeric features plus the target label.
# A list (not a set) is used so the column order is deterministic and the
# selection works on pandas >= 2.0, where set indexers raise a TypeError.
df_model = main_data_set[[
    'funded_amount_percentage',
    'months_left_percentage',
    'spending_status',
]].copy()

In [None]:
# Preview the first rows of the modelling frame (features and target label).
df_model.head()

In [None]:
# Feature matrix and target vector.
# The features are selected with an explicit list: a set indexer gives an
# arbitrary column order (and raises on pandas >= 2.0), which made the meaning of
# each coefficient column, and of the inputs passed to predict(), ambiguous.
feature_columns = ['funded_amount_percentage', 'months_left_percentage']

x = df_model[feature_columns]
y = df_model['spending_status']

# Rescale both features to the range [-1, 1].
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split, so test-set statistics leak into the scaling — consider fitting on the
# training portion only.
normalized_range = MinMaxScaler(feature_range=(-1, 1))
x = normalized_range.fit_transform(x)

In [None]:
# Hold out 25% of the rows for evaluation. A fixed random_state makes the split,
# and every score derived from it, reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [None]:
# Multinomial logistic regression without regularisation, fitted once
# (the model was previously fitted twice on the same data).
# NOTE(review): penalty='none' was deprecated in scikit-learn 1.2 in favour of
# penalty=None — update the argument when the environment is upgraded.
regression = LogisticRegression(
    random_state=0,
    multi_class='multinomial',
    penalty='none',
    solver='newton-cg',
    max_iter=150,
    warm_start=False,
)

# fit() returns the estimator, so the cell still displays the fitted model.
regression.fit(x_train, y_train)

In [None]:
# Mean accuracy on the held-out test split, reported as a percentage.
prediction_score = regression.score(x_test, y_test)
prediction_score_percent = prediction_score * 100
print('Prediction score: ', prediction_score_percent)

In [None]:
# Intercept term(s) learned by the fitted logistic regression.
print(regression.intercept_)

In [None]:
# Feature coefficients learned by the fitted logistic regression.
print(regression.coef_)

In [None]:
# Predicted class labels for the held-out test features.
y_pred = regression.predict(x_test)

In [None]:
# Confusion matrix with rows = actual class and columns = predicted class.
# The class order is passed explicitly: by default confusion_matrix orders the
# classes alphabetically (On_Track, Overspending, Underspending), which did not
# match the hand-written row/column labels and mislabelled the table. A fixed
# label list also keeps the matrix 3x3 when a class is absent from the test split.
class_labels = ['Underspending', 'Overspending', 'On_Track']

confmtrx = confusion_matrix(y_test, y_pred, labels=class_labels)

pd.DataFrame(
    confmtrx,
    index=class_labels,
    columns=[f'predicted_{label}' for label in class_labels],
)

In [None]:
# Rows to score: identifying columns, the two model features and the rule-based label.
# A list (not a set) is used so the column order is deterministic and the
# selection works on pandas >= 2.0, where set indexers raise a TypeError.
model_to_be_predicted = main_data_set[[
    'member_key',
    'first_name',
    'last_name',
    'budget_level1_key',
    'budget_level1_name',
    'funded_amount_percentage',
    'months_left_percentage',
    'spending_status',
]].copy()

In [None]:
# Score every row with the fitted model and collect the result next to the
# identifying columns.
#
# Changes from the previous row-by-row loop:
#   * the features go through the same MinMax scaler the model was trained with
#     (the loop passed raw percentages to a model fitted on scaled values);
#   * one vectorised predict() call replaces per-row predict plus
#     DataFrame.append, which is quadratic and was removed in pandas 2.0;
#   * the unused per-row description string is gone (it raised a TypeError
#     whenever a key column was not a string).
identity_columns = ['member_key', 'first_name', 'last_name', 'budget_level1_key', 'budget_level1_name']

# Present the features in the order the scaler was fitted with. Scalers fitted on
# a DataFrame record that order in feature_names_in_; otherwise fall back to the
# order the original loop used.
feature_order = list(getattr(
    normalized_range,
    'feature_names_in_',
    ['funded_amount_percentage', 'months_left_percentage'],
))

df_prediction = model_to_be_predicted[identity_columns].reset_index(drop=True)

if len(df_prediction) > 0:
    scaled_features = normalized_range.transform(model_to_be_predicted[feature_order])
    df_prediction['prediction'] = regression.predict(scaled_features)
else:
    # Nothing to score: keep the expected columns without calling the model.
    df_prediction['prediction'] = pd.Series(dtype=object)

df_prediction


In [None]:
#compression_opts = dict(method='zip',
#                        archive_name='prediction.csv')

#df_prediction.to_csv('../dataSets/prediction.zip', compression=compression_opts)