In [1]:
# -------------------------- LIBRARIES NECESSARY IN THIS PROJECT  -------------------------- #

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import fmin
import warnings
#warnings.filterwarnings('ignore')
#%matplotlib inline
import zipfile

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# -------------------------- FUNCTIONS -------------------------- #

def getDataSet(data_path):
    """Read the CSV found at ``data_path`` and hand it back as a pandas DataFrame."""
    data_frame = pd.read_csv(data_path)
    return data_frame

def sigmoid(z):
    """
    Logistic function 1 / (1 + e**(-z)), evaluated element-wise.

    Parameters
    ----------
    z : scalar or array-like of numbers.

    Returns
    -------
    A NumPy float for scalar input, otherwise an ndarray with the shape of ``z``.

    The exponential is only ever taken of a non-positive number, so inputs of
    large magnitude saturate cleanly to 0.0 or 1.0. The naive form
    ``1 / (1 + np.e**(-z))`` raises OverflowError for a Python float such as
    -1000.0 and emits overflow warnings for NumPy arrays.
    """
    z = np.asarray(z, dtype=float)
    decay = np.exp(-np.abs(z))  # always within [0, 1]; cannot overflow
    # z >= 0: 1 / (1 + e^-z).  z < 0: the algebraically equal e^z / (1 + e^z).
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return result[()]  # unwrap 0-d arrays so scalar input yields a scalar

def cost_function(theta, X, y):
    """
    Logistic-regression cost (mean cross-entropy) and its gradient.

    Parameters
    ----------
    theta : parameter vector, multiplied as ``np.dot(X, theta)``.
    X     : design matrix with one row per sample.
    y     : labels, one per sample (presumably 0/1 -- confirm against callers).

    ``theta`` and ``y`` must agree in dimensionality (both 1-D, or both column
    vectors); mixing a column ``theta`` with a 1-D ``y`` makes NumPy broadcast
    the terms to an (m, m) matrix.

    Returns
    -------
    (J, grad) : the scalar cost and the gradient ``(h - y)^T X / m``.
    """
    m = np.size(y)

    h_theta_x = sigmoid(np.dot(X, theta))

    # Keep the predictions strictly inside (0, 1) for the logarithms only.
    # Without this a saturated prediction gives 0 * log(0) = NaN and poisons
    # the whole sum, even when that prediction is exactly right.
    eps = np.finfo(float).eps
    h_safe = np.clip(h_theta_x, eps, 1 - eps)

    term1 = (0 - y) * np.log(h_safe)
    term2 = (1 - y) * np.log(1 - h_safe)
    J = np.sum(term1 - term2) / m

    # The gradient uses the unclipped predictions; it has no singularity.
    grad = np.dot(np.transpose(h_theta_x - y), X) / m

    return (J, grad)

def gradient_descent(X, y, theta,
                    alpha=0.01, num_iterations=1500):
    """
    Fit logistic-regression parameters with batch gradient descent.

    Parameters
    ----------
    X              : (m, 3) design matrix.
    y              : m labels, either 1-D or an (m, 1) column.
    theta          : 3 initial parameters (any shape reshapeable to (3, 1)).
    alpha          : learning rate.
    num_iterations : number of update steps.

    Returns
    -------
    (theta, J_history, theta0_history, theta1_history, theta2_history)
    where ``theta`` is the final (3, 1) parameter column and each history
    list holds one value per iteration, recorded after the update.

    Notes
    -----
    * The residual is ``sigmoid(X @ theta) - y`` so the step descends the same
      logistic cost that ``cost_function`` reports in ``J_history``. A plain
      ``X @ theta - y`` residual would instead minimise squared error.
    * ``y`` is forced to a column so a 1-D label vector cannot broadcast the
      residual into an (m, m) matrix.
    """
    X = np.asarray(X, dtype=float)
    m = len(y)
    y = np.asarray(y, dtype=float).reshape(m, 1)
    theta = np.asarray(theta, dtype=float).reshape(3, 1)

    J_history = []
    theta0_history = []
    theta1_history = []
    theta2_history = []

    for _ in range(num_iterations):
        error = sigmoid(np.dot(X, theta)) - y      # (m, 1) residuals

        # Simultaneous update of all parameters: theta -= alpha/m * X^T error
        gradient = np.dot(X.T, error) / m          # (3, 1)
        theta = theta - alpha * gradient

        # store history values
        theta0_history.append(float(theta[0, 0]))
        theta1_history.append(float(theta[1, 0]))
        theta2_history.append(float(theta[2, 0]))
        J_history.append(cost_function(theta, X, y)[0])

    return (theta, J_history, theta0_history, theta1_history, theta2_history)

In [2]:
# -------------------------- PATH OF THE DATASETS USED IN THE PROJECT  -------------------------- #
# The context managers close both the archive and the member stream as soon as
# the CSV has been parsed, instead of leaving the file handles open.
with zipfile.ZipFile('dataSets/prediction.zip') as zf:
    with zf.open('prediction.csv') as prediction_csv:
        main_data_set_all_fields = pd.read_csv(prediction_csv)

In [3]:
 # main_data_set_all_fields

In [4]:
# Columns kept for modelling, in a fixed order. This must be a list: indexing a
# DataFrame with a set gives an unspecified column order and is rejected by
# recent pandas versions.
MODEL_COLUMNS = ['spent_amount_predicted_percentage', 'funded_amount_percentage', 'spending_status']

# Keep rows whose two percentages are strictly positive and whose
# spending_status is anything other than 2.
rows_for_model = (
    (main_data_set_all_fields['spent_amount_predicted_percentage'] > 0)
    & (main_data_set_all_fields['funded_amount_percentage'] > 0)
    & (main_data_set_all_fields['spending_status'] != 2)
)

df_graph = main_data_set_all_fields.loc[rows_for_model, MODEL_COLUMNS]

df_model = df_graph

df_model

Unnamed: 0,spent_amount_predicted_percentage,funded_amount_percentage,spending_status
0,55.570178,5.246879,0
1,66.666667,26.114818,0
2,33.333333,3.703735,0
3,8.333333,3.835020,0
4,678.800562,41.358973,0
...,...,...,...
8720,91.666667,10.889372,0
8721,16.666667,6.812000,0
8722,16.666667,3.291461,0
8723,66.666667,29.276512,0


In [5]:
# Scaler that linearly maps each feature column onto the interval [-1, 1].
normalized_range = sklearn.preprocessing.MinMaxScaler(
    feature_range=(-1, 1),
)

In [6]:
# Display the scaler's repr to confirm how it was configured.
normalized_range

MinMaxScaler(feature_range=(-1, 1))

In [7]:
# Feature columns in an explicit, fixed order. A set here would make the column
# order (and therefore which coefficient belongs to which feature) unspecified,
# and set indexers are rejected by recent pandas versions.
FEATURE_COLUMNS = ['spent_amount_predicted_percentage', 'funded_amount_percentage']
RANDOM_STATE = 42  # fixed seed so the split, and hence the score, is reproducible

features = df_model[FEATURE_COLUMNS]
labels = df_model['spending_status']

# NOTE(review): the scaler is fitted on all rows before the split, so the test
# rows influence the scaling range. Consider fitting on the training split only.
features = normalized_range.fit_transform(features)

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=RANDOM_STATE)

logisticRegression = LogisticRegression()
logisticRegression.fit(features_train, labels_train)

# score() returns the mean accuracy on the held-out rows; shown as a percentage.
scikit_score = logisticRegression.score(features_test, labels_test)
print('Prediction score: ', scikit_score * 100)


Prediction score:  91.88073394495413


In [8]:
model_to_be_predicted = main_data_set_all_fields

# Use the column order the scaler was fitted with when it recorded one
# (scikit-learn >= 1.0 fitted on a DataFrame); otherwise fall back to the
# order this cell has always assumed.
default_feature_order = ['spent_amount_predicted_percentage', 'funded_amount_percentage']
feature_order = list(getattr(normalized_range, 'feature_names_in_', default_feature_order))

if model_to_be_predicted.empty:
    predictions = []
else:
    # The classifier was trained on MinMax-scaled features, so the raw
    # percentages must go through the same fitted scaler before predict().
    scaled_features = normalized_range.transform(model_to_be_predicted[feature_order])
    # One vectorised call instead of one predict() per row.
    predictions = logisticRegression.predict(scaled_features)

identity_rows = model_to_be_predicted[
    ['member_key', 'first_name', 'budget_level1_name']
].itertuples(index=False, name=None)

for (member_key, first_name, level1_name), predicted in zip(identity_rows, predictions):
    # f-string formatting also copes with non-string values (e.g. NaN),
    # which `+` concatenation would reject with a TypeError.
    description = f' member_key: {member_key} First name: {first_name} in Level 1 == {level1_name}'
    if predicted == 0:
        print(description + ' -------------- UNDERSPENDING')
    else:
        print(description + ' -------------- OVERSPENDING')

 member_key: 00109970-7029-11eb-81d6-9d4df94b6224 First name: Wynny in Level 1 == Core ------------ UNDERSPENDING
 member_key: 0010f240-3b5d-11eb-aa73-7be7bbd3c758 First name: Beryl in Level 1 == Capacity Building ------------ UNDERSPENDING
 member_key: 0011ebe0-7722-11eb-9ade-8f3368e47951 First name: Eamon in Level 1 == Core ------------ UNDERSPENDING
 member_key: 0011ebe0-7722-11eb-9ade-8f3368e47951 First name: Eamon in Level 1 == Capacity Building ------------ UNDERSPENDING
 member_key: 007c1090-d6ca-11ea-bb8a-a73bf14c75f9 First name: Janean in Level 1 == Core ------------ UNDERSPENDING
 member_key: 007c1090-d6ca-11ea-bb8a-a73bf14c75f9 First name: Janean in Level 1 == Capacity Building ------------ UNDERSPENDING
 member_key: 0092d9f0-cbb3-11ea-a1e4-8bc6be8edc1a First name: Cindelyn in Level 1 == Core ------------ UNDERSPENDING
 member_key: 0092d9f0-cbb3-11ea-a1e4-8bc6be8edc1a First name: Cindelyn in Level 1 == Capacity Building ------------ UNDERSPENDING
 member_key: 009cfe40-c630-1