In [None]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ast
import random
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from IPython.display import SVG
from graphviz import Source
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegressionCV
import cvxpy as cp

# Load the raw project dataset from a shared Google Drive CSV export
# (requires network access). Note that the preprocessing cell further down
# rebinds `df` to the processed, one-hot encoded frame.
df = pd.read_csv('https://docs.google.com/uc?export=download&id=14CZBR0p358hjSolZLTUtMk5LCel7QeCG')

In [None]:
# Keep only the columns needed for feature engineering and for the target.
cols_to_keep = ["backers_count", "blurb", "category", "country", "created_at", "currency", "deadline", "goal", "name", "spotlight", "staff_pick", "state", "usd_pledged"]
df_new = df.filter(cols_to_keep)

# Keep only campaigns with a definite outcome: rows whose state is 'live'
# or 'canceled' are removed, so every remaining non-'successful' row counts
# as a failure when the target is encoded below.
df_new = df_new[~df_new["state"].isin(["live", "canceled"])].reset_index(drop=True)

# Campaign duration in whole days. The /60/60/24 conversion treats
# 'created_at' and 'deadline' as timestamps in seconds (assumption - confirm).
duration = ((df_new["deadline"] - df_new["created_at"]) / 60 / 60 / 24).round().astype(int)

# Each 'category' cell is a dict literal stored as text. Parse it once per row
# and use the parent category when there is one, otherwise the category's own
# name.
category_info = df_new["category"].map(ast.literal_eval)
category_names = category_info.map(
    lambda info: info["parent_name"] if "parent_name" in info else info["name"]
)

# Replace the text columns by their lengths. str() keeps a missing value from
# raising (it is counted as the 3 characters of 'nan', the same convention the
# input-dataset cell uses). The "(Canceled)" marker is removed from the name
# before measuring; the original loop discarded the result of str.replace.
df_new["blurb"] = df_new["blurb"].astype(str).str.len()
df_new["name"] = (
    df_new["name"].astype(str).str.replace("(Canceled)", "", regex=False).str.len()
)
df_new["duration"] = duration
df_new["category"] = category_names

# Binary encodings: staff_pick -> 1 when True, state -> 1 when 'successful'.
df_new["staff_pick"] = df_new["staff_pick"].eq(True).astype(int)
df_new["state"] = df_new["state"].eq("successful").astype(int)

# 'blurb' and 'name' now hold lengths, so name them accordingly.
df_new = df_new.rename(columns={"blurb":"blurb_length", "name":"name_length"})

# Drop the raw timestamps (replaced by 'duration') and the columns that are
# not used as model features.
df_new = df_new.drop(columns=["created_at", "deadline", "spotlight", "currency", "country", "usd_pledged", "backers_count"])

# One-hot encode the category; the dummy columns are named 'category_<name>'.
df_test = pd.get_dummies(df_new, columns=['category'])

# NOTE: this rebinds `df` from the raw download to the processed frame, so the
# cell cannot be re-run without first re-running the load cell.
df = df_test

In [None]:
# Hold out 30% of the rows for testing; 'state' is the prediction target and
# every other column is a feature. `columns=` replaces the positional axis
# argument, which pandas no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='state'), df['state'], test_size=0.30, random_state=0)

In [None]:
def fit_and_score_model(mdl, X_train, X_test, y_train, y_test, random_state=0):
    """
    Train ``mdl`` on the training split and report its score on both splits.

    The model is fitted in place with ``mdl.fit(X_train, y_train)`` and then
    evaluated with ``mdl.score`` on the training data and on the testing data.
    Both scores are printed rounded to three decimals.

    ``random_state`` is accepted for interface compatibility but is not used
    inside this function.

    Returns
    -------
    tuple
        ``(train_score, test_score)``, unrounded.
    """
    mdl.fit(X_train, y_train)

    # Score the training split first, then the held-out split
    splits = ((X_train, y_train), (X_test, y_test))
    train_score, test_score = [mdl.score(features, target) for features, target in splits]

    # Report both scores
    train_line = 'the accuracy on the: \n\t training data is {}'
    test_line = '\t testing data is {}'
    print(train_line.format(round(train_score,3)))
    print(test_line.format(round(test_score, 3)))

    return train_score, test_score

In [None]:
# Random forest used later (as the global `mdl`) to predict each project's
# probability of success.
forest_params = dict(
    n_estimators=50,
    max_depth=17,
    bootstrap=True,
    random_state=0,
    max_samples=0.2,
)
mdl = RandomForestClassifier(**forest_params)

# The label is printed without a newline so the scores follow on the same line.
print('For max depth 17 ', end ='')
train, test = fit_and_score_model(mdl, X_train, X_test, y_train, y_test)

For max depth 17 the accuracy on the: 
	 training data is 0.831
	 testing data is 0.712


In [None]:
# Project categories. Each name must match the suffix of the one-hot column
# 'category_<name>', because generateProjectList looks up a user's preference
# with users[<name>]; the earlier spelling "Jornalisim" could never match a
# 'category_Journalism' column.
catagories = [ "Art", "Comics", "Crafts", "Dance", "Design", "Fashion",
               "Film & Video", "Food", "Games", "Journalism", "Music", "Photography", "Publishing", "Technology", "Theater" ]

# Each profile holds one preference weight per category, in the order above.

# User likes all categories evenly
user1 = pd.Series([0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667, 0.06667])

# User likes Technology dominantly
user2 = pd.Series([0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.86, 0.01])

# User likes Comics, Film & Video, Food, Games, Journalism, Music, Photography
# and Technology by an equal amount, more than the other categories
user3 = pd.Series([0.02, 0.1075, 0.02, 0.02, 0.02, 0.02, 0.1075, 0.1075, 0.1075, 0.1075, 0.1075, 0.1075, 0.02, 0.1075, 0.02])

# One row per user, one column per category
users = pd.DataFrame([user1.tolist(), user2.tolist(), user3.tolist()], columns = catagories)

users

Unnamed: 0,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Jornalisim,Music,Photography,Publishing,Technology,Theater
0,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667,0.06667
1,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.86,0.01
2,0.02,0.1075,0.02,0.02,0.02,0.02,0.1075,0.1075,0.1075,0.1075,0.1075,0.1075,0.02,0.1075,0.02


In [None]:
# Show the first user's preference weights (the profile is named `user1`;
# `users1` was never defined and raised NameError).
user1

NameError: ignored

In [None]:
# Show the second user's preference weights (the profile is named `user2`;
# `users2` was never defined).
user2

In [None]:
# Show the third user's preference weights (the profile is named `user3`;
# `users3` was never defined).
user3

In [None]:
# A different dataset from the one the model was trained on; it supplies the
# candidate projects for the optimization model.
df_input = pd.read_csv('https://drive.google.com/uc?export=download&id=1wGtVY6oIV5xtw7hqdRUXPkfNV50r3UC9')

cols_to_keep = ["blurb", "category", "created_at", "deadline", "goal", "name", "staff_pick"]
df_input = df_input.filter(cols_to_keep)

# Campaign duration in whole days. The /60/60/24 conversion treats
# 'created_at' and 'deadline' as timestamps in seconds (assumption - confirm).
duration = ((df_input["deadline"] - df_input["created_at"]) / 60 / 60 / 24).round().astype(int)

# Each 'category' cell is a dict literal stored as text. Parse it once per row
# and use the parent category when there is one, otherwise the category's own
# name.
category_info = df_input["category"].map(ast.literal_eval)
category_names = category_info.map(
    lambda info: info["parent_name"] if "parent_name" in info else info["name"]
)

# Replace the text columns by their lengths. str() keeps a missing value from
# raising (it is counted as the 3 characters of 'nan'). The "(Canceled)"
# marker is removed from the name before measuring; the original loop
# discarded the result of str.replace.
df_input["blurb"] = df_input["blurb"].astype(str).str.len()
df_input["name"] = (
    df_input["name"].astype(str).str.replace("(Canceled)", "", regex=False).str.len()
)
df_input["duration"] = duration
df_input["category"] = category_names

# Binary encoding: staff_pick -> 1 when True, 0 otherwise.
df_input["staff_pick"] = df_input["staff_pick"].eq(True).astype(int)

# 'blurb' and 'name' now hold lengths, so name them accordingly.
df_input = df_input.rename(columns={"blurb":"blurb_length", "name":"name_length"})

# The raw timestamps are no longer needed once 'duration' exists.
df_input = df_input.drop(columns=["created_at", "deadline"])

# One-hot encode the category; the dummy columns are named 'category_<name>'.
# The resulting frame is later passed to mdl.predict_proba, so its columns are
# expected to line up with the training features (verify if the two datasets
# do not contain the same set of categories).
df_input = pd.get_dummies(df_input, columns=['category'])

In [None]:
# Minimum success probabilities a project must reach to be selectable in
# optimize_with_risk; that function picks one entry via its `user_num` index.
minProb = [0.15, 0.25, 0.35, 0.5, 0.65, 0.75, 0.9]

In [None]:
def generateProjectList(df_input, num_projects_each_category=2):
  """
  Draw a random set of candidate projects and describe them for the optimizer.

  For every one-hot category column of ``df_input``, up to
  ``num_projects_each_category`` distinct rows flagged with that category are
  sampled at random. The earlier version drew a number in
  ``0..len(members)`` and used it directly as a row position of ``df_input``,
  so the picked rows were not tied to the category and could repeat.

  Reads the module-level ``users`` frame (one row per user, one column per
  category name) and the fitted classifier ``mdl``.

  Parameters
  ----------
  df_input : DataFrame with the feature columns 'blurb_length', 'goal',
      'name_length', 'staff_pick', 'duration' plus one-hot 'category_<name>'
      columns.
  num_projects_each_category : number of projects to draw per category
      (fewer are drawn when a category has fewer rows).

  Returns
  -------
  project_list : DataFrame of the picked rows, re-indexed from 0.
  index_names : list of labels 'Project 1', 'Project 2', ...
  utility_values_all_users : list with, per user, the list of that user's
      preference weight for each picked project's category.
  probabilities : numpy array of mdl's predicted success probability for each
      picked project, rounded to two decimals.
  """
  feature_cols = ['blurb_length', 'goal', 'name_length', 'staff_pick', 'duration']

  # Everything that is not a model feature is a one-hot category column
  one_hot = df_input.drop(columns=feature_cols)

  # Sample row positions category by category, without repeats
  indexes = []
  for category in one_hot.columns:
    members = np.flatnonzero(one_hot[category].to_numpy() == 1).tolist()
    sample_size = min(num_projects_each_category, len(members))
    indexes.extend(random.sample(members, sample_size))

  project_list = df_input.iloc[indexes].reset_index(drop=True)
  index_names = ['Project {}'.format(i + 1) for i in range(len(project_list))]

  # Category name of each picked project: the column whose flag is set, with
  # the 'category_' prefix removed (split once so names containing '_' survive)
  picked_one_hot = project_list.drop(columns=feature_cols)
  flagged_column = (picked_one_hot.to_numpy() == 1).argmax(axis=1)
  project_categories = [
    picked_one_hot.columns[position].split('_', 1)[1] for position in flagged_column
  ]

  # Utility of every project for every user = the user's weight for its category
  utility_values_all_users = [
    [users[name].iloc[user_num] for name in project_categories]
    for user_num in range(len(users))
  ]

  # Probability of the positive class (column 1) for each picked project
  probabilities = np.round(mdl.predict_proba(project_list)[:, 1], 2)

  return project_list, index_names, utility_values_all_users, probabilities

In [None]:
def generateCosts():
  """
  Draw one random integer cost per project in the global ``project_list``.

  Each cost comes from ``random.randint(15, 200)`` (both ends inclusive).
  Returns the costs as a numpy array in project order.
  """
  return np.array([random.randint(15, 200) for _ in range(len(project_list))])

In [None]:
def benchmark(user_num, costs, budget=400):
  """
  Pick the projects that maximize a user's total utility within a budget,
  ignoring the predicted probabilities of success.

  Reads the module-level ``project_list`` and ``utility_values_all_users``.

  Parameters
  ----------
  user_num : index of the user in ``utility_values_all_users``.
  costs : array with one cost per project.
  budget : maximum total cost of the selected projects.

  Returns
  -------
  (objective value, number of selected projects)

  Raises
  ------
  RuntimeError if the solver does not return a solution.
  """
  num_projects = len(project_list)
  x = cp.Variable(num_projects, boolean=True)

  utility_values = np.array(utility_values_all_users[user_num])

  # `@` is the inner product; `*` between two vectors is deprecated in cvxpy
  obj = cp.Maximize(x @ utility_values)
  cons = [x @ costs <= budget]

  prob = cp.Problem(obj, cons)
  prob.solve(verbose=False)

  if x.value is None:
    raise RuntimeError('benchmark: solver returned no solution (status: {})'.format(prob.status))

  # Round before casting: a solver value such as 0.999999 must count as 1,
  # whereas a plain astype(int) would truncate it to 0
  x_np_array = np.rint(x.value).astype(int)
  num_selected = int(x_np_array.sum())

  return obj.value, num_selected

In [None]:
def optimize(user_num, costs, budget=400):
  """
  Pick the projects that maximize a user's probability-weighted utility
  (utility * predicted probability of success) within a budget.

  Reads the module-level ``project_list``, ``utility_values_all_users`` and
  ``probabilities``.

  Parameters
  ----------
  user_num : index of the user in ``utility_values_all_users``.
  costs : array with one cost per project.
  budget : maximum total cost of the selected projects.

  Returns
  -------
  (objective value, number of selected projects)

  Raises
  ------
  RuntimeError if the solver does not return a solution.
  """
  num_projects = len(project_list)
  x = cp.Variable(num_projects, boolean=True)

  utility_values = np.array(utility_values_all_users[user_num])

  # Element-wise product of two numpy arrays: expected utility per project
  expected_utility = utility_values * probabilities

  # `@` is the inner product; `*` between two vectors is deprecated in cvxpy
  obj = cp.Maximize(x @ expected_utility)
  cons = [x @ costs <= budget]

  prob = cp.Problem(obj, cons)
  prob.solve(verbose=False)

  if x.value is None:
    raise RuntimeError('optimize: solver returned no solution (status: {})'.format(prob.status))

  # Round before casting: a solver value such as 0.999999 must count as 1,
  # whereas a plain astype(int) would truncate it to 0
  x_np_array = np.rint(x.value).astype(int)
  num_selected = int(x_np_array.sum())

  return obj.value, num_selected

In [None]:
def optimize_with_risk(user_num, costs, budget=400, utility_user_num=0):
  """
  Like ``optimize``, but a project may only be selected when its predicted
  probability of success is at least ``minProb[user_num]``.

  Reads the module-level ``project_list``, ``utility_values_all_users``,
  ``probabilities`` and ``minProb``.

  Parameters
  ----------
  user_num : index into ``minProb``, i.e. which risk threshold to apply.
      NOTE: it does NOT select the utility profile.
  costs : array with one cost per project.
  budget : maximum total cost of the selected projects.
  utility_user_num : index of the user in ``utility_values_all_users`` whose
      utilities are maximized. Defaults to 0, which is what this function has
      always used, so every ``user_num`` shares user 0's preferences unless
      this argument is given.

  Returns
  -------
  (objective value, number of selected projects)

  Raises
  ------
  RuntimeError if the solver does not return a solution.
  """
  num_projects = len(project_list)
  x = cp.Variable(num_projects, boolean=True)

  utility_values = np.array(utility_values_all_users[utility_user_num])
  expected_utility = utility_values * probabilities

  # `@` is the inner product; `*` between two vectors is deprecated in cvxpy
  obj = cp.Maximize(x @ expected_utility)
  cons = [x @ costs <= budget]

  # Risk limit: x[i] is forced to 0 for every project whose probability is
  # below the threshold (same effect as x[i]*p[i] >= x[i]*threshold per project)
  allowed = (probabilities >= minProb[user_num]).astype(int)
  cons.append(x <= allowed)

  prob = cp.Problem(obj, cons)
  prob.solve(verbose=False)

  if x.value is None:
    raise RuntimeError('optimize_with_risk: solver returned no solution (status: {})'.format(prob.status))

  # Round before casting: a solver value such as 0.999999 must count as 1,
  # whereas a plain astype(int) would truncate it to 0
  x_np_array = np.rint(x.value).astype(int)
  num_selected = int(x_np_array.sum())

  return obj.value, num_selected

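Adding the regularizer (the minimum-probability constraint) makes all the users choose the same projects and the same number of projects. The reason is that `optimize_with_risk` always maximizes user 0's utility values (`utility_values_all_users[0]`); its `user_num` argument only selects the threshold in `minProb`. Without the constraint (i.e. with `optimize`), the users choose different projects and different numbers of projects.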

In [None]:
# Experiment: average result of optimize() per user.
# The outer loop draws a fresh project list, the inner loop draws fresh costs;
# within one cost draw every user solves the same problem instance.
numRepetitions = 30

avgTotal = {}
cols=["Project","Probability"]

for _ in range(numRepetitions):
  project_list, index_names, utility_values_all_users, probabilities = generateProjectList(df_input)

  # Running sums [objective value, number of projects] for this project list
  avgResults = {}
  for _ in range(numRepetitions):
    costs = generateCosts()
    for user in range(len(users)):

      totalObjVal, totalNumProjectsChosen = optimize(user, costs)

      if user not in avgResults:
        avgResults[user] = [totalObjVal, totalNumProjectsChosen]
      else:
        avgResults[user][0] += totalObjVal
        avgResults[user][1] += totalNumProjectsChosen

  # Fold this project list's per-cost-draw averages into the grand totals
  for user, (utility_sum, count_sum) in avgResults.items():
    per_draw = [utility_sum / numRepetitions, count_sum / numRepetitions]
    if user not in avgTotal:
      avgTotal[user] = per_draw
    else:
      avgTotal[user][0] += per_draw[0]
      avgTotal[user][1] += per_draw[1]

# Dividing once more by the number of project lists gives the overall averages
for user, (utility_total, count_total) in avgTotal.items():
  print("For user {}".format(user))
  print("The average utility: {}" .format(utility_total / numRepetitions))
  print("The average number of projects chosen: {}\n" .format(count_total / numRepetitions))

For user 0
The average utility: 0.3587357136666666
The average number of projects chosen: 8.056666666666667

For user 1
The average utility: 0.5642998888888889
The average number of projects chosen: 7.11111111111111

For user 2
The average utility: 0.3804386944444444
The average number of projects chosen: 6.459999999999999



Each backer has the same budget and has to choose from the same set of projects. The projects don't have equal costs.

In [None]:
# Experiment: average result of optimize_with_risk() per entry of minProb.
# The outer loop draws a fresh project list, the inner loop draws fresh costs.
# Here the loop variable `user` is the index into minProb that is passed to
# optimize_with_risk.
numRepetitions = 30

avgTotal = {}

for _ in range(numRepetitions):
  project_list, index_names, utility_values_all_users, probabilities = generateProjectList(df_input)

  # Running sums [objective value, number of projects] for this project list
  avgResults = {}
  for _ in range(numRepetitions):
    costs = generateCosts()
    for user in range(len(minProb)):

      totalObjVal, totalNumProjectsChosen = optimize_with_risk(user, costs)

      if user not in avgResults:
        avgResults[user] = [totalObjVal, totalNumProjectsChosen]
      else:
        avgResults[user][0] += totalObjVal
        avgResults[user][1] += totalNumProjectsChosen

  # Fold this project list's per-cost-draw averages into the grand totals
  for user, (utility_sum, count_sum) in avgResults.items():
    per_draw = [utility_sum / numRepetitions, count_sum / numRepetitions]
    if user not in avgTotal:
      avgTotal[user] = per_draw
    else:
      avgTotal[user][0] += per_draw[0]
      avgTotal[user][1] += per_draw[1]

# Dividing once more by the number of project lists gives the overall averages
for user, (utility_total, count_total) in avgTotal.items():
  print("For user {}".format(user))
  print("The average utility: {}" .format(utility_total / numRepetitions))
  print("The average number of projects chosen: {}\n" .format(count_total / numRepetitions))

For user 0
The average utility: 0.3517353636666666
The average number of projects chosen: 8.103333333333332

For user 1
The average utility: 0.35093236055555554
The average number of projects chosen: 7.998888888888889

For user 2
The average utility: 0.34795369311111113
The average number of projects chosen: 7.751111111111111

For user 3
The average utility: 0.333376668
The average number of projects chosen: 6.973333333333334

For user 4
The average utility: 0.2845468192222222
The average number of projects chosen: 5.442222222222222

For user 5
The average utility: 0.2245497454444444
The average number of projects chosen: 4.035555555555556

For user 6
The average utility: 0.05311376666666669
The average number of projects chosen: 0.8577777777777779



In [None]:
# Experiment: average result of benchmark() per user.
# The outer loop draws a fresh project list, the inner loop draws fresh costs;
# within one cost draw every user solves the same problem instance.
numRepetitions = 30

avgTotal = {}

for _ in range(numRepetitions):
  project_list, index_names, utility_values_all_users, probabilities = generateProjectList(df_input)

  # Running sums [objective value, number of projects] for this project list
  avgResults = {}
  for _ in range(numRepetitions):
    costs = generateCosts()
    for user in range(len(users)):

      totalObjVal, totalNumProjectsChosen = benchmark(user, costs)

      if user not in avgResults:
        avgResults[user] = [totalObjVal, totalNumProjectsChosen]
      else:
        avgResults[user][0] += totalObjVal
        avgResults[user][1] += totalNumProjectsChosen

  # Fold this project list's per-cost-draw averages into the grand totals
  for user, (utility_sum, count_sum) in avgResults.items():
    per_draw = [utility_sum / numRepetitions, count_sum / numRepetitions]
    if user not in avgTotal:
      avgTotal[user] = per_draw
    else:
      avgTotal[user][0] += per_draw[0]
      avgTotal[user][1] += per_draw[1]

# Dividing once more by the number of project lists gives the overall averages
for user, (utility_total, count_total) in avgTotal.items():
  print("For user {}".format(user))
  print("The average utility: {}" .format(utility_total / numRepetitions))
  print("The average number of projects chosen: {}\n" .format(count_total / numRepetitions))

For user 0
The average utility: 0.5702507333333333
The average number of projects chosen: 8.553333333333335

For user 1
The average utility: 1.336277777777778
The average number of projects chosen: 7.261111111111109

For user 2
The average utility: 0.6912722222222224
The average number of projects chosen: 7.467777777777775



In [None]:
# Compare, per user, the probability-weighted utility of the benchmark's
# selection (chosen on utility alone) with the objective reached by optimize().
num_reps = 1
num_users = len(users)

total_bench = [0] * num_users
total_opt = [0] * num_users


def _benchmark_choice(user_num, costs, budget=400):
  """
  Return the 0/1 selection vector of the utility-only knapsack.

  This solves the same model as benchmark() (whose default budget is also
  400). It is needed here because benchmark() only returns how many projects
  were selected, not which ones, so its second return value cannot be
  iterated over.
  """
  x = cp.Variable(len(project_list), boolean=True)
  utility_values = np.array(utility_values_all_users[user_num])
  problem = cp.Problem(cp.Maximize(x @ utility_values), [x @ costs <= budget])
  problem.solve(verbose=False)
  if x.value is None:
    raise RuntimeError('solver returned no solution (status: {})'.format(problem.status))
  # Round before casting so a solver value such as 0.999999 counts as 1
  return np.rint(x.value).astype(int)


for rep in range(num_reps):
  project_list, index_names, utility_values_all_users, probabilities = generateProjectList(df_input)
  for user in range(num_users):
    costs = generateCosts()

    chosen = _benchmark_choice(user, costs)
    utils = np.array(utility_values_all_users[user])

    # Re-score the benchmark's selection with the success probabilities so it
    # is measured on the same scale as optimize()'s objective
    corrected_obj = float(np.sum(chosen * probabilities * utils))

    obj_optimized, num_chosen = optimize(user, costs)

    total_bench[user] += corrected_obj
    total_opt[user] += obj_optimized

for user in range(num_users):
  print("For User {}:\n\tBenchmark: {}\n\tOptimized: {}\n".format(user, total_bench[user]/num_reps, total_opt[user]/num_reps))

For User 0:
	Benchmark: 0.3240162
	Optimized: 0.3340167

For User 1:
	Benchmark: 0.030900000000000004
	Optimized: 0.0421

For User 2:
	Benchmark: 0.202675
	Optimized: 0.256925

