# Analysis on Kickstarter

In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from rdd import rdd
# Show all columns when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)


In [None]:
# Load the data; lines=True reads the file as newline-delimited JSON
# (one JSON object per line).
df = pd.read_json('../data/creator_first_project.json', lines = True)
# Cell output: (number of rows, number of columns)
df.shape

# Visualize the Data

In [None]:
# New Kickstarter Creators by Year
# Derive the calendar year and month of each launch. The .dt accessor only
# works if 'launched_at' is already a datetime column.
df['year'] = df['launched_at'].dt.year
df['month'] = df['launched_at'].dt.month

# Plot the number of rows per launch year
plt.figure(figsize=(10,6))
sns.countplot(data=df, x='year')
plt.title('New Kickstarter Creators by Year')

## Check first project statistics

In [None]:
# Convert some key statistics to integers so they are summarised numerically
# below (presumably 'spotlight' is a boolean flag — TODO confirm).
df['spotlight'] = df['spotlight'].astype(int)

# Replace missing with zeros for prelaunch_activated, i.e. a missing value is
# treated the same as "not activated".
df['prelaunch_activated'] = df['prelaunch_activated'].fillna(0)

In [None]:
# Summary statistics for the key first-project columns, with every value
# rendered as a string with two decimal places for readability.
df[['goal', 'pledged', 'staff_pick', 'backers_count', 'spotlight', 'prelaunch_activated']].describe().apply(lambda s: s.apply('{0:.2f}'.format))

## Define the Discontinuity

In [None]:
# Create Discontinuity Columns (the two candidate running variables)
# percent_to_goal: pledged as a fraction of the goal, so 1 means exactly funded.
# NOTE(review): a goal of 0 would produce inf/NaN here — confirm goals are > 0.
df['percent_to_goal'] = df['pledged'] / df['goal']
# dollars_to_goal: pledged minus goal, so 0 means exactly funded.
df['dollars_to_goal'] = df['pledged']  - df['goal']

## Percent to Goal

In [None]:
# Create a plotting dataframe
# Define the cutoff point on the percent_to_goal scale (1 = pledged equals goal)
cutoff_perc = 1  # Adjust this value based on your specific cutoff criteria


def generate_plotting_df(variable, cuttoff_perc, bins, data=None):
    """Bin a running variable on each side of a cutoff and aggregate outcomes.

    Rows are first restricted to ``0 < variable <= 2``. The remaining rows are
    split into those below the cutoff and those at or above it, each side is
    cut into ``bins`` equal-width bins, and the outcome columns are aggregated
    per bin.

    Parameters
    ----------
    variable : str
        Name of the running-variable column (e.g. ``'percent_to_goal'``).
    cuttoff_perc : float
        Cutoff separating the left (``<``) and right (``>=``) sides.
    bins : int
        Number of equal-width bins to build on each side of the cutoff.
    data : pandas.DataFrame, optional
        Frame to bin. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per bin (left bins first) with the bin interval in
        ``f"{variable}_binned"``, the mean of the ``total_*_after_first``
        count columns, the sum of ``total_raised_after_first``, the number of
        rows in ``n_obs``, the bin midpoint in ``variable`` and a boolean
        ``threshold`` column that is True for midpoints at or above the cutoff.

    Raises
    ------
    ValueError
        If no rows remain on either side of the cutoff after filtering.
    """
    source = df if data is None else data

    # Remove outliers: keep only rows with 0 < variable <= 2
    in_range = source[(source[variable] > 0) & (source[variable] <= 2)]

    binned_col = f"{variable}_binned"
    aggregations = {
        'total_successful_after_first': 'mean',
        'total_failed_after_first': 'mean',
        'total_canceled_after_first': 'mean',
        'total_raised_after_first': 'sum',
        'total_campaigns_after_first': 'mean',
        variable: 'size',  # Counts the number of observations in each bin
    }

    below_cutoff = in_range[in_range[variable] < cuttoff_perc]
    at_or_above_cutoff = in_range[in_range[variable] >= cuttoff_perc]

    binned_sides = []
    for side in (below_cutoff, at_or_above_cutoff):
        if side.empty:
            # pd.cut cannot derive bin edges from an empty column
            continue

        # Work on an independent copy so the new column is not written onto a
        # slice of the source frame.
        side = side.copy()

        # Bin each side separately so that no bin straddles the cutoff
        side[binned_col] = pd.cut(side[variable], bins=bins)

        # observed=False keeps empty bins in the output (n_obs == 0), which is
        # the behaviour the original implicit default provided.
        binned = (
            side.groupby(binned_col, observed=False)
            .agg(aggregations)
            .reset_index()
            .rename(columns={variable: 'n_obs'})
        )

        # Represent each bin by its midpoint on the running-variable scale
        binned[variable] = [interval.mid for interval in binned[binned_col]]
        binned_sides.append(binned)

    if not binned_sides:
        raise ValueError(f"no rows with 0 < {variable} <= 2 to bin")

    # Combine the binned data for both sides of the cutoff
    result = pd.concat(binned_sides, ignore_index=True)

    # Flag the bins that lie at or above the cutoff
    result['threshold'] = result[variable] >= cuttoff_perc
    return result

# Build the binned percent-to-goal table used by the plots below
plotting_df = generate_plotting_df('percent_to_goal', cutoff_perc, 20)

# Plot percent to goal on x axis and total number of projects on y axis.
# Each row of plotting_df is a bin midpoint, so weighting by n_obs turns the
# histogram of midpoints into a count of the underlying projects.
plt.figure(figsize=(10,6))
plt.xlim(0, 2)
sns.histplot(data=plotting_df, x='percent_to_goal', weights = 'n_obs')

In [None]:
# One scatter panel per later outcome, each plotted against the binned
# percent-to-goal midpoints with the cutoff (100% of goal) marked in red.
plt.figure(figsize=(15,15))

outcome_columns = [
    'total_successful_after_first',
    'total_failed_after_first',
    'total_canceled_after_first',
    'total_campaigns_after_first',
    'total_raised_after_first',
]

for panel, outcome in enumerate(outcome_columns, start=1):
    ax = plt.subplot(5, 1, panel)
    sns.scatterplot(data=plotting_df, x='percent_to_goal', y=outcome, ax=ax)
    ax.axvline(x=1, color='red', linestyle='--')
    ax.set_xlabel('')

# Only the bottom panel carries the shared x-axis label
ax.set_xlabel('Percent to Goal')


In [None]:
# Display the binned percent-to-goal table
plotting_df

## Dollars to Goal

In [None]:
# Create a plotting dataframe: bin dollars_to_goal separately on each side of
# the cutoff and aggregate the later-outcome columns per bin.
# Define the cutoff point (0 = pledged equals goal)
cutoff_abs = 0  # Adjust this value based on your specific cutoff criteria

# Remove outliers: keep projects that finished within 10,000 of their goal
plotting_df = df[(df['dollars_to_goal'] <= 10000) & (df['dollars_to_goal'] >= -10000)]

# Split data into left and right of the cutoff. Only failed projects are kept
# below the cutoff and only successful ones at or above it. The explicit
# .copy() gives each side its own frame, so adding the bin column below does
# not write onto a slice of plotting_df.
left_df = plotting_df[(plotting_df['state'] == 'failed') & (plotting_df['dollars_to_goal'] < cutoff_abs)].copy()
right_df = plotting_df[(plotting_df['state'] == 'successful') & (plotting_df['dollars_to_goal'] >= cutoff_abs)].copy()

# Bin each subset separately so that no bin straddles the cutoff
left_df['dollars_to_goal_binned'] = pd.cut(left_df['dollars_to_goal'], bins=10)
right_df['dollars_to_goal_binned'] = pd.cut(right_df['dollars_to_goal'], bins=10)

# Aggregate the binned data for each subset
outcome_aggregations = {
    'total_successful_after_first': 'mean',
    'total_failed_after_first': 'mean',
    'total_canceled_after_first': 'mean',
    'total_raised_after_first': 'sum',
    'total_campaigns_after_first': 'mean',
    'dollars_to_goal': 'size',  # Counts the number of observations in each bin
}
# observed=False keeps empty bins in the result, matching the behaviour of the
# previously implicit default.
left_binned_data = (
    left_df.groupby('dollars_to_goal_binned', observed=False)
    .agg(outcome_aggregations)
    .reset_index()
    .rename(columns={'dollars_to_goal': 'n_obs'})
)
right_binned_data = (
    right_df.groupby('dollars_to_goal_binned', observed=False)
    .agg(outcome_aggregations)
    .reset_index()
    .rename(columns={'dollars_to_goal': 'n_obs'})
)

# Add a midpoint for each bin for both left and right data
left_binned_data['dollars_to_goal'] = left_binned_data['dollars_to_goal_binned'].apply(lambda interval: interval.mid)
right_binned_data['dollars_to_goal'] = right_binned_data['dollars_to_goal_binned'].apply(lambda interval: interval.mid)

# Combine the binned data for both sides of the cutoff
plotting_df = pd.concat([left_binned_data, right_binned_data], ignore_index=True)

# Add Threshold Column
plotting_df['threshold'] = plotting_df['dollars_to_goal'] >= cutoff_abs

# Plot dollars to goal on x axis and total number of projects on y axis
plt.figure(figsize=(10,6))
plt.xlim(-10000, 10000)
sns.histplot(data=plotting_df, x='dollars_to_goal', weights = 'n_obs', bins = 20)

In [None]:
# One scatter panel per later outcome, each plotted against the binned
# dollars-to-goal midpoints.
plt.figure(figsize=(15,15))

outcome_columns = [
    'total_successful_after_first',
    'total_failed_after_first',
    'total_canceled_after_first',
    'total_campaigns_after_first',
    'total_raised_after_first',
]

for panel, outcome in enumerate(outcome_columns, start=1):
    ax = plt.subplot(5, 1, panel)
    sns.scatterplot(data=plotting_df, x='dollars_to_goal', y=outcome, ax=ax)
    # Mark the dollars-to-goal cutoff (0); x = 1 is the cutoff of the
    # percent-to-goal scale and is indistinguishable from 0 on this axis only
    # by accident.
    ax.axvline(x=cutoff_abs, color='red', linestyle='--')
    ax.set_xlabel('')

# Only the bottom panel carries the shared x-axis label
ax.set_xlabel('Dollars to Goal')


In [None]:
# Display the binned dollars-to-goal table
plotting_df

# Discontinuity Analysis

Kernel weighted RD

In [None]:
def kernel(R, c, h):
    """Return triangular kernel weights centred on ``c`` with bandwidth ``h``.

    The weight is ``1 - |R - c| / h`` for observations within ``h`` of the
    cutoff and 0 outside, so it is 1 at the cutoff and falls linearly to 0 at
    a distance of ``h``.

    Parameters
    ----------
    R : numpy.ndarray or pandas.Series
        Running-variable values.
    c : float
        Cutoff at which the kernel peaks.
    h : float
        Bandwidth; must be a positive scalar.

    Returns
    -------
    Same container type as ``R``, holding float weights in [0, 1].

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive (this also rejects NaN), since the
        division below would otherwise yield NaN or infinite weights.
    """
    if not h > 0:
        raise ValueError(f"bandwidth h must be positive, got {h!r}")

    distance = np.abs(R - c)
    inside_bandwidth = (distance <= h).astype(float)
    return inside_bandwidth * (1 - distance / h)

## Percent to Goal

In [None]:
# Regression frame for the percent-to-goal analysis. A project is treated
# (threshold = 1) when it reached its goal, i.e. percent_to_goal >= cutoff.
# The comparison is inclusive so that projects landing exactly on their goal
# are classified the same way as in the binned plots, which split on
# `>= cutoff`.
rdd_df = (
    df.assign(threshold=(df['percent_to_goal'] >= cutoff_perc).astype(int))
    .sort_values('percent_to_goal')
    .reset_index(drop=True)
)
# Explicit product column; the formulas below build their own interaction term
rdd_df['percent_to_goal*threshold'] = rdd_df['percent_to_goal'] * rdd_df['threshold']

Naive Model

In [None]:
# Naive RD: linear fit on the full sample with no `weights` argument, so all
# rows count equally.
# NOTE(review): inside a formula, `percent_to_goal*threshold` is presumably
# parsed as an interaction (both main effects plus their product) rather than
# as the identically named DataFrame column — confirm against the formula docs.
model = smf.wls('total_successful_after_first ~ percent_to_goal + threshold + percent_to_goal*threshold', rdd_df).fit()

# Show only the coefficient table
model.summary().tables[1]

In [None]:
# Rebuild the binned table (rows with 0 < percent_to_goal <= 2, binned on each
# side of the cutoff) to use as the scatter layer and as prediction inputs.
plotting_df = generate_plotting_df('percent_to_goal', 1, 20)


# Scatter of the per-bin mean outcome
ax = plotting_df.plot.scatter(x = 'percent_to_goal', y = 'total_successful_after_first', color = "C0")

# Convert threshold to int so it matches the 0/1 regressor the model was fit on
plotting_df['threshold'] = plotting_df['threshold'].astype(int)
plotting_df['percent_to_goal*threshold'] = plotting_df['percent_to_goal'] * plotting_df['threshold']

# Predict the fitted model at every bin midpoint
exog_dict = {"percent_to_goal": plotting_df['percent_to_goal'].values, "threshold": plotting_df['threshold'].values, "percent_to_goal*threshold": plotting_df['percent_to_goal*threshold'].values}
plotting_df['predictions'] = model.predict(exog_dict)
# Draw the fitted line separately on each side so it is not joined across the cutoff
left_plotting_df = plotting_df[plotting_df['percent_to_goal'] < 1]
right_plotting_df = plotting_df[plotting_df['percent_to_goal'] >= 1]
left_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
right_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
ax.axvline(x = 1, color = 'red', linestyle = '--')
plt.title('Regression Discontinuity')

Kernel weighted RD

In [None]:
def kernel(R, c, h):
    """Return triangular kernel weights centred on ``c`` with bandwidth ``h``.

    The weight is ``1 - |R - c| / h`` for observations within ``h`` of the
    cutoff and 0 outside, so it is 1 at the cutoff and falls linearly to 0 at
    a distance of ``h``.

    Parameters
    ----------
    R : numpy.ndarray or pandas.Series
        Running-variable values.
    c : float
        Cutoff at which the kernel peaks.
    h : float
        Bandwidth; must be a positive scalar.

    Returns
    -------
    Same container type as ``R``, holding float weights in [0, 1].

    Raises
    ------
    ValueError
        If ``h`` is not strictly positive (this also rejects NaN), since the
        division below would otherwise yield NaN or infinite weights.
    """
    if not h > 0:
        raise ValueError(f"bandwidth h must be positive, got {h!r}")

    distance = np.abs(R - c)
    inside_bandwidth = (distance <= h).astype(float)
    return inside_bandwidth * (1 - distance / h)

In [None]:
# Plot the kernel weight (cutoff 1, bandwidth 1) against percent to goal.
# The x values are sorted independently of the weights; the two line up only
# because rdd_df was itself sorted by percent_to_goal when it was built.
plt.plot(sorted(rdd_df["percent_to_goal"]), kernel(rdd_df["percent_to_goal"], c=1, h=1))
plt.xlabel("Percent to Goal")
plt.ylabel("Weight")
plt.xlim(-2, 4)
# Trailing semicolon suppresses the notebook's text output for this cell
plt.title("Kernel Weight by Goal Percentage");

In [None]:
# Kernel-weighted RD: same linear specification, but each row is weighted by
# the triangular kernel around the cutoff (c=1) with bandwidth h=1, so rows
# more than 1 away from the cutoff receive zero weight.
model = smf.wls("total_successful_after_first ~ percent_to_goal*threshold", data = rdd_df, weights = kernel(rdd_df['percent_to_goal'], c=1, h=1)).fit()

# Show only the coefficient table
model.summary().tables[1]

In [None]:
# Scatter of the per-bin mean outcome, overlaid with the kernel-weighted fit
ax = plotting_df.plot.scatter(x = 'percent_to_goal', y = 'total_successful_after_first', color = "C0")

# Convert threshold to int so it matches the 0/1 regressor the model was fit on
plotting_df['threshold'] = plotting_df['threshold'].astype(int)
plotting_df['percent_to_goal*threshold'] = plotting_df['percent_to_goal'] * plotting_df['threshold']

# Predict the fitted model at every bin midpoint
exog_dict = {"percent_to_goal": plotting_df['percent_to_goal'].values, "threshold": plotting_df['threshold'].values, "percent_to_goal*threshold": plotting_df['percent_to_goal*threshold'].values}
plotting_df['predictions'] = model.predict(exog_dict)
# Draw the fitted line separately on each side so it is not joined across the cutoff
left_plotting_df = plotting_df[plotting_df['percent_to_goal'] < 1]
right_plotting_df = plotting_df[plotting_df['percent_to_goal'] >= 1]
left_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
right_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
ax.axvline(x = 1, color = 'red', linestyle = '--')
plt.title('Regression Discontinuity')

In [None]:
# Kernel-weighted RD for every outcome: one panel per outcome showing the
# binned means, the fitted line on each side of the cutoff, and the estimated
# relative jump at the cutoff in the panel title.
plt.figure(figsize=(15,15))

# Make the regressors numeric BEFORE copying them into the prediction inputs
plotting_df['threshold'] = plotting_df['threshold'].astype(int)
plotting_df['percent_to_goal*threshold'] = plotting_df['percent_to_goal'] * plotting_df['threshold']
exog_dict = {"percent_to_goal": plotting_df['percent_to_goal'].values, "threshold": plotting_df['threshold'].values, "percent_to_goal*threshold": plotting_df['percent_to_goal*threshold'].values}

# The weights do not depend on the outcome, so compute them once
kernel_weights = kernel(rdd_df['percent_to_goal'], c=1, h=1)

for p, cause in enumerate(["total_successful_after_first", "total_failed_after_first", "total_canceled_after_first", "total_campaigns_after_first", "total_raised_after_first"], 1):
    ax = plt.subplot(5,1,p)
    ax = plotting_df.plot.scatter(x = 'percent_to_goal', y = cause, color = "C0", ax = ax)

    m = smf.wls(f"{cause}~percent_to_goal*threshold", rdd_df, weights = kernel_weights).fit()

    # percent_to_goal is not centred on the cutoff, so 'Intercept' and
    # 'threshold' alone describe x = 0. Evaluate both the untreated level and
    # the jump at the cutoff x = 1 instead (slope terms multiplied by 1).
    baseline_at_cutoff = m.params["Intercept"] + m.params["percent_to_goal"]
    jump_at_cutoff = m.params["threshold"] + m.params["percent_to_goal:threshold"]
    ate_pct = 100 * jump_at_cutoff / baseline_at_cutoff

    # Fitted line, drawn separately on each side so it is not joined across the cutoff
    plotting_df['predictions'] = m.predict(exog_dict)
    left_plotting_df = plotting_df[plotting_df['percent_to_goal'] < 1]
    right_plotting_df = plotting_df[plotting_df['percent_to_goal'] >= 1]
    left_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
    right_plotting_df.plot(x = 'percent_to_goal', y = 'predictions', color = 'C1', ax = ax)
    ax.axvline(x = 1, color = 'red', linestyle = '--')

    # Set the title last and only once so the estimated impact stays visible
    ax.set_title(f"Impact of Kickstarter on {cause}: {np.round(ate_pct, 2)}%")
    ax.legend()
    ax.set_xlim(0, 2)

plt.tight_layout()

## Dollars to Goal

In [None]:
# Regression frame for the dollars-to-goal analysis. The cutoff on this scale
# is 0 (pledged equals goal), not 1, and it is inclusive so that projects
# landing exactly on their goal count as treated, as in the binned plots.
rdd_df = (
    df.assign(threshold=(df['dollars_to_goal'] >= cutoff_abs).astype(int))
    .sort_values('dollars_to_goal')
    .reset_index(drop=True)
)
# Explicit product column; the formulas below build their own interaction term
rdd_df['dollars_to_goal*threshold'] = rdd_df['dollars_to_goal'] * rdd_df['threshold']

Naive Model

In [None]:
# Naive RD on the dollars scale: linear fit on the full sample with no
# `weights` argument, so all rows count equally.
# NOTE(review): inside a formula, `dollars_to_goal*threshold` is presumably
# parsed as an interaction (both main effects plus their product) rather than
# as the identically named DataFrame column — confirm against the formula docs.
model = smf.wls('total_successful_after_first ~ dollars_to_goal + threshold + dollars_to_goal*threshold', rdd_df).fit()

# Show only the coefficient table
model.summary().tables[1]

# McCrary Test

In [None]:
# Density check around the cutoff: count projects in equal-width bins of the
# running variable to look for bunching just above 100% of goal.
# 21 edges give 20 bins of width 0.1, so the cutoff (1.0) is a bin edge and no
# bin mixes projects from both sides of it.
bins = np.linspace(0, 2, 21)

# The 0–2 edges are on the percent-to-goal scale, so that is the column to bin.
# right=False makes bins left-closed, which puts projects exactly at the
# cutoff in the first bin to its right.
rdd_df['bin'] = pd.cut(rdd_df['percent_to_goal'], bins, right=False)

# Get Bin Counts, ordered by bin so they line up with the midpoints below
bin_counts = rdd_df['bin'].value_counts(sort = False).sort_index()

# Calculate the midpoints for each bin
midpoints = (bins[:-1] + bins[1:]) / 2

# Plot the bar plot
plt.bar(midpoints, bin_counts.values, width=(bins[1] - bins[0]), align='center')
plt.axvline(x = cutoff_perc, color = 'red', linestyle = '--')
plt.xlabel('Percent to Goal')
plt.ylabel('Count')
plt.title('Binned Data with Midpoints')
plt.show()


# RD Analysis with Optimal Bandwidth (IK)

Set Thresholds

In [None]:
# Cutoffs on each running-variable scale:
# percent_to_goal is pledged / goal, so the goal is reached at 1;
# dollars_to_goal is pledged - goal, so the goal is reached at 0.
perc_threshold = 1
abs_threshold = 0

Identify Optimal Bandwidth

In [None]:
# Compute a data-driven bandwidth for each running variable with the `rdd`
# package, using total_successful_after_first as the outcome.
# NOTE(review): the section heading says this is the Imbens-Kalyanaraman (IK)
# bandwidth — confirm against the rdd package documentation.
bandwidth_opt_perc = rdd.optimal_bandwidth(X = rdd_df['percent_to_goal'], Y = rdd_df['total_successful_after_first'], cut=perc_threshold)
bandwidth_opt_abs = rdd.optimal_bandwidth(X = rdd_df['dollars_to_goal'], Y = rdd_df['total_successful_after_first'], cut=abs_threshold)
print("Optimal Percentage bandwidth:", bandwidth_opt_perc)
print("Optimal Absolute bandwidth:", bandwidth_opt_abs)

Restrict Data to Optimal Bandwidth

In [None]:
# Restrict each sample to a window around its cutoff (presumably rows within
# the given bandwidth of `cut` — confirm against the rdd package docs).
perc_rdd_df = rdd.truncated_data(rdd_df, 'percent_to_goal', bandwidth_opt_perc, cut=perc_threshold)

# NOTE(review): the dollars sample uses a fixed bandwidth of 10000 instead of
# bandwidth_opt_abs (the data-driven variant is kept commented out below),
# while the kernel weights in the dollars model still use bandwidth_opt_abs.
#abs_rdd_df = rdd.truncated_data(rdd_df, 'dollars_to_goal', bandwidth_opt_abs, cut=abs_threshold)
abs_rdd_df = rdd.truncated_data(rdd_df, 'dollars_to_goal', 10000, cut=abs_threshold)

# Report how many rows and columns survive each truncation
print(f"Percentage RDD Shape: {perc_rdd_df.shape}")
print(f"Absolute RDD Shape: {abs_rdd_df.shape}")



## Dollars to Goal

In [None]:
# Kernel-weighted RD on the dollars scale, fit on the truncated sample.
# The triangular kernel gives zero weight to rows farther than
# bandwidth_opt_abs from the cutoff, so if that bandwidth is below the fixed
# 10000 used to build abs_rdd_df, the extra rows do not influence the fit.
model = smf.wls("total_successful_after_first ~ dollars_to_goal*threshold", data = abs_rdd_df, weights = kernel(abs_rdd_df['dollars_to_goal'], c=abs_threshold, h=bandwidth_opt_abs)).fit()

# Show only the coefficient table
model.summary().tables[1]



In [None]:
# Relative jump at the cutoff. The dollars cutoff is 0, so the intercept is
# the fitted untreated level at the cutoff and the 'threshold' coefficient is
# the jump there.
untreated_level = model.params["Intercept"]
treated_level = model.params["threshold"] + untreated_level
ate_pct = 100*(treated_level/untreated_level - 1)
print(f"Impact of Winning First Kickstarter on Total Successful Projects: {np.round(ate_pct, 2)}%")

## Percent to Goal

In [None]:
# Kernel-weighted RD on the percent scale, fit on the sample truncated to the
# optimal bandwidth, with triangular weights of the same bandwidth.
# NOTE(review): the `threshold` column in perc_rdd_df is inherited from
# whichever cell last rebuilt rdd_df; at this point that is the cell that
# derives it from dollars_to_goal — confirm it matches the percent cutoff.
model = smf.wls("total_successful_after_first ~ percent_to_goal*threshold", data = perc_rdd_df, weights = kernel(perc_rdd_df['percent_to_goal'], c=perc_threshold, h=bandwidth_opt_perc)).fit()

# Show only the coefficient table
model.summary().tables[1]



In [None]:
# Relative jump at the cutoff. percent_to_goal is not centred on the cutoff,
# so 'Intercept' and 'threshold' on their own describe percent_to_goal = 0.
# Add the slope terms evaluated at the cutoff to get the fitted untreated
# level and the jump at perc_threshold.
baseline_at_cutoff = model.params["Intercept"] + model.params["percent_to_goal"] * perc_threshold
jump_at_cutoff = model.params["threshold"] + model.params["percent_to_goal:threshold"] * perc_threshold
ate_pct = 100 * jump_at_cutoff / baseline_at_cutoff
print(f"Impact of Winning First Kickstarter on Total Successful Projects: {np.round(ate_pct, 2)}%")