# Descriptive Statistics

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme to every matplotlib figure in this notebook
sns.set_theme()

# Never truncate columns or rows when pandas renders a frame or series
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Show statistics in non-scientific format (fixed two decimals)
pd.set_option('display.float_format', '{:.2f}'.format)

## Import Data

In [2]:
# Read the line-delimited JSON file of Kickstarter projects and report its (rows, columns)
kickstarter_path = '../data/kickstarter.json'
data = pd.read_json(kickstarter_path, lines=True)
print(data.shape)

(429463, 129)


## Check number of NAs in each column

In [None]:
# Count the missing values in every column and print the full listing
na_counts = data.isna().sum()
print(na_counts)

## Raw Data Descriptive Statistics

### Kickstarter by creation, launch and deadline

In [3]:
# Earliest and latest launch values found in the data
# NOTE(review): the explicit unit='s' conversion of launched_at happens in a later cell;
# this cell presumably relies on read_json having parsed the column already — confirm.
launch_times = data['launched_at']
first_kickstarter, last_kickstarter = launch_times.min(), launch_times.max()

print(f"First Kickstarter campaign: {first_kickstarter}")
print(f"Last Kickstarter campaign: {last_kickstarter}")

First Kickstarter campaign: 2009-04-24 19:52:03
Last Kickstarter campaign: 2024-10-09 12:55:10


In [None]:
# Parse the three timestamp columns (interpreted with unit='s') and derive a
# calendar-year column for each of them
for stamp_col, year_col in [
    ('created_at', 'created_year'),
    ('launched_at', 'launched_year'),
    ('deadline', 'deadline_year'),
]:
    data[stamp_col] = pd.to_datetime(data[stamp_col], unit='s')
    data[year_col] = data[stamp_col].dt.year

# Project counts per year, one column per timestamp, aligned on the year index
years = pd.concat(
    [data[col].value_counts() for col in ('created_year', 'launched_year', 'deadline_year')],
    axis=1,
    keys=['Created Year', 'Launched Year', 'Deadline Year'],
)
years.sort_index()

In [None]:
# Grouped bar chart of the yearly counts, one bar per timestamp column
ax = years.sort_index().plot(kind='bar', figsize=(15, 5))
ax.set_title('Number of Projects by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Projects')
plt.show()



### Kickstarters by Project State

In [None]:
# Overall number of rows (projects) in the raw data
total_projects = data.shape[0]
print(f'Total number of projects: {total_projects}')
print('\n')

# Count and percentage share of projects in each state
state_col = data['state']
projects_by_state = pd.concat(
    [state_col.value_counts(), state_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(projects_by_state)

### Kickstarters by Country

In [None]:
# Count and percentage share of projects per country
country_col = data['country']
country_data = pd.concat(
    [country_col.value_counts(), country_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(country_data)

### Kickstarters by Category

In [None]:
# Number of projects per category, listed alphabetically by category name
kickstarter_category_counts = data['category_name'].value_counts()
kickstarter_category_counts.sort_index()

### Kickstarters by Dollar Goal Bins

In [None]:
# Convert pledged and goal amounts with each row's static_usd_rate
data['pledged_usd_static'] = data['pledged'].mul(data['static_usd_rate'])
data['goal_usd_static'] = data['goal'].mul(data['static_usd_rate'])

# Edges of the goal-size bins and the label attached to each interval
bins = [0, 10, 100, 1000, 10000, 100000, 1000000, np.inf]
goal_labels = ['$1-$10', '$10-$100', '$100-$1k', '$1k-$10k', '$10k-$100k', '$100k-$1M', '$1M+']
data['Goal Bins (in US$)'] = pd.cut(data['goal_usd_static'], bins, labels=goal_labels)

# Count and percentage of projects per bin; `bins` is rebound from the edge list
# to this summary table
goal_bin_col = data['Goal Bins (in US$)']
bins = pd.concat(
    [goal_bin_col.value_counts(), goal_bin_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(bins.sort_index())

### Kickstarter by Average Duration

In [None]:
# Campaign length: the days component of (deadline - launched_at)
data['duration'] = (data['deadline'] - data['launched_at']).dt.days

# Mean / median / min / max duration for each category_name
# (the result keeps the name `duration_by_state`, although the grouping key is the category)
duration_by_state = data.groupby('category_name')['duration'].agg(
    Mean='mean',
    Median='median',
    Minimum='min',
    Maximum='max',
)
print(duration_by_state)

## First Project Descriptive Statistics

In [None]:
# Read the line-delimited JSON file of creators' first projects and show its (rows, columns)
creator_path = '../data/creator_first_project.json'
creator_data = pd.read_json(creator_path, lines=True)
creator_data.shape

### Creators by Launch Year

In [None]:
# Parse the timestamp columns (interpreted with unit='s'), one column at a time
date_columns = ['launched_at', 'created_at', 'deadline']
for date_col in date_columns:
    creator_data[date_col] = pd.to_datetime(creator_data[date_col], unit='s')

# Calendar year of each timestamp
year_columns = ['launched_year', 'created_year', 'deadline_year']
for date_col, year_col in zip(date_columns, year_columns):
    creator_data[year_col] = creator_data[date_col].dt.year

# Creator counts per year, one column per timestamp, aligned on the year index
years = pd.concat(
    [creator_data[year_col].value_counts() for year_col in year_columns],
    axis=1,
    keys=year_columns,
)
years.sort_index()


### Creators by Project State

In [None]:
# Overall number of rows (first projects) in the creator data
total_projects = creator_data.shape[0]
print(f'Total number of projects: {total_projects}')
print('\n')

# Count and percentage share of first projects in each state
creator_state_col = creator_data['state']
projects_by_state = pd.concat(
    [creator_state_col.value_counts(), creator_state_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(projects_by_state)

### Creators by Country

In [None]:
# Count and percentage share of first projects per country
creator_country_col = creator_data['country']
country_data = pd.concat(
    [creator_country_col.value_counts(), creator_country_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(country_data)

### Creators by Category

In [None]:
# Count and percentage share of first projects per category
creator_category_col = creator_data['category_name']
category_data = pd.concat(
    [creator_category_col.value_counts(), creator_category_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(category_data)

### Creators by Dollar Goal Bins

In [None]:
# Convert pledged and goal amounts with each row's static_usd_rate
creator_data['pledged_usd_static'] = creator_data['pledged'].mul(creator_data['static_usd_rate'])
creator_data['goal_usd_static'] = creator_data['goal'].mul(creator_data['static_usd_rate'])

# Edges of the goal-size bins and the label attached to each interval
bins = [0, 10, 100, 1000, 10000, 100000, 1000000, np.inf]
goal_labels = ['$1-$10', '$10-$100', '$100-$1k', '$1k-$10k', '$10k-$100k', '$100k-$1M', '$1M+']
creator_data['Goal Bins (in US$)'] = pd.cut(creator_data['goal_usd_static'], bins, labels=goal_labels)

# Count and percentage of first projects per bin; `bins` is rebound from the
# edge list to this summary table
creator_goal_bin_col = creator_data['Goal Bins (in US$)']
bins = pd.concat(
    [creator_goal_bin_col.value_counts(), creator_goal_bin_col.value_counts(normalize=True) * 100],
    axis=1,
    keys=['Total', 'Percentage'],
)
print(bins.sort_index())

### Creators by Average Duration

In [None]:
# Campaign length: the days component of (deadline - launched_at)
creator_data['duration'] = (creator_data['deadline'] - creator_data['launched_at']).dt.days

# Mean / median / min / max duration for each category_name
# (the result keeps the name `duration_by_state`, although the grouping key is the category)
duration_by_state = creator_data.groupby('category_name')['duration'].agg(
    Mean='mean',
    Median='median',
    Minimum='min',
    Maximum='max',
)
print(duration_by_state)

## Regression Discontinuity Descriptive Statistics

Create Running and Treated Variables

In [None]:
# Create Running Variable: goal minus pledged, both converted with static_usd_rate.
# Positive = pledged less than the goal; zero or negative = goal met or exceeded.
creator_data['dollars_to_goal'] = creator_data['goal_usd_static'] - creator_data['pledged_usd_static']

# Create Treated Variable: 1 when the project reached its goal (pledged >= goal), else 0.
# With the sign convention above the funded side of the cutoff is `<= 0`;
# testing `>= 0` would mark the projects that fell short of their goal as treated.
# Rows with a missing amount compare False and therefore get 0.
creator_data['treated'] = (creator_data['dollars_to_goal'] <= 0).astype(int)

Create Dummy for Country

In [None]:
# US dummy: 1 where country equals 'US', 0 for every other (or missing) country
creator_data['us'] = creator_data['country'].eq('US').astype(int)


Fill NAs for Staff Pick

In [None]:
# Replace NAs in staff pick with 0, then cast to int so the column is a plain 0/1
# dummy (like `us`) that can be averaged in the summary statistics.
# Filling alone can leave an object column mixing True/False with 0.
# NOTE(review): assumes staff_pick holds booleans or 0/1 values — confirm.
creator_data['staff_pick'] = creator_data['staff_pick'].fillna(0).astype(int)

Variables for Summary Statistics

In [None]:
# Columns summarised in the treated-vs-control tables
sum_vars = [
    'us',
    'staff_pick',
    'backers_count',
    'duration',
    'goal_usd_static',
    'pledged_usd_static',
]

## Summary Statistics by Treated

### Treated vs Control Statistics

In [None]:
# Count, mean, standard deviation and median of every summary variable, split by
# the `treated` flag and transposed so each (variable, statistic) pair is a row.
# The statistics are passed as a list rather than a set: a set has no defined
# order, so the row order of the table could change from one run to the next.
treated_statistics = creator_data.groupby('treated')[sum_vars].agg(['count', 'mean', 'std', 'median']).T
treated_statistics

### Distribution of Goal Amount by Treated

In [None]:
# One histogram of goal amounts per goal bin, split by the `treated` flag.
# Iterating over the categorical's categories (instead of .unique()) draws the
# bins in their defined order and never yields NaN, which .unique() returns for
# goals that fall outside every bin and which produced an empty "nan" figure.
goal_bin_col = creator_data['Goal Bins (in US$)']
for bin_range in goal_bin_col.cat.categories:
    bin_data = creator_data[goal_bin_col == bin_range]

    # Nothing to draw for a bin that no project falls into
    if bin_data.empty:
        continue

    # Plot distribution of Goal Amount by Treated for the current bin
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=bin_data, x='goal_usd_static', hue='treated', bins=10, kde=False, ax=ax)
    ax.set_title(f'Distribution of Goal Amount by Treated: {bin_range}')
    ax.set_xlabel('Goal Amount (in US$)')
    ax.set_ylabel('Number of Projects')
    plt.show()

### Distribution of Entrepreneurs near cutoff

In [None]:
# Pledged minus goal (both converted with static_usd_rate): positive when the goal was exceeded
creator_data['dollars_to_goal_usd'] = creator_data['pledged_usd_static'].sub(creator_data['goal_usd_static'])

In [None]:
# Bin edges and labels for dollars_to_goal_usd; the middle bin spans -10k to 10k
# and therefore contains the cutoff at 0
dollars_to_goal_bins = [-np.inf, -1000000, -100000, -10000, 10000, 100000, 1000000, np.inf]
dollars_to_goal_labels = ['-1M+', '-1M-$100k', '-$100k-$10k', '-$10k-$10k', '$10k-$100k', '$100k-$1M', '$1M+']
creator_data['Dollars to Goal Bins (in US$)'] = pd.cut(
    creator_data['dollars_to_goal_usd'],
    bins=dollars_to_goal_bins,
    labels=dollars_to_goal_labels,
)

# Bin edges and labels for the campaign duration in days
duration_bins = [0, 30, 60, 90, np.inf]
duration_labels = ['0-30', '30-60', '60-90', '90+']
creator_data['Duration Bins'] = pd.cut(
    creator_data['duration'],
    bins=duration_bins,
    labels=duration_labels,
)

# Create a figure with one subplot per dollars-to-goal bin, two per row.
# The grid is derived from the number of bins instead of a hard-coded 4x2, and
# bins are taken from the categorical's categories so every bin keeps a fixed
# position and NaN (rows outside every bin) is never treated as a bin.
bin_column = 'Dollars to Goal Bins (in US$)'
bin_labels = list(creator_data[bin_column].cat.categories)
num_bins = len(bin_labels)  # Number of bins
num_cols = 2
num_rows = max(1, -(-num_bins // num_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5 * num_rows), squeeze=False)
axes = axes.flatten()  # Flatten the 2D array of axes for easier indexing

# Loop through each bin and plot
for i, bin_range in enumerate(bin_labels):
    bin_data = creator_data[creator_data[bin_column] == bin_range]

    # Hide the panel of a bin that contains no projects instead of drawing an empty histogram
    if bin_data.empty:
        axes[i].set_visible(False)
        continue

    # Plot on the corresponding subplot, stacking the bars by goal-size bin
    sns.histplot(
        data=bin_data,
        x='dollars_to_goal_usd',
        bins=20,
        kde=False,
        hue='Goal Bins (in US$)',
        multiple='stack',
        ax=axes[i]
    )
    axes[i].set_title(f'Distribution of Dollars to Goal: {bin_range}')
    axes[i].set_xlabel('Dollars to Goal Amount (in US$)')
    axes[i].set_ylabel('Number of Projects')

# Hide grid cells left over when the number of bins does not fill the last row
for unused_ax in axes[num_bins:]:
    unused_ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()