In [None]:
#Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import matplotlib.pyplot as plt

import seaborn as sns

# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. about
# assigning to a slice) — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Use matplotlib's 'fivethirtyeight' style sheet for all subsequent figures.
plt.style.use('fivethirtyeight')


In [None]:
# Load the raw Kickstarter export.
df = pd.read_csv("../data/kickstarter_projects.csv")

# Convert all column names to lower case
df.columns = df.columns.str.lower()

# Count the number of unique values for each column. Only the last expression
# of a cell is rendered, so the result is computed once and placed last.
unique_counts = df.nunique()
unique_counts

In [None]:
# Tally how often each row-wise combination of missing / present columns occurs.
# This is the cell's only expression, so it is what gets rendered.
df.isna().value_counts()


In [None]:
# Work on an independent copy: plain assignment would only alias `df`, so the
# conversions and derived columns below would silently modify the raw frame too.
df_clean = df.copy()

# Convert appropriate columns to correct data types.
# errors='coerce' turns unparseable entries into NaT / NaN instead of raising.
for date_col in ('launched', 'deadline'):
    df_clean[date_col] = pd.to_datetime(df_clean[date_col], errors='coerce')
for amount_col in ('goal', 'pledged', 'backers'):
    df_clean[amount_col] = pd.to_numeric(df_clean[amount_col], errors='coerce')


# Recalculate pledge_ratio and pledge_backer ratio
# NOTE: a zero denominator does not raise here; pandas yields inf (or NaN for 0/0).
df_clean['pledge_ratio'] = df_clean['pledged'] / df_clean['goal']          # share of the goal that was pledged
df_clean['ave_backer'] = df_clean['pledged'] / df_clean['backers']         # average pledge per backer
df_clean['backer_ratio_on_goal'] = (df_clean['goal']*df_clean['backers']) / df_clean['pledged']


In [None]:
# Preview the first rows of the cleaned frame, including the derived columns.
df_clean.head()


In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called without print() to avoid an extra "None" line in the output.
df_clean.info()

# Summary statistics for every column (numeric and non-numeric alike).
df_clean.describe(include='all')


In [None]:
# Number of missing values per column.
df_clean.isnull().sum()

In [None]:
# Show the rows whose pledge_ratio is NaN.
df_clean[df_clean['pledge_ratio'].isna()]

In [None]:
# value_counts(normalize=True) returns fractions in [0, 1]; scale by 100 so the
# plotted values actually match the "%" unit stated in the axis label.
state_share_pct = df_clean['state'].value_counts(normalize=True) * 100
ax = state_share_pct.plot(kind='barh', title='Project Outcome Distribution')

# Add x-axis label
ax.set_xlabel('Relative Number of Projects in %')

plt.tight_layout()
plt.show()

In [None]:

# Absolute number of projects per outcome, drawn as horizontal bars.
state_counts = df_clean['state'].value_counts(normalize=False)
ax = state_counts.plot(kind='barh', title='Project Outcome Distribution')

# Ticks every 50,000 projects up to (and past) the largest bar, labelled in thousands.
tick_step = 50000
max_value = df_clean['state'].value_counts().max()
ticks = list(range(0, max_value + tick_step, tick_step))
labels = []
for tick in ticks:
    labels.append(str(int(tick / 1000)))

ax.set_xticks(ticks)
ax.set_xticklabels(labels)

# Axis label spells out the unit used by the tick labels.
ax.set_xlabel('Number of Projects (in 1,000s)')

plt.tight_layout()
plt.show()

In [None]:
# Columns to plot: all numeric columns plus the datetime columns.
plotted_dtypes = ['number', 'datetime']
numeric_cols = df_clean.select_dtypes(include=plotted_dtypes).columns

# Drop columns with too many NaNs if necessary (optional)
# numeric_cols = [col for col in numeric_cols if df_clean[col].notna().sum() > 10]

# One histogram panel per selected column.
hist_options = dict(bins=30, figsize=(18, 12), edgecolor='black')
df_clean[numeric_cols].hist(**hist_options)
plt.suptitle("Histograms of Numeric Features", fontsize=16)
plt.tight_layout()
plt.show()

In [None]:
# List the column names currently present in df_clean.
df_clean.columns

In [None]:
# Set up a vertical stack of 7 plots
fig, axs = plt.subplots(7, 1, figsize=(12, 24))
fig.suptitle("Feature Distributions (Log Scale Where Applicable)", fontsize=20)

# Panels that show a log1p-transformed histogram:
# (row in the stack, column name, panel title, number of bins)
log_panels = [
    (0, 'goal', 'log(1 + Goal)', 50),
    (1, 'pledged', 'log(1 + Pledged)', 50),
    (2, 'backers', 'log(1 + Backers)', 50),
    (4, 'pledge_ratio', 'log(1 + Pledge Ratio)', 500),
    (5, 'ave_backer', 'log(1 + Average Pledge per Backer)', 50),
    (6, 'backer_ratio_on_goal', 'log(1 + Backer Ratio on Goal)', 50),
]
for row, column, panel_title, n_bins in log_panels:
    sns.histplot(np.log1p(df_clean[column]), bins=n_bins, ax=axs[row])
    axs[row].set_title(panel_title)
    # The label states the transform that is really applied (log1p) for every
    # panel, including pledge_ratio.
    axs[row].set_xlabel(f'log(1 + {column})')

# Row 3: state (categorical count)
sns.countplot(x='state', data=df_clean, ax=axs[3])
axs[3].set_title('Project States')
axs[3].set_xlabel('State')

# Zoom the pledge_ratio panel onto ratios up to 2.0 (i.e. up to log1p(2.0) on
# the transformed axis).
axs[4].set_xlim(left=1e-3, right=np.log1p(2.0))

# Leave room at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()



In [None]:
# Histogram of log1p(pledged) with outlined bars.
fig, ax = plt.subplots(figsize=(10, 6))
log_pledged = np.log1p(df_clean['pledged'])
sns.histplot(log_pledged, bins=50, edgecolor='black', ax=ax)
ax.set_title("Log-Scaled Distribution of Pledged Amounts")
ax.set_xlabel("log(1 + Pledged Amount)")
ax.set_ylabel("Number of Projects")
plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
sns.heatmap(df_clean.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")

# "pledged" is highly correlated with "backers" - so what is the typical average pledge per backer? I calculated this. I also calculated the backer ratio on goal, to see whether higher goals result in higher pledges.

In [None]:
# Preview the first rows of df_clean.
df_clean.head()

In [None]:
#df_clean = df_clean[df_clean['state'].notna()]

#sns.pairplot(df_clean[['goal', 'pledged', 'backers', 'pledge_ratio', 'ave_backer', 'state']], hue='state')
#plt.suptitle("Pairplot by Project State", y=1.02)

In [None]:
# Median of each selected feature per project state; transposed so that the
# features sit on the x-axis and every state gets its own bar.
median_features = ['pledge_ratio', 'ave_backer', 'goal']
state_medians = df_clean.groupby('state')[median_features].median()
ax = state_medians.T.plot(kind='bar')
ax.set_title("Feature Medians by Project State")

In [None]:
# Goal amount divided by number of backers, compared across project states.
# NOTE(review): rows with zero backers would give inf here — confirm whether they should be excluded.
goal_per_backer = df_clean['goal'] / df_clean['backers']
sns.boxplot(x='state', y=goal_per_backer, data=df_clean)


In [None]:
# Violin plot of the backers column for each project state.
sns.violinplot(x='state', y='backers', data=df_clean)

In [None]:
# Project counts of the ten most frequent categories.
category_totals = df_clean['category'].value_counts()
top_cats = category_totals.head(10)

# Number of successful projects per category.
is_successful = df_clean['state'] == 'Successful'
success_counts = df_clean[is_successful]['category'].value_counts()

# Align the success counts with the top categories before dividing.
aligned_successes = success_counts.reindex(top_cats.index)
success_rate = aligned_successes / top_cats

# Bar chart of the resulting rates.
ax = success_rate.plot(kind='bar', title='Success Rate of Top Categories')
ax.set_ylabel("Success Rate")
plt.tight_layout()
plt.show()

In [None]:
# Campaign length in whole days.
df_clean['duration_days'] = (df_clean['deadline'] - df_clean['launched']).dt.days

# Each chart gets its own figure: a Series `.plot()` without an explicit `ax`
# draws on the current axes, which would put the bars on top of the boxplot.
fig, ax = plt.subplots()
sns.boxplot(x='state', y='duration_days', data=df_clean, ax=ax)
ax.set_title('Campaign Duration by Project State')
plt.show()

# Share of successful projects per country; show the five highest rates.
country_success = df_clean[df_clean['state'] == 'Successful']['country'].value_counts()
country_total = df_clean['country'].value_counts()
fig, ax = plt.subplots()
(country_success / country_total).sort_values(ascending=False).head(5).plot(
    kind='bar', title='Success Rate by Country', ax=ax
)
plt.show()


In [None]:
# Bucket every launch date into its calendar month.
df_clean['launch_month'] = df_clean['launched'].dt.to_period('M')

# Number of launches per month, in chronological order.
launches_per_month = df_clean['launch_month'].value_counts().sort_index()
launches_per_month.plot(kind='line', title='Projects Launched Over Time')

In [None]:
# Recreate launch_month just in case
df_clean['launch_month'] = pd.to_datetime(df_clean['launched'], errors='coerce').dt.to_period('M')

# Filter successful projects and drop rows without a valid launch month.
# The state label is capitalised ('Successful') wherever this notebook compares
# it; the lowercase spelling matched no rows and always hit the fallback message.
successful = df_clean[df_clean['state'] == 'Successful'].copy()
successful = successful[successful['launch_month'].notna()]

# Count successful projects per month
success_per_month = successful['launch_month'].value_counts().sort_index()

# Only plot if data exists
if not success_per_month.empty:
    success_per_month.plot(kind='line', title='Successful Projects Over Time')
    plt.xlabel('Launch Month')
    plt.ylabel('Number of Successful Projects')
    plt.tight_layout()
    plt.show()
else:
    print("No successful projects with valid launch dates found.")

In [None]:
# Median pledged and goal amounts per category, ordered by median pledged.
category_medians = df_clean.groupby('category')[['pledged', 'goal']].median()
category_medians = category_medians.sort_values('pledged')
category_medians.plot(kind='bar', title='Median Pledged/Goal by Category')


In [None]:
# Row-normalised cross-tabulation: for each category, the share of projects in each state.
pivot = pd.crosstab(
    index=df_clean['category'],
    columns=df_clean['state'],
    normalize='index',
)
sns.heatmap(data=pivot, annot=True, cmap='YlGnBu')

In [None]:
# Histogram of log1p(pledged).
fig, ax = plt.subplots(figsize=(10, 6))
log_pledged = np.log1p(df_clean['pledged'])
sns.histplot(log_pledged, bins=50, ax=ax)
ax.set_title("Distribution of Log-Transformed Pledged Amounts")
ax.set_xlabel("log(1 + Pledged Amount)")
ax.set_ylabel("Number of Projects")
plt.tight_layout()
plt.show()

In [None]:
# Backers against log1p(pledge_ratio), coloured by project state.
log_pledge_ratio = np.log1p(df_clean['pledge_ratio'])
sns.scatterplot(
    data=df_clean,
    x='backers',
    y=log_pledge_ratio,
    hue='state',
    alpha=0.5,
)


In [None]:
# Define bins and labels.
# The top bin is open-ended (np.inf) rather than ending at the observed maximum:
# pd.cut requires strictly increasing edges, so using goal.max() would raise
# whenever the largest goal is not above 1,000,000.
goal_bins = [0, 100, 1_000, 10_000, 100_000, 1_000_000, np.inf]
goal_labels = ['<100', '100-1k', '1k–10k', '10k–100k', '100k–1M', '>1M']

# Create new column; include_lowest=True keeps goals of exactly 0 in the first bin.
df_clean['goal_bin'] = pd.cut(df_clean['goal'], bins=goal_bins, labels=goal_labels, include_lowest=True)

# Number of projects per goal range, split by outcome.
sns.countplot(data=df_clean, x='goal_bin', hue='state')
plt.title("Project Outcomes by Goal Range")
plt.xlabel("Goal Range (log steps)")
plt.ylabel("Number of Projects")
plt.tight_layout()
plt.show()

In [None]:
# Share of each outcome within every goal bin (each row sums to 1).
goal_state_dist = pd.crosstab(
    index=df_clean['goal_bin'],
    columns=df_clean['state'],
    normalize='index',
)

# Stacked bars: one bar per goal bin, one segment per state.
ax = goal_state_dist.plot(kind='bar', stacked=True, colormap='viridis')
ax.set_title("Project Outcome Proportion by Goal Bin")
ax.set_xlabel("Goal Range (log steps)")
ax.set_ylabel("Proportion of Outcomes")
# Anchor the legend outside the plot area, to the right.
ax.legend(title='State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# seaborn, matplotlib and pandas are already imported in the notebook's first
# cell, so they are not re-imported here.

# Define log-scaled goal bins. The last bin is open-ended (np.inf): pd.cut needs
# strictly increasing edges, which goal.max() does not guarantee.
goal_bins = [0, 1_000, 10_000, 100_000, 1_000_000, np.inf]
goal_labels = ['<1k', '1k–10k', '10k–100k', '100k–1M', '>1M']

# (Re)create the binned column.
# NOTE: this overwrites any existing 'goal_bin' column with this 5-bin version.
df_clean['goal_bin'] = pd.cut(df_clean['goal'], bins=goal_bins, labels=goal_labels, include_lowest=True)

# Keep rows that have both a pledge ratio and a goal bin, then drop extreme
# pledge ratios (>= 10) so the boxes stay readable.
has_ratio_and_bin = df_clean['pledge_ratio'].notna() & df_clean['goal_bin'].notna()
df_box = df_clean[has_ratio_and_bin]
df_box = df_box[df_box['pledge_ratio'] < 10]  # Filter for visibility

# Create the boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_box, x='goal_bin', y='pledge_ratio', hue='state')
plt.title('Pledge Ratio Distribution by Goal Bin and State')
plt.xlabel('Goal Range (log steps)')
plt.ylabel('Pledge Ratio (Pledged / Goal)')
plt.legend(title='State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# Define goal bins first, so the filter below does not rely on a 'goal_bin'
# column left over from an earlier cell. The last bin is open-ended (np.inf)
# because pd.cut needs strictly increasing edges, which goal.max() does not guarantee.
goal_bins = [0, 1_000, 10_000, 100_000, 1_000_000, np.inf]
goal_labels = ['<1k', '1k–10k', '10k–100k', '100k–1M', '>1M']
df_clean['goal_bin'] = pd.cut(df_clean['goal'], bins=goal_bins, labels=goal_labels, include_lowest=True)

# Filter data and outliers: keep rows with a pledge ratio and a goal bin, and
# cap the ratio at 2 (twice the goal).
df_box = df_clean[df_clean['pledge_ratio'].notna() & df_clean['goal_bin'].notna()]
df_box = df_box[df_box['pledge_ratio'] <= 2]

# Plot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_box, x='goal_bin', y='pledge_ratio', hue='state')
# Set the y-range once, after drawing, so it is not affected by autoscaling.
plt.ylim(0, 2)
plt.title('Pledge Ratio by Goal Bin and State (Capped at 2)')
plt.xlabel('Goal Range (log steps)')
plt.ylabel('Pledge Ratio (Pledged / Goal)')
plt.legend(title='State', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# matplotlib and pandas are already imported in the notebook's first cell, so
# they are not re-imported here.

# Ensure 'launch_month' is correct
df_clean['launch_month'] = pd.to_datetime(df_clean['launched'], errors='coerce').dt.to_period('M')

# Group by month
grouped = df_clean.groupby('launch_month')

# Metrics
total_projects = grouped.size()
total_goal = grouped['goal'].sum()
# Summing boolean flags per month gives the outcome counts without running a
# Python lambda for every group.
successful_projects = df_clean['state'].eq('Successful').groupby(df_clean['launch_month']).sum()
failed_projects = df_clean['state'].eq('Failed').groupby(df_clean['launch_month']).sum()

# Convert PeriodIndex to datetime for plotting
x = total_projects.index.to_timestamp()

# Plot
fig, ax1 = plt.subplots(figsize=(14, 7))

# Left y-axis: project counts
ax1.plot(x, total_projects, label='Total Projects', color='tab:blue')
ax1.plot(x, successful_projects, label='Successful Projects', color='tab:green')
ax1.plot(x, failed_projects, label='Failed Projects', color='tab:red')
ax1.set_ylabel("Number of Projects")
ax1.set_xlabel("Launch Month")

# Right y-axis for total goal amount
ax2 = ax1.twinx()
ax2.plot(x, total_goal, label='Total Goal Amount', color='tab:orange', linestyle='--')
ax2.set_ylabel("Total Goal Amount (Currency)")
ax2.tick_params(axis='y', labelcolor='tab:orange')

# One combined legend covering the lines of both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper center')

plt.title("Kickstarter Project Metrics Over Time")
plt.tight_layout()
plt.show()

In [None]:
###
#duration_days
#pledge_ratio
#ave_backer
#backer_ratio_on_goal
###

#### Objective: Build a model to predict the State of a Kickstarter project, which can be one of two classes: "Failed" or "Successful".

### Step 1: Data Collection  & Initial Cleaning

In [None]:
# Re-read the raw CSV for the modelling section; this rebinds `df` to a fresh,
# unmodified frame (original column names, no derived columns).
df = pd.read_csv('../data/kickstarter_projects.csv')

In [None]:
# Basic Overview
print(df.shape)
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line.
df.info()

In [None]:
# Standardize column names: lower-case every column label so later cells can
# refer to e.g. 'state' and 'goal'.
df.columns = df.columns.str.lower()
print("Column names:", df.columns.tolist())

In [None]:
# Check duplicates: count the rows that are exact repeats of an earlier row.
print("Number of duplicate rows:", df.duplicated().sum())

In [None]:
# Percentage of projects in each state, before any filtering.
state_dist = df['state'].value_counts(normalize=True).mul(100)
print("Initial state distribution (%):\n", state_dist.round(2))

In [None]:
# Filter only for 'Successful' and 'Failed' projects.
# .copy() makes the result an independent frame, so that columns can be added
# to it later without assigning into a slice of the unfiltered data.
df = df[df['state'].isin(['Successful', 'Failed'])].copy()
print("After filtering:", df.shape)
print("Filtered state distribution (%):\n", round(df['state'].value_counts(normalize=True) * 100, 2))

### Step 2: Exploratory Data Analysis (EDA)

In [None]:
# 1. How many projects ended in each state
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(x='state', data=df, ax=ax)
ax.set_title('Distribution of Kickstarter Project States')
ax.set_xlabel('Project State')
ax.set_ylabel('Number of Projects')
plt.show()

# 2. Spread of the goal amount per state
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='state', y='goal', data=df, ax=ax)
ax.set_title('Goal Amount by Project State')
plt.show()

# 3. Outcome counts per category, most frequent category first
category_order = df['category'].value_counts().index
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(x='category', hue='state', data=df, order=category_order, ax=ax)
ax.set_title('Project State by Category')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Duration feature: parse both timestamps, then take the whole-day difference.
for date_col in ('launched', 'deadline'):
    df[date_col] = pd.to_datetime(df[date_col])
campaign_length = df['deadline'] - df['launched']
df['duration'] = campaign_length.dt.days

duration_summary = df['duration'].describe()
print("Campaign Duration Summary:\n", duration_summary.round(2))


In [None]:
# Spread of campaign duration (in days) for each project state.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='state', y='duration', data=df, ax=ax)
ax.set_title('Campaign Duration by Project State')
plt.show()

In [None]:
# Descriptive statistics for the three core numeric columns.
core_numeric = ['goal', 'pledged', 'backers']
numeric_summary = df[core_numeric].describe()
print("Summary statistics for goal, pledged, and backers:")
print(numeric_summary)

In [None]:
# Count rows that look invalid: a non-positive goal or a zero-day campaign.
non_positive_goal = df['goal'] <= 0
zero_duration = df['duration'] == 0
print("Rows with goal <= 0:", int(non_positive_goal.sum()))
print("Rows with duration == 0:", int(zero_duration.sum()))

In [None]:
# Keep only rows with a positive goal and a non-zero duration
# (equivalently: drop rows where goal <= 0 or duration == 0).
has_positive_goal = df['goal'] > 0
has_nonzero_duration = df['duration'] != 0
df = df[has_positive_goal & has_nonzero_duration]

# Report the shape after filtering.
print("New shape after removing invalid rows:", df.shape)


In [None]:
# Save filtered dataset.
# index=False leaves the DataFrame index out of the file.
df.to_csv('../data/kickstarter_common.csv', index=False)

In [None]:
# Reload from CSV.
# CSV stores timestamps as text, so parse the two date columns on load;
# otherwise they would come back as plain strings.
df = pd.read_csv('../data/kickstarter_common.csv', parse_dates=['launched', 'deadline'])
print("Reloaded shape:", df.shape)
print("Reloaded state distribution (%):\n", round(df['state'].value_counts(normalize=True) * 100, 2))

In [None]:
# Preview the first five rows of the reloaded frame.
df.head(5)