In [None]:
#Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning raised through the `warnings` module for the rest of the
# session. NOTE(review): this also hides library deprecation and pandas
# chained-assignment warnings, so real problems in later cells will not be shown.
warnings.filterwarnings('ignore')

#### Objective: Build a model to predict the State of a Kickstarter project, which can be one of two classes: "Failed" or "Successful".

### Step 1: Data Collection & Initial Cleaning

In [None]:
# Load the raw Kickstarter export from the data directory next to this notebook
raw_csv_path = '../data/kickstarter_projects.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Basic overview: (rows, columns) first, then the per-column dtype / non-null summary
print(df.shape)
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

In [None]:
# Lower-case every column label so later cells can rely on one naming convention
lowercased_labels = df.columns.str.lower()
df.columns = lowercased_labels
print("Column names:", df.columns.tolist())

In [None]:
# Count rows that are exact repeats of an earlier row
duplicate_row_count = df.duplicated().sum()
print("Number of duplicate rows:", duplicate_row_count)

In [None]:
# Percentage share of each project state in the full dataset (before filtering)
state_dist = df['state'].value_counts(normalize=True).mul(100)
print("Initial state distribution (%):\n", state_dist.round(2))

In [None]:
# Keep only the two outcome classes the model will predict.
# .copy() makes the result an independent DataFrame: later cells add columns to
# `df` (parsed dates, duration), and assigning into a boolean-filtered selection
# of another frame is what triggers pandas' SettingWithCopyWarning.
target_states = ['Successful', 'Failed']
df = df[df['state'].isin(target_states)].copy()
print("After filtering:", df.shape)
print("Filtered state distribution (%):\n", round(df['state'].value_counts(normalize=True) * 100, 2))

### Step 2: Exploratory Data Analysis (EDA)

In [None]:
# 1. State distribution: number of projects per outcome
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, x='state', ax=ax)
ax.set_title('Distribution of Kickstarter Project States')
ax.set_xlabel('Project State')
ax.set_ylabel('Number of Projects')
plt.show()

# 2. Goal by State: spread of funding goals for each outcome
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='state', y='goal', ax=ax)
ax.set_title('Goal Amount by Project State')
plt.show()

# 3. Category vs State: outcome counts per category, most frequent category first
categories_by_frequency = df['category'].value_counts().index
fig, ax = plt.subplots(figsize=(14, 6))
sns.countplot(data=df, x='category', hue='state', order=categories_by_frequency, ax=ax)
ax.set_title('Project State by Category')
# Slant the category labels so long names do not overlap
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Duration feature engineering:
# parse both timestamp columns, then take the whole-day component of their difference
for date_col in ('launched', 'deadline'):
    df[date_col] = pd.to_datetime(df[date_col])

campaign_length = df['deadline'] - df['launched']
df['duration'] = campaign_length.dt.days

print("Campaign Duration Summary:\n", df['duration'].describe().round(2))


In [None]:
# Duration by State: compare campaign lengths across the two outcomes
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x='state', y='duration', ax=ax)
ax.set_title('Campaign Duration by Project State')
plt.show()

In [None]:
# Descriptive statistics for the goal, pledged and backers columns
summary_columns = ['goal', 'pledged', 'backers']
print("Summary statistics for goal, pledged, and backers:")
print(df[summary_columns].describe())

In [None]:
# Data-quality check: count rows with a non-positive goal and rows with a zero-day duration
nonpositive_goal_mask = df['goal'] <= 0
zero_duration_mask = df['duration'] == 0
print("Rows with goal <= 0:", len(df[nonpositive_goal_mask]))
print("Rows with duration == 0:", len(df[zero_duration_mask]))

In [None]:
# Keep only rows whose goal is strictly positive and whose duration is not zero
has_positive_goal = df['goal'].gt(0)
has_nonzero_duration = df['duration'].ne(0)
df = df.loc[has_positive_goal & has_nonzero_duration]

# Report the frame's dimensions after the filter
print("New shape after removing invalid rows:", df.shape)


In [None]:
# Persist the cleaned frame to disk, leaving the row index out of the file
cleaned_csv_path = '../data/kickstarter_common.csv'
df.to_csv(cleaned_csv_path, index=False)

In [None]:
# Reload from CSV.
# A CSV file carries no dtype information, so 'launched' and 'deadline' (converted
# to datetimes before the save) would come back as plain strings; parse_dates
# restores them as datetime columns so the reloaded frame matches the saved one.
df = pd.read_csv('../data/kickstarter_common.csv', parse_dates=['launched', 'deadline'])
print("Reloaded shape:", df.shape)
print("Reloaded state distribution (%):\n", round(df['state'].value_counts(normalize=True) * 100, 2))

In [None]:
# Preview the first five rows of the reloaded frame (head() defaults to n=5)
df.head()