# Load and Inspect the Data

In [None]:
#Imports
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this hides *all* warnings, including pandas deprecation and
# chained-assignment notices - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Load the raw Kickstarter export (path is relative to the notebook folder).
df = pd.read_csv('../data/kickstarter_projects.csv')

# Lower-case every column label so later cells can rely on one spelling.
df.columns = [col.lower() for col in df.columns]

# First look at the data: sample rows, dtypes / non-null counts, and summary
# statistics of the numeric columns.
print(df.head())
df.info()  # info() prints its report itself and returns None, so no print()
print(df.describe())

## Key Questions to Explore
### Project Success Rate:

In [None]:
# Number of projects per final state, drawn as a pie chart with the share of
# each state printed to one decimal place.
state_counts = df['state'].value_counts()
plt.gca().pie(
    state_counts,
    labels=state_counts.index,
    autopct='%1.1f%%',
)
plt.gca().set_title('Project State Distribution')
plt.show()

### Funding Goals vs. Pledged Amounts:

* Use logarithmic scales due to wide ranges; look for patterns in successful projects.

In [None]:
# Funding goal against pledged amount, one point per project, coloured by the
# project's final state.
sns.scatterplot(data=df, x='goal', y='pledged', hue='state')

# Both axes are logarithmic so the wide range of amounts stays readable.
current_axes = plt.gca()
current_axes.set_xscale('log')
current_axes.set_yscale('log')
current_axes.set_title('Goal vs. Pledged by State')
plt.show()

### Category Insights:

In [None]:
# Projects per category, and how many of those were successful.
total_by_category = df['category'].value_counts()
# Re-index onto every category so that a category without a single successful
# project counts as 0 instead of disappearing (which would make its rate NaN).
success_by_category = (
    df.loc[df['state'] == 'Successful', 'category']
    .value_counts()
    .reindex(total_by_category.index, fill_value=0)
)

# Share of successful projects per category, best-performing first.
success_rate = (success_by_category / total_by_category).sort_values(ascending=False)
success_rate.plot(kind='bar', title='Success Rate by Main Category')
plt.show()

### Time Trends:

In [None]:
# Parse the launch timestamp and keep its calendar year as a separate column.
df['launched'] = pd.to_datetime(df['launched'])
df['year'] = df['launched'].dt.year

# Number of projects launched per year, split by final state.
sns.countplot(data=df, x='year', hue='state')
plt.gca().set_title('Projects by Year and State')
plt.show()

In [None]:
# Show the current column labels as the cell output (a quick reference for
# the names used in the preparation steps that follow).
df.columns

## Step 1: Data Preparation
First, let’s clean and prepare the data to focus on success factors. We keep working with the `kickstarter_projects.csv` data loaded above (its column names are already lower-cased); here’s how to start:

* Notes:
  * We drop `pledged` because it’s a result of success, not a predictor. We’ll use `goal` instead.
  * If there are missing values (e.g., in `goal`), we might drop those rows or impute them later.

In [None]:
# Keep only projects with a definitive outcome: 'Successful' or 'Failed'
# (drops live, canceled, suspended, ... projects).
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the original data
# (the pattern pandas reports as SettingWithCopyWarning).
df = df[df['state'].isin(['Successful', 'Failed'])].copy()

In [None]:
# Parse both campaign timestamps, then store the campaign length in whole
# days (deadline minus launch).
df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])
df['duration_days'] = df['deadline'].sub(df['launched']).dt.days

In [None]:
# Binary target column: 1 when the project's state is 'Successful', else 0.
df['success'] = df['state'].eq('Successful').astype(int)

In [None]:
# Remove columns that are not used as predictors: the identifiers 'id' and
# 'name', the raw 'state' (now encoded in `success`), and 'pledged', which is
# an outcome of the campaign rather than an input to it.
df = df.drop(['id', 'name', 'state', 'pledged'], axis=1)

# Number of missing values in each remaining column.
print(df.isna().sum())

## Step 2: Feature Exploration
Let’s examine key features that might influence success: goal, category, main_category, country, currency, duration_days, and launched (time-based factors). We’ll visualize and analyze each.

a. Funding Goal (goal)
Higher goals might be harder to achieve. Let’s compare distributions:

In [None]:
# Column dtypes and non-null counts of the prepared frame.
df.info()
# Peek at the target. The raw `state` column was dropped during data
# preparation (df['state'] would raise KeyError); `success` is its binary
# replacement.
df['success'].head(5)

In [None]:
# Distribution of the funding goal for failed (0) vs. successful (1) projects.
plt.figure(figsize=(10, 6))
# showfliers=False actually hides the outlier markers, as the original comment
# intended (same option the goal-by-category boxplot uses).
sns.boxplot(data=df, x='success', y='goal', showfliers=False)  # Exclude outliers for clarity
plt.yscale('log')  # goals span several orders of magnitude
plt.title('USD Goal Distribution by Success')
plt.show()

# Median goals by success
print(df.groupby('success')['goal'].median())

b. Category (category)
Some categories might inherently perform better:

In [None]:
# The mean of the 0/1 `success` flag within a category is that category's
# success rate; sort so the best-performing categories come first.
success_rate = (
    df.groupby('category')['success']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(12, 6))
sns.barplot(x=success_rate.index, y=success_rate.values)
plt.xticks(rotation=45)
plt.ylabel('Success Rate')
plt.title('Success Rate by Main Category')
plt.show()

# Same figures as a table.
print(success_rate)

Expectation: Categories like Music or Art might have higher success rates due to lower goals or broader appeal.

c. Campaign Duration (duration_days)
Longer campaigns might signal uncertainty, or shorter ones might create urgency:

In [None]:
# Overlaid histograms (30 bins, semi-transparent) of campaign length for
# failed (0) and successful (1) projects.
plt.figure(figsize=(10, 6))
sns.histplot(df, x='duration_days', hue='success', bins=30, alpha=0.5)
plt.gca().set_title('Campaign Duration by Success')
plt.show()

# Typical (median) campaign length for each outcome.
print(df['duration_days'].groupby(df['success']).median())

Expectation: Successful projects might favor shorter durations (e.g., 30 days).

d. Country (country)
Geographic differences could reflect market size or crowdfunding culture:

In [None]:
# Share of successful projects per country, highest first.
success_rate_country = (
    df.groupby('country')['success']
    .mean()
    .sort_values(ascending=False)
)

# Horizontal bars: one row per country, bar length = success rate.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=success_rate_country.values,
    y=success_rate_country.index,
    orient='h',
)
plt.gca().set_title('Success Rate by Country')
plt.show()

print(success_rate_country)

e. Launch Timing (launched)
Seasonality or year might matter:

In [None]:
# Calendar month and year of the launch date as separate feature columns.
df['launch_month'] = df['launched'].dt.month
df['launch_year'] = df['launched'].dt.year

# Bar height is the mean of the 0/1 `success` flag per launch month.
plt.figure(figsize=(10, 6))
sns.barplot(data=df, x='launch_month', y='success')
plt.gca().set_title('Success Rate by Launch Month')
plt.show()

# Months ranked by success rate, best first.
print(
    df['success']
    .groupby(df['launch_month'])
    .mean()
    .sort_values(ascending=False)
)

## Step 3: Correlation and Feature Importance
To quantify importance, let’s use a simple machine learning model (e.g., Random Forest) to rank features. First, encode categorical variables:

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report

# Select features
features = ['goal', 'category', 'country', 'duration_days', 'launch_month', 'launch_year']
X = df[features].copy()
y = df['success']

# Encode categorical variables as integer codes.
# Each column gets its own fitted encoder, kept in `encoders`, so codes can be
# mapped back to labels with encoders[col].inverse_transform(...). Re-fitting
# one shared encoder threw away the category mapping once country was encoded.
encoders = {}
for column in ('category', 'country'):
    encoders[column] = LabelEncoder().fit(X[column])
    X[column] = encoders[column].transform(X[column])
# Backward-compatible alias: `le` previously ended up fitted on 'country'.
le = encoders['country']

# Handle any NaN values (simple imputation with the column median)
X = X.fillna(X.median())

# Train/test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict on test set
y_pred = rf.predict(X_test)

# Feature importance, largest first, as a horizontal bar chart and a table
importances = pd.Series(rf.feature_importances_, index=features).sort_values(ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x=importances.values, y=importances.index)
plt.title('Feature Importance for Project Success')
plt.show()
print(importances)


# Calculate metrics on the held-out test set (positive class = success == 1)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print results
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-Score: {f1:.3f}")
print(f"Accuracy: {accuracy:.3f}")

# Detailed classification report (label 0 -> 'Failed', label 1 -> 'Successful')
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Failed', 'Successful']))

##  Digging Deeper

* Duration: Bucket into ranges (<20, 20-40, >40 days) to check if extremes matter:

In [None]:
# Bucket campaign length into short / medium / long.
# include_lowest=True keeps 0-day campaigns in the first bucket, and the open
# upper edge (np.inf) makes the last bucket truly "40+"; with bins ending at
# 100 and an open lower edge, both extremes silently became NaN and dropped
# out of the plot.
df['duration_bucket'] = pd.cut(
    df['duration_days'],
    bins=[0, 20, 40, np.inf],
    labels=['Short (0-20)', 'Medium (20-40)', 'Long(40+)'],
    include_lowest=True,
)
# Bar height is the mean of the 0/1 `success` flag within each bucket.
sns.barplot(x='duration_bucket', y='success', data=df)
plt.show()

* Goal + Category: Test if low goals in low-success categories (e.g., Tech) still fail, or if goal trumps all:

In [None]:
# Goal distribution per category, split by outcome (0 = failed,
# 1 = successful); outlier markers are hidden to keep the boxes readable.
sns.boxplot(
    data=df,
    x='category',
    y='goal',
    hue='success',
    showfliers=False,
)
# Vertical tick labels so the category names do not overlap.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Correlation heatmap of the numeric columns.
# `pledged` was dropped during data preparation (df['pledged'] raises
# KeyError), and sns.heatmap needs a 2-D numeric table, which the raw frame
# with its text/datetime columns is not. Plot the pairwise correlation matrix
# of the numeric columns instead.
heatmapdata = df.select_dtypes(include='number').corr()

plt.figure(figsize=(10, 8))
# Fixed -1..1 colour range so the colours are comparable across runs.
sns.heatmap(heatmapdata, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Between Numeric Features')
plt.show()