# Importing necessary libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import json
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, 
                             f1_score, roc_auc_score, confusion_matrix, 
                             classification_report, roc_curve)
import matplotlib.pyplot as plt
import seaborn as sns


# Data loading and exploration

## Merge the CSV files


The following code block will:
- Find all CSV files in the data directory
- Read each CSV file into a pandas DataFrame
- Combine all DataFrames into one


In [None]:

# Locate the data directory; Path keeps this portable across operating systems
data_path = Path('data')

# Collect every CSV in the directory, sorted so the merge order is reproducible
csv_files = sorted(data_path.glob('*.csv'))

if not csv_files:
    raise ValueError(f"No CSV files found in {data_path}")

print(f"Found {len(csv_files)} CSV files to merge...")

# Read each CSV into its own DataFrame; they are concatenated afterwards.
# The loop variable is deliberately NOT named `df`: a leaked loop variable
# called `df` would silently point at the last file read, and the
# feature-engineering cells further down operate on `df`.
dataframes = []
for csv_file in csv_files:
    print(f"Reading {csv_file.name}...")
    try:
        file_df = pd.read_csv(csv_file)
    except Exception as e:
        # Best effort: report the unreadable file and keep going with the rest
        print(f"Warning: Could not read {csv_file.name}: {e}")
        continue
    dataframes.append(file_df)

if not dataframes:
    raise ValueError("No dataframes were successfully loaded")

# Stack all files into a single DataFrame.
# ignore_index=True builds a fresh 0..n-1 index instead of repeating each file's own index.
print("Merging dataframes...")
merged_df = pd.concat(dataframes, ignore_index=True)

# Working copy for the modelling cells: `merged_df` stays untouched as the raw
# merged data, while `df` receives the engineered feature columns.
df = merged_df.copy()

# Summary of the merged data
print(f"\nMerged DataFrame shape: {merged_df.shape}")
print(f"Total rows: {len(merged_df)}")
print(f"Total columns: {len(merged_df.columns)}")


## Explore the merged data

Let's take a look at the structure and content of our merged DataFrame.


In [None]:
# Preview the first rows of the merged data to sanity-check columns and values.
# head() returns the first 5 rows by default; as the last expression in the
# cell, the returned DataFrame is what the notebook renders as output.
merged_df.head()


In [None]:
# Structural overview of the merged DataFrame.
# info() prints column names, data types, and non-null counts, which makes
# columns with missing values easy to spot.
merged_df.info()


In [None]:
# Summary statistics for the numeric columns.
# describe() reports count, mean, std, min, max, and quartiles per column;
# non-numeric columns are left out by default.
merged_df.describe()


# Classification Model

This section of the notebook builds a classification model to predict whether a Kickstarter project will be "successful" or "failed" using only features available at project launch.


## Techniques and Algorithms Used


1. Random Forest Classifier
   - Bagged ensemble of decision trees
   - Each tree uses random sample with replacement
   - Final prediction averages the predicted class probabilities across trees

2. GridSearchCV for Hyperparameter Tuning
   - Examines all possible combinations of specified hyperparameters
   - Uses cross-validation to find optimal parameters

3. 5-Fold Cross-Validation
   - Splits training data into k parts
   - Each fold serves as validation data once

4. One-Hot Encoding
   - Encodes categorical variables with boolean variables (0/1)

5. Classification Metrics
   - Confusion Matrix, Accuracy, Precision, Recall, F1, ROC-AUC

6. Feature Importance
   - Understanding which features are most impactful

7. Train-Test Split
   - 80/20 split with stratification

## Feature Engineering

Only features available at project LAUNCH are used. This ensures the model can make predictions when a project is launched, not after it has already received funding.

Excluded FEATURES (NOT available at launch):
- `backers_count`: Number of backers (only known after launch)
- `pledged`: Amount pledged (only known after launch)
- `staff_pick`: Whether Kickstarter staff featured the project (selected after launch)
- `percent_funded`: Funding percentage (only known after launch)
- `usd_pledged`: USD pledged (only known after launch)
- `spotlight`: Spotlight status (assigned after campaign ends)
- `state_changed_at`: When state changed (after campaign ends)

In [None]:
# 1. Extract parent category from JSON
def extract_parent_category(cat_str):
    """Extract the parent category from the nested JSON category field.

    Args:
        cat_str: JSON text of the category object. It may still carry
            CSV-style doubled quotes (``""`` standing for ``"``).

    Returns:
        The value stored under ``'parent_name'``, or ``'Unknown'`` when the
        input is not a string, cannot be parsed as a JSON object, or has no
        ``'parent_name'`` key.
    """
    # Missing cells arrive as NaN/None rather than text
    if not isinstance(cat_str, str):
        return 'Unknown'

    # Parse the text as-is first. Collapsing doubled quotes unconditionally
    # would corrupt valid JSON containing an empty string value ("slug": ""),
    # so the de-doubled form is only a fallback.
    for candidate in (cat_str, cat_str.replace('""', '"')):
        try:
            cat_dict = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        # Valid JSON that is not an object (list, number, ...) has no parent name
        if isinstance(cat_dict, dict):
            return cat_dict.get('parent_name', 'Unknown')

    return 'Unknown'

# Derive the parent category for every row; category values that cannot be
# parsed end up as 'Unknown'.
df['parent_category'] = df['category'].apply(extract_parent_category)

In [None]:
# Keep only campaigns with a definitive outcome. The target below encodes
# 1 = successful and 0 = failed, so rows in any other state would otherwise be
# mislabeled as failures. This is a no-op when the data holds only those two states.
outcome_mask = df['state'].isin(['successful', 'failed'])
n_dropped = int((~outcome_mask).sum())
if n_dropped:
    print(f"Dropping {n_dropped} rows whose state is neither 'successful' nor 'failed'")
df = df.loc[outcome_mask].copy()

# 2. Campaign duration (in days)
# NOTE(review): assumes `deadline` and `launched_at` are Unix timestamps in
# seconds; the division by 86,400 only yields days in that case - confirm.
seconds_per_day = 60 * 60 * 24
df['campaign_duration_days'] = (df['deadline'] - df['launched_at']) / seconds_per_day

# 3. Has video (binary: 1 if the `video` field is populated, 0 otherwise)
df['has_video'] = df['video'].notna().astype(int)

# 4. Blurb length (character count of project description)
# astype(str) keeps this working even if a cell was parsed as a number
df['blurb_length'] = df['blurb'].fillna('').astype(str).str.len()

# 5. Name length (character count of project name)
df['name_length'] = df['name'].fillna('').astype(str).str.len()

# 6. Goal in USD (goal in original currency times the stored USD rate)
df['goal_usd'] = df['goal'] * df['static_usd_rate']

# 7. Is US-based project
df['is_us'] = (df['country'] == 'US').astype(int)

# 8. Log transform of goal (to handle right-skewed distribution);
# log1p keeps a goal of 0 finite
df['log_goal_usd'] = np.log1p(df['goal_usd'])

# Create target variable (1 = successful, 0 = failed)
df['target'] = (df['state'] == 'successful').astype(int)

print("\nFeatures Created (All available at launch):")
print("-" * 50)
print("1. log_goal_usd:         Log of funding goal in USD")
print("2. campaign_duration_days: Length of campaign in days")
print("3. blurb_length:         Character count of description")
print("4. name_length:          Character count of project name")
print("5. has_video:            Whether project has a video (0/1)")
print("6. is_us:                Whether US-based project (0/1)")
print("7. parent_category:      Main category (one-hot encoded)")

## Prepare Features and Target

In [None]:
# Continuous inputs: goal (log scale), campaign length, description and name lengths
numerical_features = ['log_goal_usd', 'campaign_duration_days', 'blurb_length', 'name_length']

# 0/1 indicator inputs: video present, US-based project
binary_features = ['has_video', 'is_us']

# Inputs that are expanded into one-hot columns
categorical_features = ['parent_category']

# Build each group of columns separately
X_num = df.loc[:, numerical_features].copy()
X_bin = df.loc[:, binary_features].astype(int)
# drop_first removes one dummy level per categorical to avoid multicollinearity
X_cat = pd.get_dummies(df[categorical_features], drop_first=True)

# Assemble the full design matrix (column-wise) and pick out the target
X = pd.concat([X_num, X_bin, X_cat], axis=1)
y = df['target']

n_samples, n_features = X.shape
print(f"\nFeature Matrix Shape: {X.shape}")
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print("\nFeature names:")
for position, column_name in enumerate(X.columns, start=1):
    print(f"  {position:2d}. {column_name}")

## Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the share of
# successful/failed projects the same in both partitions, and the fixed
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

total_rows = len(X)
train_rows = X_train.shape[0]
test_rows = X_test.shape[0]
print(f"\nTraining set: {train_rows} samples ({train_rows/total_rows*100:.1f}%)")
print(f"Test set:     {test_rows} samples ({test_rows/total_rows*100:.1f}%)")

# Class balance of the training partition
print("\nTraining class distribution:")
for class_label, class_value in (("Successful: ", 1), ("Failed:     ", 0)):
    in_class = y_train == class_value
    print(f"  {class_label}{in_class.sum()} ({in_class.mean()*100:.1f}%)")


## Baseline Model Comparison

### Logistic Regression

In [None]:
from sklearn.pipeline import make_pipeline

# Scaled copies of the train/test matrices. The scaler is fitted on the
# training rows only and then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(random_state=42, max_iter=1000)

# For cross-validation the scaler must be re-fitted inside every fold.
# Cross-validating on X_train_scaled would let each validation fold influence
# the scaling statistics (data leakage), so the scaling step is wrapped in a
# pipeline and the unscaled training matrix is passed in.
lr_pipeline = make_pipeline(StandardScaler(), lr)
lr_cv_scores = cross_val_score(lr_pipeline, X_train, y_train, cv=5, scoring='roc_auc')
print(f"1. Logistic Regression:")
print(f"   CV AUC: {lr_cv_scores.mean():.4f} (+/- {lr_cv_scores.std():.4f})")

### Decision Tree

In [None]:
# Shallow decision tree baseline, scored with 5-fold cross-validated ROC-AUC
# on the unscaled training features.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_cv_scores = cross_val_score(dt, X_train, y_train, scoring='roc_auc', cv=5)
dt_auc_mean = dt_cv_scores.mean()
dt_auc_std = dt_cv_scores.std()
print("\n2. Decision Tree (max_depth=5):")
print(f"   CV AUC: {dt_auc_mean:.4f} (+/- {dt_auc_std:.4f})")

### Random Forest

In [None]:
# Random forest baseline with 100 trees, scored with 5-fold cross-validated
# ROC-AUC on the unscaled training features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cv_scores = cross_val_score(rf, X_train, y_train, scoring='roc_auc', cv=5)
rf_auc_mean = rf_cv_scores.mean()
rf_auc_std = rf_cv_scores.std()
print("\n3. Random Forest (n_estimators=100):")
print(f"   CV AUC: {rf_auc_mean:.4f} (+/- {rf_auc_std:.4f})")

### Model Comparison

In [None]:
# Mean cross-validated ROC-AUC per baseline model
models = dict(zip(
    ('Logistic Regression', 'Decision Tree', 'Random Forest'),
    (lr_cv_scores.mean(), dt_cv_scores.mean(), rf_cv_scores.mean()),
))

# Print the leaderboard, highest score first
ranked_names = sorted(models, key=models.get, reverse=True)
for model_name in ranked_names:
    score = models[model_name]
    print(f"{model_name:25s}: {score:.4f}")

# `best_model` holds the NAME of the winning model, not an estimator object
best_model = max(models, key=models.get)
best_score = models[best_model]

rule_width = 60
print("\n" + "-" * rule_width)
print(f"BEST MODEL: {best_model}")
print(f"Best CV AUC Score: {best_score:.4f}")
print("=" * rule_width)


> Random Forest is selected for hyperparameter tuning as it achieved the highest cross-validation ROC-AUC score (0.8599) among all three models. Additionally, Random Forest can capture non-linear relationships in the data through its ensemble of decision trees, making it well-suited for this classification task.

## Hyperparameter Tuning (GridSearchCV)

Using GridSearchCV for hyperparameter optimization

Hyperparameters being tuned:
- `n_estimators`: Number of trees in the forest
- `max_depth`: Maximum depth of each tree
- `min_samples_split`: Minimum samples required to split a node
- `min_samples_leaf`: Minimum samples required at a leaf node

In [None]:
# Define parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Derive the number of combinations from the grid itself so the printed
# figure stays correct whenever the grid is edited.
grid_sizes = [len(values) for values in param_grid.values()]
n_combinations = int(np.prod(grid_sizes))

print(f"Parameter grid: {param_grid}")
print(f"Total combinations to test: {' * '.join(map(str, grid_sizes))} = {n_combinations}")

# Fresh, untuned forest; GridSearchCV sets the hyperparameters per candidate
rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='roc_auc',
    n_jobs=-1,  # Use all available cores
    verbose=1
)

# Exhaustive search: every combination is cross-validated on the training set
grid_search.fit(X_train, y_train)

print(f"\nBest Parameters Found:")
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")
print(f"\nBest Cross-Validation AUC: {grid_search.best_score_:.4f}")

## Final Model Training and Evaluation

In [None]:
# Take the estimator selected by the grid search as the final model
best_rf = grid_search.best_estimator_

# Hard class predictions and positive-class probabilities for the held-out test set
y_pred = best_rf.predict(X_test)
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]

# 1. Confusion matrix: rows are actual classes, columns are predicted classes
print("\n1. CONFUSION MATRIX:")
print("-" * 40)
cm = confusion_matrix(y_test, y_pred)
print("                    Predicted")
print("                  Failed  Successful")
for row_label, row_idx in (("Actual Failed      ", 0), ("Actual Successful  ", 1)):
    print(f"{row_label}{cm[row_idx, 0]:4d}      {cm[row_idx, 1]:4d}")

# 2. Scalar metrics; ROC-AUC uses the probabilities, the others the hard predictions
print("\n2. CLASSIFICATION METRICS:")
print("-" * 40)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

metric_rows = (
    ("Accuracy:   ", accuracy, "% of total correctly classified"),
    ("Precision:  ", precision, "% of predicted successes that were correct"),
    ("Recall:     ", recall, "% of actual successes correctly predicted"),
    ("F1 Score:   ", f1, "harmonic mean of precision and recall"),
    ("ROC-AUC:    ", roc_auc, "probability ranking performance"),
)
for metric_label, metric_value, metric_meaning in metric_rows:
    print(f"{metric_label}{metric_value:.4f}  ({metric_meaning})")

# 3. Per-class precision/recall/F1 table
print("\n3. CLASSIFICATION REPORT:")
print("-" * 40)
report_text = classification_report(y_test, y_pred, target_names=['Failed', 'Successful'])
print(report_text)

## Feature Importance Analysis

In [None]:
# Pair every feature column with the importance reported by the tuned forest,
# then order the table from most to least important.
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': best_rf.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

print("\nFeature Importance Ranking:")
print("-" * 50)
ranked_pairs = zip(feature_importance['feature'], feature_importance['importance'])
for rank, (feature_name, importance_value) in enumerate(ranked_pairs, start=1):
    print(f"{rank:2d}. {feature_name:35s} {importance_value:.4f}")


## Model Interpretation

### Model Logic Explanation

The Random Forest classifier works by building multiple decision trees, each trained on a bootstrap sample of the data and considering a random subset of features at each split. Each tree outputs
class probabilities, and the final prediction is the class with the highest mean probability across all trees (soft voting, rather than a hard majority vote). This ensemble approach reduces overfitting and improves generalization.

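### Key Findings From Feature Importance

> Note: feature importances measure how much each feature contributes to the model's splits, not the direction of its effect. The directional statements below are interpretations and should be confirmed against the data (e.g., success rates by feature value or partial dependence plots).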

#### 1. Goal Amount (`log_goal_usd`) - MOST IMPORTANT (~22%)
   - Lower funding goals are more likely to succeed
   - Very high goals make projects harder to fully fund
   - The LOG transformation helps handle the wide range of goal values
     (from hundreds to millions of dollars)
   - Business insight: Creators should set realistic, achievable goals

#### 2. Campaign Duration (`campaign_duration_days`) - SECOND MOST IMPORTANT (~17%)
   - Campaign length significantly affects funding success
   - Too short campaigns may not allow enough time to build momentum
   - Too long campaigns may lose urgency and backer interest
   - Business insight: Optimal campaign length balances urgency with reach

#### 3. Project Name & Description (`name_length` ~12%, `blurb_length` ~11%)
   - Together these account for ~23% of predictive power
   - Longer, more descriptive project names attract attention
   - Detailed blurbs provide backers with necessary information
   - Business insight: Invest time in crafting clear, descriptive copy

#### 4. Project Category (`parent_category` - combined ~26%)
   - Music projects show the highest category importance (~8%)
   - Games and Journalism categories also show notable effects
   - Different categories have inherently different success dynamics
   - Business insight: Success benchmarks vary by category

#### 5. Video Presence (`has_video` ~7%)
   - Projects with videos are more likely to succeed
   - Videos help communicate vision and build trust with backers
   - Business insight: Creating a compelling video is worth the investment

#### 6. Geographic Location (`is_us` ~2%)
   - US-based projects show slightly different funding patterns
   - Relatively minor effect compared to other features
   - May reflect payment processing, shipping, or cultural factors


### Actionable Recommendations For Project Creators: