In [None]:
#Imports
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): this blanket filter silences every warning raised later in the
# notebook (including library deprecation notices); consider narrowing it.
warnings.filterwarnings('ignore')

# Load the Kickstarter dataset used by every later cell.
df = pd.read_csv('../data/kickstarter_common.csv')

# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly -- wrapping it in print() would add a stray "None" line.
df.info()
print(df.describe())

# Number of projects per country.
print(df['country'].value_counts())

## Step 1: Success Rate Encoding for category and country
We’ll calculate the success rate (proportion of "successful" projects) for each unique value in `category` and `country`, then map these rates back to the DataFrame as new columns. This will give us numerical representations of how successful each category and country tends to be, which we’ll use in clustering.

In [None]:
# Success rate per category: the mean of the boolean "state is Successful"
# flag within each category is the share of successful projects there.
category_success_rate = df['state'].eq('Successful').groupby(df['category']).mean()
df['category_success_rate'] = df['category'].map(category_success_rate)

# Same encoding for country.
country_success_rate = df['state'].eq('Successful').groupby(df['country']).mean()
df['country_success_rate'] = df['country'].map(country_success_rate)

# Sanity check: show each original column next to its encoded rate.
print(df.head()[['category', 'category_success_rate', 'country', 'country_success_rate']])

## Step 2: Prepare Features for Clustering
For clustering, we need a set of numerical features that define project similarity. We’ll use `goal`, `duration`, `launch_month`, `launch_year`, `category_success_rate`, and `country_success_rate`. Since `goal` might be skewed, we’ll apply a log transformation to it (`log1p`, so that a goal of 0 stays finite) and use the resulting `log_goal` in place of the raw value. We’ll also standardize all features to ensure they’re on the same scale, which is crucial for K-Means clustering.

In [None]:
from sklearn.preprocessing import StandardScaler

# Apply log transformation to 'goal' (log1p adds 1 first to avoid log(0))
df['log_goal'] = np.log1p(df['goal'])

# Features used for clustering, defined once: 'log_goal' takes the place of
# the raw 'goal' column.
features = ['log_goal', 'duration', 'launch_month', 'launch_year', 'category_success_rate', 'country_success_rate']

# Extract the feature matrix
X = df[features]

# Standardize the features; the fitted scaler is kept so that new inputs can
# later be transformed onto the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Check the scaled data
print("Scaled features shape:", X_scaled.shape)
print("Sample of scaled features:\n", X_scaled[:5])

## Step 3: Perform K-Means Clustering
We’ll use K-Means to cluster the projects. To choose the number of clusters (`k`), we’ll try a range of values and use the elbow method to find a reasonable `k`. Then, we’ll fit the model with the chosen `k` and assign cluster labels to each project.

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Shared K-Means settings. n_init is pinned explicitly because scikit-learn's
# default for it differs between releases (10 vs. 'auto'); leaving it implicit
# makes the inertia curve and the final labels depend on the installed
# version even though random_state is fixed.
kmeans_params = {'random_state': 42, 'n_init': 10}

# Elbow method to determine optimal k: fit one model per candidate k and
# record its inertia (within-cluster sum of squared distances).
k_range = range(2, 11)  # Testing k from 2 to 10
inertia = [
    KMeans(n_clusters=n_clusters, **kmeans_params).fit(X_scaled).inertia_
    for n_clusters in k_range
]

# Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.show()

# Choose k (e.g., 5) based on the elbow plot and fit the model
k = 5  # Adjust this based on your elbow plot
kmeans = KMeans(n_clusters=k, **kmeans_params)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Verify cluster assignment
print(df[['id', 'category', 'country', 'state', 'cluster']].head())

## Step 4: Explore Clustering Outcomes
Now that we have cluster labels, let’s explore what they mean. We’ll look at the size of each cluster, the success rate within each cluster, and some summary statistics to understand the groupings.

In [None]:
# How many projects landed in each cluster
print("Cluster sizes:", df['cluster'].value_counts(), sep="\n")

# Share of successful projects within each cluster (mean of a boolean flag)
cluster_success_rate = df['state'].eq('Successful').groupby(df['cluster']).mean()
print("\nSuccess rate per cluster:", cluster_success_rate, sep="\n")

# Per-cluster mean of every clustering feature
cluster_summary = df[features].groupby(df['cluster']).mean()
print("\nSummary statistics by cluster:", cluster_summary, sep="\n")

# Optional: bar chart of the success rate by cluster
ax = cluster_success_rate.plot.bar(figsize=(8, 5))
ax.set_xlabel('Cluster')
ax.set_ylabel('Success Rate')
ax.set_title('Success Rate by Cluster')
plt.show()

## Step 5: Find Closest Projects for a Sample Input
To surface the 5 closest successful and 5 closest failed projects for a given project, let’s simulate a new project input and find its nearest neighbors in the data. We’ll use Euclidean distance on the scaled features. For contrast, we’ll also report the single most distant successful and failed projects.

In [None]:
from sklearn.metrics import pairwise_distances

def _success_rate_or_default(rates, key, label, default):
    """Look up ``key`` in a success-rate Series, falling back to ``default``.

    Args:
        rates: Series of success rates indexed by category or country value.
        key: The value to look up.
        label: Human-readable name of the field, used in the notice.
        default: Rate returned when ``key`` does not occur in ``rates``.

    Returns:
        The stored rate for ``key``, or ``default`` (after printing a notice)
        when the value never occurs in the data.
    """
    if key in rates.index:
        return rates[key]
    print(f"Note: {label} {key!r} not found in the data; using the overall success rate.")
    return default


# Simulate a new project input (example values)
new_project = {
    'goal': 10000,  # $10,000 goal
    'duration': 30,  # 30 days
    'launch_month': 4,  # April
    'launch_year': 2018,
    'category': 'Technology',  # Map to its success rate
    'country': 'United States'  # Map to its success rate
}

# Neutral fallback for a category/country that never occurs in the data
# (a plain lookup would stop the notebook with a KeyError).
overall_success_rate = (df['state'] == 'Successful').mean()

# Prepare the new project data with the same derived features as the dataset
new_project['log_goal'] = np.log1p(new_project['goal'])
new_project['category_success_rate'] = _success_rate_or_default(
    category_success_rate, new_project['category'], 'category', overall_success_rate
)
new_project['country_success_rate'] = _success_rate_or_default(
    country_success_rate, new_project['country'], 'country', overall_success_rate
)

# Build the vector in the order given by `features` so it always lines up
# with the columns the scaler was fitted on.
new_project_data = [new_project[name] for name in features]

# Scale the new project data. A DataFrame with the fitted column names is
# passed because the scaler was fitted on a DataFrame; a bare list drops the
# feature names.
new_project_scaled = scaler.transform(pd.DataFrame([new_project_data], columns=features))

# Calculate distances to all projects
distances = pairwise_distances(new_project_scaled, X_scaled, metric='euclidean').flatten()

# Add distances to the DataFrame
df['distance_to_new'] = distances

successful_projects = df[df['state'] == 'Successful']
failed_projects = df[df['state'] == 'Failed']

# Get 5 closest successful and failed projects
closest_successful = successful_projects.nsmallest(5, 'distance_to_new')
closest_failed = failed_projects.nsmallest(5, 'distance_to_new')

# Get the most distant successful and failed projects
most_distant_successful = successful_projects.nlargest(1, 'distance_to_new')
most_distant_failed = failed_projects.nlargest(1, 'distance_to_new')

# Display results (one shared column list for all four tables)
display_cols = ['name', 'category', 'country', 'goal', 'pledged', 'state', 'distance_to_new']
print("\n5 Closest Successful Projects:")
print(closest_successful[display_cols])
print("\n5 Closest Failed Projects:")
print(closest_failed[display_cols])
print("\nMost Distant Successful Project:")
print(most_distant_successful[display_cols])
print("\nMost Distant Failed Project:")
print(most_distant_failed[display_cols])

## Step 6: Generate Links to the Closest Successful and Failed Projects
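After identifying the 5 closest successful and 5 closest failed projects in Step 5 based on distance in the scaled feature space, we’ll generate Kickstarter search links for these projects by URL-encoding each project’s `name` into a search query. The dataset does not give us the projects’ actual URLs, so these links open a Kickstarter search for the name rather than the project page itself. This will allow users to look up these similar projects directly.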

In [None]:
import urllib.parse

# Function to create a Kickstarter search URL from project name
def create_search_url(project_name):
    """Build a Kickstarter search URL for a project name.

    Args:
        project_name: Project title. Non-string values are converted with
            ``str()``; ``None`` and float NaN (a missing name in a pandas
            column) are treated as an empty search term.

    Returns:
        The search URL as a string, with the name percent-encoded as the
        ``term`` query parameter.
    """
    # NaN is the only float that is not equal to itself.
    is_missing = project_name is None or (
        isinstance(project_name, float) and project_name != project_name
    )
    term = "" if is_missing else str(project_name)
    # URL-encode the name to handle spaces and special characters; quote()
    # only accepts str/bytes, hence the normalisation above.
    encoded_name = urllib.parse.quote(term)
    return f"https://www.kickstarter.com/discover/advanced?ref=nav_search&term={encoded_name}"

import html
from IPython.display import display, HTML


def _clickable_link_table(projects):
    """Return a name/link table whose cells are safe to render as raw HTML.

    Args:
        projects: DataFrame with 'name' and 'search_link' columns.

    Returns:
        A new DataFrame with HTML-escaped names and each search link wrapped
        in an ``<a>`` tag that opens in a new tab.
    """
    table = projects[['name', 'search_link']].copy()
    # The table is rendered with escape=False, so project names (free text
    # from the dataset) must be escaped here to show up as literal text.
    table['name'] = table['name'].map(
        lambda name: html.escape(name) if isinstance(name, str) else name
    )
    table['search_link'] = table['search_link'].map(
        lambda url: f'<a href="{html.escape(url)}" target="_blank">{html.escape(url)}</a>'
    )
    return table


# Generate search URLs for the closest successful and failed projects.
# assign() builds new frames instead of writing a column into the existing ones.
closest_successful = closest_successful.assign(
    search_link=closest_successful["name"].apply(create_search_url)
)
closest_failed = closest_failed.assign(
    search_link=closest_failed["name"].apply(create_search_url)
)

link_cols = ['name', 'category', 'country', 'goal', 'pledged', 'state', 'distance_to_new', 'search_link']

# pandas shortens any cell longer than display.max_colwidth (50 characters by
# default) to "...", which would cut the URLs and the <a> markup; lift the
# limit while the links are printed and rendered.
with pd.option_context('display.max_colwidth', None):
    # Display results with search links
    print("\n5 Closest Successful Projects with Search Links:")
    print(closest_successful[link_cols])
    print("\n5 Closest Failed Projects with Search Links:")
    print(closest_failed[link_cols])

    # Optional: Display clickable links in Jupyter Notebook
    successful_html = _clickable_link_table(closest_successful)
    failed_html = _clickable_link_table(closest_failed)

    print("\nClickable Links for Closest Successful Projects:")
    display(HTML(successful_html.to_html(escape=False)))
    print("\nClickable Links for Closest Failed Projects:")
    display(HTML(failed_html.to_html(escape=False)))