In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

# Bucket every project by the calendar month of its launch date.
# Unparseable dates become NaT and are left out of the groups below.
launch_dates = pd.to_datetime(df_clean['launched'], errors='coerce')
df_clean['launch_month'] = launch_dates.dt.to_period('M')
grouped = df_clean.groupby('launch_month')

# Per-month totals: number of projects and summed funding goal.
total_projects = grouped.size()
total_goal = grouped['goal'].sum()

# Per-month outcome counts: sum a boolean mask within each month.
month_key = df_clean['launch_month']
successful_projects = df_clean['state'].eq('Successful').groupby(month_key).sum()
failed_projects = df_clean['state'].eq('Failed').groupby(month_key).sum()

# Plotly needs real timestamps rather than pandas Periods on the x-axis.
x = total_projects.index.to_timestamp()

# Upper bounds of the two y-axes; also reused for the custom top gridlines.
y1_max = 10800
y2_max = 6e8

# Interactive figure with project counts on the left axis and goal sums on the right.
fig = go.Figure()

# Left y-axis: one solid line per project-count series.
left_axis_series = (
    ('Total Projects', total_projects, '#1f77b4'),
    ('Successful Projects', successful_projects, '#2ca02c'),
    ('Failed Projects', failed_projects, '#d62728'),
)
for trace_name, monthly_counts, line_colour in left_axis_series:
    fig.add_trace(
        go.Scatter(x=x, y=monthly_counts, name=trace_name, line=dict(color=line_colour))
    )

# Right y-axis: dashed line for the summed goal amount.
goal_trace = go.Scatter(
    x=x,
    y=total_goal,
    name='Total Goal Amount',
    yaxis='y2',
    line=dict(color='#9467bd', dash='dash'),
)
fig.add_trace(goal_trace)

# Layout: both y-axes use fixed ranges with 7 evenly spaced ticks so that
# their dashed gridlines coincide.
dashed_grid = dict(showgrid=True, gridcolor='lightgrey', gridwidth=0.8, griddash='dash')
left_ticks = np.linspace(0, y1_max, 7)
right_ticks = np.linspace(0, y2_max, 7)

launch_axis = dict(
    title='Launch Year',
    tickformat='%Y',
    tickangle=0,
    tickfont=dict(size=16),
    showline=True,
    linecolor='grey',
    ticks='outside',
    **dashed_grid,
)

count_axis = dict(
    title='Number of Projects',
    range=[0, y1_max],
    tickvals=left_ticks,
    tickformat=',d',
    tickfont=dict(size=16),
    zeroline=False,
    showline=True,
    linecolor='grey',
    **dashed_grid,
)

# Secondary axis drawn on the right, labelled in millions and coloured like its trace.
goal_axis = dict(
    title='Total Goal Amount (USD)',
    overlaying='y',
    side='right',
    range=[0, y2_max],
    tickvals=right_ticks,
    ticktext=[f'{int(val/1e6)}M' for val in right_ticks],
    tickfont=dict(size=16, color='#9467bd'),
    showline=True,
    linecolor='#9467bd',
    **dashed_grid,
)

# Legend boxed in the top-left corner of the plotting area.
legend_box = dict(
    x=0.01,
    y=0.99,
    bgcolor='white',
    bordercolor='lightgrey',
    borderwidth=1,
    font=dict(size=16),
)

fig.update_layout(
    template='plotly_white',
    height=600,
    width=1000,
    margin=dict(l=80, r=80, t=40, b=60),
    xaxis=launch_axis,
    yaxis=count_axis,
    yaxis2=goal_axis,
    legend=legend_box,
)

# Draw an explicit dashed line at the top of each y-axis, spanning the full
# date range, once against the left axis and once against the right axis.
first_month, last_month = min(x), max(x)
for top_value, axis_ref in ((y1_max, 'y'), (y2_max, 'y2')):
    fig.add_shape(
        type="line",
        x0=first_month,
        x1=last_month,
        y0=top_value,
        y1=top_value,
        line=dict(color="lightgrey", width=1, dash="dash"),
        xref='x',
        yref=axis_ref,
    )

fig.show()

In [None]:
import numpy as np
import plotly.graph_objects as go
import pandas as pd

# Load data
df = pd.read_csv("../data/kickstarter_common.csv")

# Keep only projects with a final outcome (successful or failed)
df_ratio = df[df['state'].isin(['Successful', 'Failed'])].copy()

# All finished projects per country/category
total_counts = df_ratio.pivot_table(
    index='country', columns='category', values='id', aggfunc='count'
)

# Successful projects per country/category.
# A country/category pair with only failed projects does not appear in this
# pivot at all, so align it to the totals and count it as 0 successes;
# otherwise its success rate would be NaN instead of 0 %.
success_counts = (
    df_ratio[df_ratio['state'] == 'Successful']
    .pivot_table(index='country', columns='category', values='id', aggfunc='count')
    .reindex_like(total_counts)
    .fillna(0)
)

# Success rate in whole percent; stays NaN where a pair has no projects at all
success_rate = (success_counts / total_counts) * 100
success_rate = success_rate.round(0)

# Drop rows/columns that contain no data at all
success_rate_clean = success_rate.dropna(axis=0, how='all').dropna(axis=1, how='all')
success_counts_clean = success_counts.reindex_like(success_rate_clean)

# Cells should be square: derive the plotting-area size from the grid shape
n_y, n_x = success_rate_clean.shape
base_cell_size = 50
height = base_cell_size * n_y
width = base_cell_size * n_x

# Total figure size = cell grid + fixed margins (+ room for the colorbar on
# the right). Without an explicit width/height, autosize=False falls back to
# Plotly's default figure size and the computed cell grid is ignored.
plot_margin = dict(l=220, r=0, t=75, b=30)
colorbar_allowance = 120  # px reserved for colorbar, its ticks and title
fig_width = width + plot_margin['l'] + plot_margin['r'] + colorbar_allowance
fig_height = height + plot_margin['t'] + plot_margin['b']

# Hover text "<rate>%" over "(<successful count>)".
# Plotly renders text as pseudo-HTML, so the line break must be "<br>";
# a "\n" is not shown as a new line. Cells without a rate get no text
# instead of the literal "<NA>%".
rate_labels = success_rate_clean.astype("Int64").astype(str)
count_labels = success_counts_clean.fillna(0).astype(int).astype(str)
hover_text = (rate_labels + "%<br>(" + count_labels + ")").where(
    success_rate_clean.notna(), ""
)

# Interactive heatmap bound to the shared layout color axis
fig = go.Figure(data=go.Heatmap(
    z=success_rate_clean.values,
    x=success_rate_clean.columns.tolist(),
    y=success_rate_clean.index.tolist(),
    text=hover_text.values,
    hoverinfo='text',
    coloraxis="coloraxis"  # colors and colorbar are configured in layout.coloraxis
))

fig.update_layout(
    autosize=False,
    width=fig_width,
    height=fig_height,
    margin=plot_margin,

    xaxis=dict(
        tickangle=45,
        side='top',
        tickfont=dict(size=12),
        automargin=False,
        ticklabelposition="outside top"
    ),
    yaxis=dict(
        scaleanchor="x",  # lock the y scale to x so every cell is square
        tickfont=dict(size=12),
        automargin=False,
        ticklabelposition="outside"
    ),

    coloraxis=dict(
        # The colorscale has to live here: once a trace references a
        # coloraxis, a colorscale set on the trace itself is not used.
        colorscale='YlGnBu',
        colorbar=dict(
            title="Success Rate (%)",
            lenmode="fraction",
            len=0.95,               # nearly the full height of the heatmap
            y=0.5,
            yanchor='middle',
            thickness=20,
            x=1.01,
            xanchor='left',
            ticks='outside'
        )
    )
)

fig.show()

# HTML export: the same figure is written under two file names, each loading
# plotly.js from the CDN rather than embedding it.
html_output_path = "../figures/kickstarter_heatmap_square_cells_adjusted.html"
for export_path in (html_output_path, "../figures/kickstarter_heatmap.html"):
    fig.write_html(export_path, include_plotlyjs='cdn')

In [None]:

# Reset matplotlib to its defaults, then apply the notebook's plot style
plt.rcdefaults()
plt.style.use('fivethirtyeight')

# Interval edges 0.0, 0.1, ..., 1.5 and matching labels "0.0–0.1", ..., "1.4–1.5"
bins = [i / 10 for i in range(16)]
labels = [f'{lo:.1f}–{hi:.1f}' for lo, hi in zip(bins[:-1], bins[1:])]

# Pledged-to-goal ratio. A goal of 0 is masked to NaN so the ratio is NaN
# (and the row falls into no bin) rather than inf. Masking with NaN keeps the
# column numeric; substituting pd.NA can turn it into an object column that
# pd.cut cannot compare against the bin edges.
goal_nonzero = df['goal'].where(df['goal'] != 0)
df['pledged_to_goal'] = df['pledged'] / goal_nonzero

# Left-closed intervals [lo, hi), plus one final open-ended bin collecting
# every ratio of 1.5 or more (labelled '>1.5').
df['pledged_ratio_bin'] = pd.cut(
    df['pledged_to_goal'],
    bins=bins + [float('inf')],
    labels=labels + ['>1.5'],
    include_lowest=True,
    right=False
)

# Count projects per ratio bin, kept in bin order (computed once and reused
# for both the printed table and the chart)
final_bin_counts = df['pledged_ratio_bin'].value_counts().sort_index()
print(final_bin_counts)

# Bar chart of the bin counts on an explicit Axes
ratio_fig, ratio_ax = plt.subplots(figsize=(12, 6))
final_bin_counts.plot(kind='bar', color='steelblue', ax=ratio_ax)
ratio_ax.set_title('Number of Projects by Pledged-to-Goal Ratio Intervals (Extended)')
ratio_ax.set_xlabel('Pledged-to-Goal Ratio')
ratio_ax.set_ylabel('Number of Projects')
ratio_ax.tick_params(axis='x', labelrotation=45)
ratio_ax.grid(axis='y')  # horizontal gridlines only
ratio_fig.tight_layout()
plt.show()

In [None]:
#Backers Modelling
# Re-import necessary packages due to code execution reset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error, max_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error, median_absolute_error, max_error, mean_poisson_deviance
from sklearn.linear_model import Ridge



# Reload dataset
df = pd.read_csv("../data/kickstarter_common.csv")

# Pledged-to-goal ratio. A goal of 0 is masked to NaN so the column stays
# numeric; substituting pd.NA can yield an object column on which the
# >= / < comparisons below raise "boolean value of NA is ambiguous".
goal_nonzero = df['goal'].where(df['goal'] != 0)
df['pledged_to_goal'] = df['pledged'] / goal_nonzero

# NOTE(review): this overwrites the original outcome label in 'state' with the
# numeric ratio. Kept because later cells may rely on it; the filter below
# reads the ratio column directly.
df['state'] = df['pledged_to_goal']

# Feature: number of whitespace-separated words in the project title
# (computed once here; the filtered copy below inherits the column)
df['title_word_count'] = df['name'].fillna('').astype(str).str.split().str.len()

# Keep campaigns that reached their goal by less than 10 % (ratio in [1.0, 1.1))
ratio = df['pledged_to_goal']
df_filtered = df[(ratio >= 1.0) & (ratio < 1.1)].copy()

# Define features and target
features = ['goal', 'duration', 'launch_month', 'launch_year',
            'category', 'subcategory', 'country', 'title_word_count']
target = 'backers'

# Drop rows with a non-positive goal or any missing feature/target value
df_filtered = df_filtered[df_filtered['goal'] > 0].dropna(subset=features + [target])

# Split into X and y
X = df_filtered[features]
y = df_filtered[target]

# Hold out 20 % for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical and numeric features
categorical_features = ['category', 'subcategory', 'country']
numeric_features = ['goal', 'duration', 'launch_month', 'launch_year', 'title_word_count']

# Dense one-hot encoding that works across scikit-learn versions: newer
# releases only accept `sparse_output`, older ones only `sparse`. An
# unsupported keyword raises TypeError at construction time.
one_hot_options = dict(drop='first', handle_unknown='ignore')
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, **one_hot_options)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, **one_hot_options)

# Scale numeric columns, one-hot encode categorical columns
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', one_hot_encoder, categorical_features)
])

# Ridge regression; the solver is chosen automatically
ridge = Ridge(solver='auto')

# Build pipeline: preprocessing followed by the regressor
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', ridge)
])

# Grid over the regularisation strength of the pipeline's 'regressor' step
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# 5-fold grid search selecting the alpha with the lowest mean absolute error
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

# Evaluate the refit best model on the held-out test data
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
y_pred_best = best_model.predict(X_test)

mae_best = mean_absolute_error(y_test, y_pred_best)
# RMSE as the square root of the MSE: the `squared=False` keyword is not
# accepted by newer scikit-learn releases.
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
r2_best = r2_score(y_test, y_pred_best)

# Same model and same predictions under the older names; aliased rather than
# predicting a second time.
y_pred_test = y_pred_best
mae_test = mae_best

best_params, mae_best, rmse_best, r2_best

In [None]:
# Define columns
numeric_features = ['goal', 'duration', 'launch_month', 'launch_year', 'title_word_count']
categorical_features = ['category', 'subcategory', 'country']

# Dense one-hot encoding that works across scikit-learn versions: newer
# releases only accept `sparse_output`, older ones only `sparse`. An
# unsupported keyword raises TypeError at construction time.
one_hot_options = dict(drop='first', handle_unknown='ignore')
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False, **one_hot_options)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False, **one_hot_options)

# Define preprocessing: scale numeric columns, one-hot encode categorical ones
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_features),
    ('cat', one_hot_encoder, categorical_features)
])

# Define pipeline: preprocessing followed by a default Ridge regressor
pipeline_estimator = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', Ridge())
])

# Parameters of a pipeline step are addressed as '<step name>__<parameter>'
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# 5-fold grid search over alpha, scored by (negated) mean absolute error
grid_search = GridSearchCV(
    estimator=pipeline_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error'
)

grid_search.fit(X_train, y_train)

In [None]:
# Report the hyperparameters selected by the grid search
tuned_params = grid_search.best_params_
print("Best parameters:", tuned_params)


In [None]:
# Predict with the estimator refit by the current `grid_search` object, so the
# predictions always match the most recent search rather than a `best_model`
# variable captured before the search was re-run.
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer

# Create new pipeline using RandomForestRegressor (fixed seed for reproducibility)
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Set parameter grid for tuning (small for speed, can expand later)
rf_param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5]
}

# Multiple scoring metrics, expressed with scikit-learn's built-in scorer
# names. The error metrics are negated so that "greater is better" holds for
# every entry. Using the built-in RMSE scorer avoids passing `squared=False`
# to mean_squared_error, which newer scikit-learn releases reject.
scoring_metrics = {
    'MAE': 'neg_mean_absolute_error',
    'RMSE': 'neg_root_mean_squared_error',
    'R2': 'r2'
}

# Grid search with RandomForest. All metrics above are recorded in
# cv_results_ (as mean_test_MAE, mean_test_RMSE, mean_test_R2, ...); the
# final model is selected and refit on MAE, the primary metric.
rf_grid_search = GridSearchCV(
    rf_pipeline,
    param_grid=rf_param_grid,
    cv=3,
    scoring=scoring_metrics,
    refit='MAE',
    return_train_score=True,
    n_jobs=-1
)

# Fit the model
rf_grid_search.fit(X_train, y_train)

# Evaluate the refit best random-forest pipeline on the held-out test set
rf_best_model = rf_grid_search.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)

# Calculate evaluation metrics
mae_rf = mean_absolute_error(y_test, y_pred_rf)
# RMSE as the square root of the MSE: the `squared=False` keyword is not
# accepted by newer scikit-learn releases.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
r2_rf = r2_score(y_test, y_pred_rf)
medae_rf = median_absolute_error(y_test, y_pred_rf)
maxe_rf = max_error(y_test, y_pred_rf)

# Summary displayed as the cell output
{
    'Best Parameters': rf_grid_search.best_params_,
    'MAE': mae_rf,
    'RMSE': rmse_rf,
    'R2 Score': r2_rf,
    'Median Absolute Error': medae_rf,
    'Max Error': maxe_rf
}