In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('data/cleaned_encoded_kickstarter_projects.csv')
df.head()

Unnamed: 0,category,subcategory,country,launched,deadline,goal,pledged,backers,state,duration,launch_month,category_encoded,subcategory_encoded,country_encoded
0,Fashion,Fashion,United States,2009-04-21 21:02:48,2009-05-31,1000,625,30,Failed,39,4,5,52,21
1,Film & Video,Shorts,United States,2009-04-23 00:07:53,2009-07-20,80000,22,3,Failed,87,4,6,129,21
2,Art,Illustration,United States,2009-04-24 21:52:03,2009-05-03,20,35,3,Successful,8,4,0,70,21
3,Technology,Software,United States,2009-04-25 17:36:21,2009-07-14,99,145,25,Successful,79,4,13,131,21
4,Fashion,Fashion,United States,2009-04-27 14:10:39,2009-05-26,1900,387,10,Failed,28,4,5,52,21


In [3]:
# Define feature and target variable
X = df[['goal', 'duration', 'launch_month', 'category_encoded', 'subcategory_encoded', 'country_encoded']]
y = df['pledged']

X.shape

(370209, 6)

In [6]:
# Apply logarithmic transformation to 'goal' and 'pledged' columns
X['goal'] = np.log1p(X['goal'])  # log1p is used to handle zero values
y = np.log1p(y)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['goal'] = np.log1p(X['goal'])  # log1p is used to handle zero values


In [7]:
# Initialize the RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [8]:
# Train the model
model.fit(X_train, y_train)

In [9]:
# Make predictions
y_pred = model.predict(X_test)

In [10]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 10.190362704431482
R-squared: 0.0720934054204373


In [11]:
# Feature importance
feature_importances = model.feature_importances_
feature_names = X.columns

for name, importance in zip(feature_names, feature_importances):
    print(f"{name}: {importance}")

goal: 0.35485817514988854
duration: 0.15291472367314035
launch_month: 0.20169299185690406
category_encoded: 0.0716831906649023
subcategory_encoded: 0.16483965010635893
country_encoded: 0.05401126854880585


In [13]:
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Initialize the model
rf = RandomForestRegressor(random_state=42)

# Perform grid search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

Fitting 3 folds for each of 324 candidates, totalling 972 fits
Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
