In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pycountry

Data Preparation

In [2]:
df = pd.read_excel("Kickstarter.xlsx")
grading_df = pd.read_excel("Kickstarter-Grading-Sample.xlsx")



#Function to convert country codes to country name
def get_country_name(code):
    try:
        return pycountry.countries.get(alpha_2=code).name
    except AttributeError:
        return "Unknown"


#Function for Pre-Processing Data for training data and grading data

#Function for Pre-Processing Data for training data and grading data

#1. Drop Columns that have no impact on prediction
df = df.drop(columns=['id','name'], axis=1)

#2. Drop Columns that can only be determined after knowing the state of the project
cols_unknown_at_launch = columns_to_drop = ['pledged', 'state_changed_at', 'backers_count', 'usd_pledged',
                   'state_changed_at_weekday', 'state_changed_at_month',
                   'state_changed_at_day', 'state_changed_at_yr',
                   'state_changed_at_hr', 'launch_to_state_change_days', 'spotlight', 'staff_pick']

df = df.drop(columns=cols_unknown_at_launch, axis=1)

#3. Drop Columns with repeated information in other columns:
cols_repeat_info = ['currency', 'deadline', 'created_at', 'launched_at']
df = df.drop(columns=cols_repeat_info, axis=1)

#4. Remove rows with state other than successful or failed
df = df[df['state'].isin(['successful', 'failed'])]
df['state'] = df['state'].apply(lambda x: 1 if x=='successful' else 0)

#5. Create new column to convert goal to USD and drop goal, rate column
df['goal_usd'] = df['goal'] * df['static_usd_rate']
df = df.drop(columns=['goal', 'static_usd_rate'], axis=1)

#6. Convert country to full name for easier interpretation
df['country'] = df['country'].apply(lambda x: get_country_name(x))

# Pre-processing Steps added after EDA

#7. Replace missing values in category column with Others (assuming these projects were not assigned a category)
df['category'] = df['category'].fillna('Others')

#8. Remove disable_communication column as it has only 1 value, so cannot contribute to prediction
df = df.drop(columns=['disable_communication'], axis=1)

#9. Keep only 1 column each for name_len and blurb_len - keeping name_len_clean and blurb_len as they have higher feature importance
corr_cols_remove = ['name_len','blurb_len_clean']
df = df.drop(columns=corr_cols_remove, axis=1)

#10. Remove created date related columns and keep only create_to_launch_days since it captures the required information. Also, it has low feature importance scores
created_date_cols_remove = ['created_at_weekday', 'created_at_month', 'created_at_day', 'created_at_yr', 'created_at_hr']
df = df.drop(columns=created_date_cols_remove, axis=1)

 #11. Remove year columns because we are predicting for future years
year_cols_remove = ['deadline_yr', 'launched_at_yr']
df = df.drop(columns=year_cols_remove, axis=1)

#12. Categorize countries into US and Non-US
df['country'] = df['country'].apply(lambda x: 'US' if x=='United States' else 'Non-US')

#13. Dummify categorical columns
categorical_cols = ['country', 'category', 'deadline_weekday', 'launched_at_weekday']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


X = df.drop(['state'], axis=1)
y = df['state']

Remove Outliers using Isolation Forest

In [3]:
#preprocessed_outliers = df.drop(columns=categorical_cols, axis=1)
from sklearn.ensemble import IsolationForest
isolforest = IsolationForest(contamination=0.05,random_state=0)
pred = isolforest.fit_predict(df)

from numpy import where
anomaly_index = where(pred==-1)
anomaly_values = df.iloc[anomaly_index]

X.drop(anomaly_values.index,inplace=True, errors='ignore')
y.drop(anomaly_values.index,inplace=True, errors='ignore')

df.drop(anomaly_values.index,inplace=True, errors='ignore')

Train-Test Split Without Standardizing Predictors

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=123)

Code for Standardizing Numeric Predictors

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Standardize predictors
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating and training the logistic regression model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = logistic_model.predict(X_test)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.6819
Precision Score: 0.6606
Recall Score: 0.1633
F1 Score: 0.2618


KNN

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating and training the KNN model
knn_model = KNeighborsClassifier()
knn_model.fit(X_train_scaled, y_train)

# Making predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.6663
Precision Score: 0.5218
Recall Score: 0.4070
F1 Score: 0.4573


Hyperparameter Tuning for KNN

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Define the parameter grid
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 10],  
    'weights': ['uniform', 'distance']
}

# Create the KNN model
knn_model = KNeighborsClassifier()

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=knn_model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics without specifying average for precision and recall
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Parameters: {'n_neighbors': 10, 'weights': 'uniform'}
Accuracy Score: 0.6769
Precision Score: 0.5643
Recall Score: 0.2834
F1 Score: 0.3774


Decision Trees

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Creating and training the Decision Tree model
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = tree_model.predict(X_test)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Printing the evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.6498
Precision Score: 0.4808
Recall Score: 0.4640
F1 Score: 0.4723


Hyperparameter Tuning for Decision Trees

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 10],  
    'min_samples_split': [2, 5, 10],  
    'min_samples_leaf': [1, 2, 4]  
}

# Create the Decision Tree model
tree_model = DecisionTreeClassifier()

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 5}
Accuracy Score: 0.7039
Precision Score: 0.5908
Recall Score: 0.4002
F1 Score: 0.4772


Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

randomforest = RandomForestClassifier(random_state=123, n_estimators=150, max_features='sqrt')
model_rf = randomforest.fit(X_train, y_train)

y_pred = model_rf.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.7289
Precision Score: 0.6502
Recall Score: 0.4269
F1 Score: 0.5154


Hyperparameter Tuning for Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],  
    'max_features': [1,5,10]
}

# Create the Random Forest model
forest_model = RandomForestClassifier()

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Parameters: {'max_features': 10, 'n_estimators': 150}
Accuracy Score: 0.7278
Precision Score: 0.6362
Recall Score: 0.4524
F1 Score: 0.5288


Gradient Boosting

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


gbt = GradientBoostingClassifier(random_state=123, n_estimators=400, max_features='sqrt')
model_gbt = gbt.fit(X_train, y_train)

y_pred = model_gbt.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.7403
Precision Score: 0.6524
Recall Score: 0.4942
F1 Score: 0.5624


Hyperparameter Tuning For Gradient Boosting

In [11]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the GradientBoostingRegressor model
gbt = GradientBoostingClassifier(random_state=0)

# Define a grid of hyperparameters to search
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of boosting stages
    'max_features': [5, 10, 15]  # Maximum number of features per tree
}

# Create the Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Parameters: {'max_features': 15, 'n_estimators': 150}
Accuracy Score: 0.7427
Precision Score: 0.6640
Recall Score: 0.4814
F1 Score: 0.5582


Artificial Neural Network

In [53]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

mlp_classifier = MLPClassifier(hidden_layer_sizes=(64, 32), momentum=0.9, random_state=0)

mlp_classifier.fit(X_train_scaled, y_train)

y_test_pred = mlp_classifier.predict(X_test_scaled)

# Calculate evaluation metrics for classification
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Print evaluation metrics
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy Score: 0.6835
Precision Score: 0.5484
Recall Score: 0.4751
F1 Score: 0.5091


Hyperparameter Tuning for Artificial Neural Network

In [58]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(64, 32), (32, 16)],  # Change values as needed
    'momentum': [0.9, 0.95]  # Change values as needed
}

# Create the MLPClassifier model
mlp_classifier = MLPClassifier(random_state=0)

# Perform GridSearchCV to find the best parameters
grid_search = GridSearchCV(estimator=mlp_classifier, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train_scaled_encoded, y_train)

# Get the best parameters and corresponding model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set with the best model
y_test_pred = best_model.predict(X_test_scaled_encoded)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)
recall = recall_score(y_test, y_test_pred)
f1 = f1_score(y_test, y_test_pred)

# Print the best parameters and evaluation metrics
print(f"Best Parameters: {best_params}")
print(f"Accuracy Score: {accuracy:.4f}")
print(f"Precision Score: {precision:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Best Parameters: {'hidden_layer_sizes': (64, 32), 'momentum': 0.9}
Accuracy Score: 0.7099
Precision Score: 0.5675
Recall Score: 0.5473
F1 Score: 0.5572
