In [None]:
# Loading libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot

In [None]:
# Import the dataset
# The path is relative, so the workbook must sit in the notebook's working directory.
kickstarter = pd.read_excel('Kickstarter.xlsx')

In [None]:
# For the purpose of this project, we only need projects whose state is "successful" or "failed".
# .copy() gives an independent frame, so the column assignments below do not write
# into a slice of `kickstarter` (avoids SettingWithCopyWarning / silent no-op writes).
df = kickstarter[kickstarter['state'].isin(['successful', 'failed'])].copy()

# Convert the target variable to binary (1 = successful, 0 = failed).
# An explicit mapping yields an integer column without relying on replace() downcasting.
df['state'] = df['state'].map({'successful': 1, 'failed': 0})

# Convert goal to usd_goal = goal * static_usd_rate, then drop the local-currency goal
df['usd_goal'] = df['goal'] * df['static_usd_rate']
df = df.drop(columns='goal')

df.shape

# 1. Data Preprocessing

In [None]:
# Check the number of rows and columns (displayed as a (rows, columns) tuple)
df.shape

### 1.1 Detect Duplicated Records

In [None]:
# Remove fully duplicated rows; drop_duplicates keeps the first occurrence by default
df = df.drop_duplicates()

### 1.2 Drop out-of-scope predictors

According to the project instruction, we can only use the predictors "that are available at the moment when a new project is launched."
Therefore, we do not need any predictors regarding 'states' of the project.

In [None]:
# Remove every field derived from the project's state change.
df = df.drop(
    columns=[
        'state_changed_at',
        'state_changed_at_weekday',
        'state_changed_at_month',
        'state_changed_at_day',
        'state_changed_at_yr',
        'state_changed_at_hr',
        'launch_to_state_change_days',
    ]
)

The information about pledged, usd_pledged, staff_pick, backers_count, and spotlight won't be available at the moment when the project is launched. Therefore, we remove them as well.

In [None]:
# Remove the fields that are only known after a project has launched.
df = df.drop(
    columns=[
        'pledged',
        'usd_pledged',
        'staff_pick',
        'backers_count',
        'spotlight',
    ]
)

### 1.3 Identify unique identifiers

Look at the number of unique values in each column

In [None]:
# Number of distinct values per column: 1 means a constant column,
# a count close to the row count suggests an identifier-like column.
df.nunique()

Note that disable_communication has only one unique value, so it won't be useful for our prediction.

[id, name, deadline, created_at, launched_at] are almost unique identifiers, so we should drop them as well.

In [None]:
# Remove the constant column and the near-unique identifier / timestamp columns.
df = df.drop(
    columns=[
        'disable_communication',
        'id',
        'name',
        'deadline',
        'created_at',
        'launched_at',
    ]
)

### 1.4 Handle Missing Values

In [None]:
# Locate every missing cell: np.where returns (row positions, column positions)
missing_values = np.where(df.isnull())

# Names of the columns that hold at least one missing cell
df.columns[list(set(missing_values[1]))]

In [None]:
# Ratio of missing cells to rows. This equals the share of affected rows
# only when each row has at most one missing cell.
missing_values[0].size / len(df)

A 10% share of missing values is not that small. Let's look into the column.

In [None]:
# List the distinct category labels (NaN appears here if the column has missing values)
df['category'].unique()

Instead of dropping the missing values, we can replace the null value with 'Unknown'.

In [None]:
# Give projects without a category their own explicit 'Unknown' level
df = df.fillna({'category': 'Unknown'})
df.shape

### 1.5 Detect collinearity between variables

In [None]:
# Check if there is any collinearity between the numeric variables
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(26, 6))
# df still holds string columns here (category, country, weekdays, ...).
# numeric_only=True restricts the correlation to numeric columns; without it
# pandas >= 2.0 raises on the non-numeric ones.
heatmap = sns.heatmap(df.corr(numeric_only=True), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
# Save before the cell finishes rendering so the written file contains the drawn figure
plt.savefig('heatmap_classification.png', dpi=300, bbox_inches='tight')

According to the correlation heatmap, the following variables are highly correlated:
- name_len and name_len_clean, 
- blurb_len and blurb_len_clean,
- deadline_yr and created_at_yr and launched_at_yr

For each pair, we only need to keep one of them.

In [None]:
# Drop the redundant member(s) of each highly correlated group.
df = df.drop(
    columns=[
        'name_len_clean',
        'blurb_len_clean',
        'created_at_yr',
        'launched_at_yr',
    ]
)

In [None]:
# Class balance of the target: number of rows per value of `state`
df['state'].value_counts()

### 1.6 Handle Categorical Variables

In [None]:
# Check variable types; object columns still need to be encoded before modelling
df.dtypes

In [None]:
# Encode the three weekday columns as integers, Monday = 1 through Sunday = 7
cols = ['deadline_weekday','created_at_weekday','launched_at_weekday']
weekday_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
weekday_numbers = list(range(1, 8))
df[cols] = df[cols].replace(weekday_names, weekday_numbers)

In [None]:
# Number of projects per category level
df['category'].value_counts()

In [None]:
# Collapse both columns into binary flags: 1 for 'US' / 'USD', 0 for anything else
df['country'] = np.where(df['country'].eq('US'), 1, 0)
df['currency'] = np.where(df['currency'].eq('USD'), 1, 0)

# The notebook treats the two flags as identical, so only country is kept
# NOTE(review): the equality of the two flags is assumed, not checked here
df = df.drop(columns='currency')

In [None]:
# One-hot encode country and category: each distinct value becomes its own indicator column
df = pd.get_dummies(df, columns = ['country','category'])
df.head()

### 1.7 Remove Outliers

In [None]:
from sklearn.ensemble import IsolationForest
iforest = IsolationForest(n_estimators=100, contamination=.05, random_state=0)

# Fit the detector on the continuous predictors only
newdf = df[['usd_goal','static_usd_rate','name_len','blurb_len','create_to_launch_days','launch_to_deadline_days']]
pred = iforest.fit_predict(newdf)        # -1 = anomaly, 1 = inlier
score = iforest.decision_function(newdf)

# Extracting anomalies
from numpy import where
anomaly_index = where(pred == -1)        # positional row numbers of the anomalies
anomaly_values = df.iloc[anomaly_index]

# Drop the anomalies with a boolean mask aligned to the rows of df.
# The positional numbers in anomaly_index must not be passed to df.drop():
# drop() works on index labels, which no longer match positions after the
# earlier filtering, so the wrong rows would be removed.
df = df.loc[pred != -1]

df.shape

## 2. Feature Selection

In [None]:
from sklearn.preprocessing import StandardScaler

# Target is `state`; every remaining column is a candidate predictor
y = df['state']
X = df.drop(columns='state')

# Standardize the predictors
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [None]:
# Feature selection using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt
import seaborn as sns

rf = RandomForestClassifier(random_state=0)
model = rf.fit(X_std, y)

# Permutation importance is computed for reference; only the impurity-based
# importances are plotted below.
result = permutation_importance(rf, X_std, y, n_repeats=1,
                                random_state=0)
perm_sorted_idx = result.importances_mean.argsort()

tree_importance_sorted_idx = np.argsort(rf.feature_importances_)
tree_indices = np.arange(0, len(rf.feature_importances_)) + 0.5

fig, ax1 = plt.subplots(1, 1, figsize=(6, 10))
ax1.barh(tree_indices,
         rf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# Fix the tick positions first, then label them, so each label lines up with its bar
ax1.set_yticks(tree_indices)
ax1.set_yticklabels(X.columns[tree_importance_sorted_idx])
ax1.set_ylim((0, len(rf.feature_importances_)))
ax1.set_xlabel('Impurity-based importance')
ax1.set_title('Random Forest feature importance')

fig.tight_layout()
# Save BEFORE plt.show(): show() finalizes the figure, so saving afterwards can write a blank image
fig.savefig('RandomForest_FeatureImportance.png', dpi=300, bbox_inches='tight')
plt.show()

# Print feature importance
#pd.Series(model.feature_importances_, index = X.columns).sort_values(ascending = False).plot(kind = 'bar',figsize = (14,6))

In [None]:
# Rank the predictors by the fitted forest's impurity-based importance and keep the top 15
X = df.drop(columns='state')
importances = pd.Series(model.feature_importances_, index=X.columns)
sorted_features = importances.sort_values(ascending=False)
chosen_features = list(sorted_features.index[:15])
#chosen_features = sorted_features[sorted_features > 0.04].index.to_list()


# 3. Classification Models

In [None]:
X = df[chosen_features]

#X = df.loc[:,df.columns != 'state']
y = df["state"]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split first, then fit the scaler on the training rows only, so the mean and
# standard deviation of the test rows do not leak into the training data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 5)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full standardized matrix, used by the cross-validation cells
X_std = scaler.transform(X)

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 1000)

from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(max_depth=10, random_state=0)

from sklearn.ensemble import RandomForestClassifier
# Seeded like the other models so the comparison is reproducible
rf = RandomForestClassifier(oob_score=True, random_state=0)

from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier(random_state = 0)

from sklearn.neural_network import MLPClassifier
# (3,) is a one-element tuple: a single hidden layer with 3 units
ann = MLPClassifier(hidden_layer_sizes=(3,), random_state=0)

from sklearn.svm import SVC
svm = SVC(kernel="linear", random_state=0) #, C=0.5, gamma = 3)

# 5-fold cross-validated accuracy for every candidate, including the decision tree
from sklearn.model_selection import cross_val_score
scores_log = cross_val_score(lr, X=X_std, y=y, cv=5)
scores_dt = cross_val_score(dt, X=X_std, y=y, cv=5)
scores_rf = cross_val_score(rf, X=X_std, y=y, cv=5)
scores_gbt = cross_val_score(gbt, X=X_std, y=y, cv=5)
scores_ann = cross_val_score(ann, X=X_std, y=y, cv=5)
scores_svm = cross_val_score(svm, X=X_std, y=y, cv=5)

# Label each row so the fold scores can be attributed to a model
for label, fold_scores in [('Logistic', scores_log), ('DecisionTree', scores_dt),
                           ('RandomForest', scores_rf), ('GradientBoosting', scores_gbt),
                           ('ANN', scores_ann), ('SVM', scores_svm)]:
    print(label, fold_scores)

### 3.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised to 1000 iterations for the solver
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)
model_logit = lr  # fit() returns the estimator itself

### 3.2 DecisionTree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# The default grows a full tree; max_depth=10 caps the depth to avoid overfitting.
# random_state is fixed, like the other models in this notebook, so reruns give the same tree.
dt = DecisionTreeClassifier(max_depth=10, random_state=0)
model_dt = dt.fit(X_train,y_train)

### 3.3 Random Forest

In [None]:
# Build the model
from sklearn.ensemble import RandomForestClassifier
# random_state is fixed so the forest and its out-of-bag score are reproducible across reruns
rf = RandomForestClassifier(oob_score=True, random_state=0)
model_rf = rf.fit(X_train, y_train)

# Out-of-bag accuracy estimate computed on the training data
model_rf.oob_score_

### 3.4 Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with library defaults and a fixed seed
gbt = GradientBoostingClassifier(random_state=0)
gbt.fit(X_train, y_train)
model_gbt = gbt  # fit() returns the estimator itself

### 3.5 K-Nearest Neighbors

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Choose k by 5-fold cross-validation on the TRAINING split only.
# Picking k by its accuracy on X_test would tune the model on the data later
# used to report its performance, making the reported test score optimistic.
accuracy = 0
bestK = 0
for i in range(1, 21):
    candidate_knn = KNeighborsClassifier(n_neighbors=i)
    cv_accuracy = cross_val_score(candidate_knn, X_train, y_train,
                                  cv=5, scoring='accuracy').mean()
    if cv_accuracy > accuracy:
        accuracy = cv_accuracy
        bestK = i
    #print("k = ", i, "cv accuracy =", cv_accuracy)

# Refit on the whole training split with the selected k
knn = KNeighborsClassifier(n_neighbors=bestK) #,  weights = 'distance')
model_knn = knn.fit(X_train, y_train)

### 3.6 Artificial Neural Network

In [None]:
# Compare single-hidden-layer networks of width 1..20 by mean 10-fold CV accuracy
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
for i in range(1, 21):
    model = MLPClassifier(hidden_layer_sizes=(i,), max_iter=1000, random_state=0)
    scores = cross_val_score(model, X=X_std, y=y, cv=10)
    print(i, ":", scores.mean())

In [None]:
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
# (3,) is a one-element tuple: a single hidden layer with 3 units
ann = MLPClassifier(hidden_layer_sizes=(3,), random_state=0)
model_mlp = ann.fit(X_train,y_train)

# Show [accuracy, precision, recall, f1] on the test split.
# These are computed inline because the model_metrics helper is defined in a later
# cell; calling it here raises NameError when the notebook runs top to bottom.
y_test_pred_mlp = model_mlp.predict(X_test)
[metrics.accuracy_score(y_test, y_test_pred_mlp),
 metrics.precision_score(y_test, y_test_pred_mlp),
 metrics.recall_score(y_test, y_test_pred_mlp),
 metrics.f1_score(y_test, y_test_pred_mlp)]

In [None]:
## Find the best hyper-parameter
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(max_iter=5000, random_state=0)

# Candidate widths 1..21 for a single hidden layer
parameter = {'hidden_layer_sizes': range(1,22)}
grid_search = GridSearchCV(estimator = mlp, param_grid= parameter,
                           scoring = "accuracy", verbose=True)
# Search on the training split only. model_mlp is later scored on X_test, so
# fitting on the full data would evaluate the model on rows it was trained on.
model_mlp = grid_search.fit(X_train, y_train)

### 3.7 Support Vector Machine

In [None]:
# Find the optimal gamma for the RBF kernel
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
for i in range(1,11):
    svm_rbf = SVC(kernel = "rbf", random_state=0, C =0.5, gamma = i)
    # cross_val_score clones and refits the estimator on each fold, so no separate
    # fit is needed. The search runs on the training split, keeping the test split
    # untouched for the final evaluation.
    scores = cross_val_score(svm_rbf, X=X_train, y=y_train, cv=10)
    print("gamma = ",i,", score = ", sum(scores)/len(scores))

In [None]:
# Build the SVM model using a linear kernel
from sklearn.svm import SVC
# gamma is omitted: it only parameterizes the 'rbf', 'poly' and 'sigmoid' kernels
# and is ignored by the linear kernel, so passing it suggested tuning that never applied.
svm = SVC(kernel="linear", random_state=0, C=0.5)
model_svm = svm.fit(X_train, y_train)

### Model Performance Comparison

In [None]:
from sklearn import metrics

def model_metrics(model,X,y):
    """Score a fitted classifier on (X, y).

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature matrix passed to ``model.predict``.
    y : true labels compared against the predictions.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1]`` from the ``sklearn.metrics``
        scorers, in that order.
    """
    predictions = model.predict(X)

    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    return [scorer(y, predictions) for scorer in scorers]

In [None]:
# Score every fitted model on the held-out test split.
# Each value is the [accuracy, precision, recall, f1] list returned by model_metrics.
model_performance = {
    'Logistic': model_metrics(model_logit,X_test,y_test),
    'KNN': model_metrics(model_knn,X_test,y_test),

    'DecisionTree': model_metrics(model_dt,X_test,y_test),
    'RandomForest': model_metrics(model_rf,X_test,y_test),
    'GradientBoosting': model_metrics(model_gbt,X_test,y_test),

    'ANN': model_metrics(model_mlp,X_test,y_test),
    'SVM': model_metrics(model_svm,X_test,y_test)
}

# One row per model; the row and column labels had typos ('Logitstic', 'percision')
pd.DataFrame.from_dict(model_performance, orient='index',columns = ['accuracy','precision','recall','f1_score'])

# 4. Evaluation on Grading Dataset

In [None]:
# Loading libraries
import pandas as pd
import numpy as np

# Import the dataset
grading = pd.read_excel('Kickstarter-Grading-Sample.xlsx')

# For the purpose of this project, we only need projects whose state is "successful" or "failed".
# .copy() makes the column assignments below act on an independent frame, not a slice of `grading`.
df_test = grading[grading['state'].isin(['successful','failed'])].copy()
df_test['state'] = df_test['state'].map({'successful': 1, 'failed': 0})

# Convert goal to usd_goal = goal * static_usd_rate
df_test['usd_goal'] = df_test['goal'] * df_test['static_usd_rate']
df_test = df_test.drop(columns = 'goal')

# Remove fully duplicated records
df_test = df_test.drop_duplicates()

### Drop out-of-scope predictors
df_test = df_test.drop(columns = ['state_changed_at','state_changed_at_weekday','state_changed_at_month',
'state_changed_at_day', 'state_changed_at_yr','state_changed_at_hr','launch_to_state_change_days',
'pledged','staff_pick','backers_count','spotlight','disable_communication',
'id','name','deadline', 'created_at', 'launched_at','usd_pledged','name_len_clean','created_at_yr', 'launched_at_yr'])

### Handle Categorical Variables
cols = ['deadline_weekday','created_at_weekday','launched_at_weekday']
df_test[cols] = df_test[cols].replace(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],[1,2,3,4,5,6,7])

# dummify the other categorical variables
df_test['category'] = df_test['category'].fillna('Unknown')

df_test['country'] = np.where(df_test['country'] == 'US',1,0)
df_test['currency'] = np.where(df_test['currency'] == 'USD',1,0)
df_test = df_test.drop(columns=['currency'])
df_test = pd.get_dummies(df_test, columns = ['country','category'])

#df_test = pd.get_dummies(df_test, columns = ['country','currency','category'])

# Testing
# reindex selects the training features in training order. A dummy column that
# never occurs in the grading file (e.g. an absent category) is created as all
# zeros instead of raising KeyError.
X_grade = df_test.reindex(columns=chosen_features, fill_value=0)
#.loc[:,df.columns != 'state']
y_grade = df_test["state"]

# Reuse the scaler fitted on the training features. Refitting a new scaler on
# the grading data would put it on a different scale from the one the models
# were trained on.
X_grade = scaler.transform(X_grade)


### PCA
# NOTE(review): the PCA is fitted but never applied (the transform below is commented out)
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(X_std)
# X_grade = pca.transform(X_grade)

In [None]:
# Score every fitted model on the grading dataset.
# Each value is the [accuracy, precision, recall, f1] list returned by model_metrics.
test_performance = {
    'Logistic': model_metrics(model_logit,X_grade,y_grade),
    'KNN': model_metrics(model_knn,X_grade,y_grade),

    'DecisionTree': model_metrics(model_dt,X_grade,y_grade),
    'RandomForest': model_metrics(model_rf,X_grade,y_grade),
    'GradientBoosting': model_metrics(model_gbt,X_grade,y_grade),

    'ANN': model_metrics(model_mlp,X_grade,y_grade),
    'SVM': model_metrics(model_svm,X_grade,y_grade)
}

# One row per model; the row and column labels had typos ('Logitstic', 'percision')
pd.DataFrame.from_dict(test_performance, orient='index',columns = ['accuracy','precision','recall','f1_score'])