In [212]:
# Loading libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot

In [213]:
# Import the dataset
kickstarter = pd.read_excel('Kickstarter.xlsx')

In [214]:
# For the purpose of this project, we only need to include projects with "successful" or "failure" state
df = kickstarter[kickstarter['state'].isin(['successful','failed'])]

# Convert the target variable to binary
df['state'] = df['state'].replace(['successful','failed'],[1,0])

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['state'] = df['state'].replace(['successful','failed'],[1,0])


Unnamed: 0,id,name,goal,pledged,state,disable_communication,country,currency,deadline,state_changed_at,...,created_at_day,created_at_yr,created_at_hr,launched_at_month,launched_at_day,launched_at_yr,launched_at_hr,create_to_launch_days,launch_to_deadline_days,launch_to_state_change_days
0,1601563193,Our future,100000000.0,1.0,0,False,AU,AUD,2014-10-07 12:05:39,2014-10-07 12:05:39,...,1,2014,21,8,8,2014,12,6,60,60
1,880009511,Elite: Dangerous,1250000.0,1578316.08,1,False,GB,GBP,2013-01-04 18:00:57,2013-01-04 18:00:57,...,31,2012,11,11,5,2012,18,5,60,60
4,557230947,"Bring Reading Rainbow Back for Every Child, Ev...",1000000.0,5408916.95,1,False,US,USD,2014-07-02 14:00:00,2014-07-02 14:00:11,...,22,2014,22,5,28,2014,8,35,35,35
5,1966069095,ARKYD: A Space Telescope for Everyone,1000000.0,1505366.6,1,False,US,USD,2013-06-30 21:00:00,2013-06-30 21:00:34,...,10,2012,17,5,29,2013,10,322,32,32
6,2083255961,A Billion Pixels...,100000000.0,56.0,0,False,US,USD,2014-08-04 16:39:34,2014-08-04 16:39:34,...,1,2014,16,6,5,2014,16,34,60,60


# 1. Data Preprocessing

In [None]:
# Check the number of rows and columns 
df.shape

### 1.1 Detect Duplicated Records

In [215]:
# Check if there is any duplicated records
df = df.drop_duplicates()

### 1.2 Detect Missing Values

In [None]:
# Check if there is any missing values
missing_values = np.where(pd.isnull(df))

In [None]:
# Identify the columns that contain missing values
df.columns[list(set(np.where(pd.isnull(df))[1]))]

In [None]:
# Count the percentage of missing values
len(missing_values[0]) / df.shape[0]

The predictors might be important for our future prediction.
And the number of records that contains missing values isn't significant in this dataset.
Hence we drop the missing values.

In [216]:
df = df.dropna()
df.shape

(12180, 45)

### 1.3 Drop out-of-scope predictors

According to the project instruction, we can only use the predictors "that are available at the moment when a new project is launched."
Therefore, we do not need any predictors regarding 'states' of the project.

In [217]:
df = df.drop(columns = ['state_changed_at','state_changed_at_weekday','state_changed_at_month', 'state_changed_at_day', 'state_changed_at_yr','state_changed_at_hr','launch_to_state_change_days'])

The information about pledged, staff_pick, backers_count, spotlight wont be available at the moment when the project is launched. Therefore, we remove them as well.

In [None]:
df = df.drop(columns = ['pledged','usd_pledged','staff_pick','backers_count','spotlight'])

### 1.4 Identify unique identifiers

In [None]:
# Number of unique values
df.nunique()

In [None]:
# Note that disable_communication only have one unique value
# So it won't be useful for our prediction
df = df.drop(columns = ['disable_communication'])

# we can drop [id, name, deadline, created_at, launched_at] as they are almost a unique identifier
df= df.drop(columns = ['id','name','deadline', 'created_at', 'launched_at'])

### 1.5 Detect collinearity between variables

In [None]:
# check if there is any collinearity between variables
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(26, 6))
heatmap = sns.heatmap(df.corr(), vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':18}, pad=12);
plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')

According to the correlation heatmap, following varaibles have high correlation:
- name_len and name_len_clean, 
- blurb_len and blurb_len_clean,
- deadline_yr and created_at_yr and launched_at_yr

For each pair, we only need to keep one of them.

In [None]:
df = df.drop(columns = ['name_len_clean','blurb_len_clean','created_at_yr', 'launched_at_yr'])

### 1.6 Handle Categorical Variables

In [None]:
# Check variable types
df.dtypes

In [None]:
# For 'weekday' variables. convert them into numerical variable from 1-7
cols = ['deadline_weekday','created_at_weekday','launched_at_weekday']
df[cols] = df[cols].replace(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],[1,2,3,4,5,6,7])

Before dummifying the categorical variables, first see how many values each category has.

In [None]:
# dummify the other categorical variables
df = pd.get_dummies(df, columns = ['country','currency','category'])

## 2. Feature Selection

In [None]:
X = df.loc[:,df.columns != 'state']
y = df['state']

### 2.1 Using LASSO

In [None]:
## Standardize the predictors
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_std_lasso = scaler.fit_transform(X)

## Run LASSO
from sklearn.linear_model import Lasso
lasso = Lasso(alpha=0.01)
model_lasso = lasso.fit(X_std_lasso,y)

model_lasso.coef_

test = pd.DataFrame(list(zip(X.columns,model_lasso.coef_)), columns = ['predictor','coefficient'])
test[abs(test['coefficient']) >= 0.01].sort_values(by = 'coefficient')

### 2.2 Using Random Forest

In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=5)

# Build the model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0)
model_rf = rf.fit(X_train, y_train)

# Print feature importance
pd.Series(model_rf.feature_importances_, index = X.columns).sort_values(ascending = False).plot(kind = 'bar',figsize = (14,6))

In [None]:
pd.Series(model_rf.feature_importances_, index = X.columns).sort_values(ascending = False)
model_rf.feature_importances_[model_rf.feature_importances_ > 0.01]

# 3. Classification Models

In [None]:
# Using predictors found by Random Forest
#X = df[['goal','create_to_launch_days','name_len','launch_to_deadline_days','launched_at_hr',
# 'launched_at_day','created_at_day','created_at_hr','deadline_day','blurb_len',
# 'category_Web','created_at_month','deadline_month', 'launched_at_month','created_at_weekday',
# 'launched_at_weekday','launched_at_weekday','deadline_yr','category_Software','static_usd_rate',
# 'category_Plays','category_Festivals']]

# Predictors found by LASSO
X = df[['category_Web','category_Software','category_Plays','name_len','launch_to_deadline_days','deadline_yr',
 'category_Festivals','category_Musical','category_Shorts','category_Experimental','category_Places',
 'category_Immersive', 'launched_at_hr']]

# combine the top selections?
#X = df[['goal','create_to_launch_days','name_len','launch_to_deadline_days','launched_at_hr',
#    'launched_at_day','created_at_day','created_at_hr','deadline_day','blurb_len',
#    'category_Web','category_Software','category_Plays','name_len','launch_to_deadline_days','deadline_yr',
#    'category_Festivals','category_Musical','category_Shorts','category_Experimental','category_Places']] 

# X = df[['name_len','launch_to_deadline_days','category_Web','deadline_yr','category_Software','category_Plays','category_Festivals']]

y = df["state"]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size = 0.30, random_state = 5)

In [None]:
### PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)

pca.fit(X_std)
X_new = pca.transform(X_std)

X_train, X_test, y_train, y_test = train_test_split(X_new,y,test_size=0.30, random_state=5)

### 3.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter = 500)
model_logit = lr.fit(X_train, y_train)

### 3.2 K-Nearest Neighbors

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

accuracy = 0
bestK = 0
for i in range (1,21):
    knn = KNeighborsClassifier(n_neighbors=i)
    model = knn.fit(X_train,y_train)
    y_test_pred = model.predict(X_test)
    if accuracy_score(y_test, y_test_pred) > accuracy:
        accuracy = accuracy_score(y_test, y_test_pred)
        bestK = i
    #print("k = ", i, "accuracy =", accuracy_score(y_test, y_test_pred))

knn = KNeighborsClassifier(n_neighbors=bestK) #,  weights = 'distance')
model_knn = knn.fit(X_train, y_train)

### 3.3 CART

In [None]:
from sklearn.tree import DecisionTreeClassifier
decisiontree = DecisionTreeClassifier(max_depth=10) # default is to grow a full tree
                                    # avoid overfitting
model_dt = decisiontree.fit(X_train,y_train)

### 3.4 Random Forest

In [None]:
# Build the model
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state = 0, oob_score=True)
model_rf = rf.fit(X_train, y_train)

model_rf.oob_score_

### 3.5 Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbt = GradientBoostingClassifier(random_state = 0)
model_gbt = gbt.fit(X_train, y_train)

### 3.6 Artificial Neural Network

In [None]:
# Find the optimal size of hidden layer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
for i in range(1,21): 
    model = MLPClassifier(hidden_layer_sizes=(i), max_iter = 1000, random_state=0)
    scores = cross_val_score(model, X=X_std, y=y, cv=10)
    print(i, ":", np.average(scores))

In [None]:
from sklearn.neural_network import MLPClassifier
model_ann = MLPClassifier(hidden_layer_sizes=(14), random_state=0)
model_mlp = model_ann.fit(X_train,y_train)

In [None]:
## Find the best hyper-parameter
from sklearn.model_selection import GridSearchCV
mlp = MLPClassifier(max_iter=5000, random_state=0)

parameter = {'hidden_layer_sizes': range(1,22)}
grid_search = GridSearchCV(estimator = mlp, param_grid= parameter, 
                           scoring = "accuracy", verbose=True)
model_mlp = grid_search.fit(X_std, y)

### 3.7 Support Vector Machine

In [None]:
# Find the optimal gamma
from sklearn.svm import SVC
for i in range(1,11):
    svm_rbf = SVC(kernel = "rbf", random_state=0, C =0.5, gamma = i)
    model_rbf = svm_rbf.fit(X_train,y_train)
    scores = cross_val_score(model_rbf,X=X_test, y=y_test, cv=10)
    print("gamma = ",i,", score = ", sum(scores)/len(scores))

In [None]:
# Build the SVM model using a linear model
from sklearn.svm import SVC
svm = SVC(kernel="linear", random_state=0, C=0.5, gamma = 3)
model_svm = svm.fit(X_train, y_train)

### Model Performance Comparison

In [None]:
from sklearn import metrics

def model_metrics(model,X,y):
    y_pred = model.predict(X)

    accuracy = metrics.accuracy_score(y, y_pred)
    precision = metrics.precision_score(y, y_pred)
    recall = metrics.recall_score(y, y_pred)
    f1_score = metrics.f1_score(y, y_pred)

    model_metrics = [accuracy, precision, recall, f1_score]
    return model_metrics

In [None]:
model_performance = {
    'Logitstic': model_metrics(model_logit,X_test,y_test),
    'KNN': model_metrics(model_knn,X_test,y_test),

    'DecisionTree': model_metrics(model_dt,X_test,y_test),
    'RandomForest': model_metrics(model_rf,X_test,y_test),
    'GradientBoosting': model_metrics(model_gbt,X_test,y_test),

    'ANN': model_metrics(model_ann,X_test,y_test),
    'SVM': model_metrics(model_svm,X_test,y_test)    
}

pd.DataFrame.from_dict(model_performance, orient='index',columns = ['accuracy','percision','recall','f1_score'])

### Lazy Classifier

In [None]:
import lazypredict
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)

print(models)

# 4. Evaluation on Grading Dataset

In [None]:
# Loading libraries
import pandas as pd
import numpy as np

# Import the dataset
grading = pd.read_excel('Kickstarter-Grading-Sample.xlsx')

# For the purpose of this project, we only need to include projects with "successful" or "failure" state
df_test = grading[grading['state'].isin(['successful','failed'])]

# Data Preprocessing
df_test = df_test.dropna()

### Drop out-of-scope predictors
df_test = df_test.drop(columns = ['state_changed_at','state_changed_at_weekday','state_changed_at_month', 
'state_changed_at_day', 'state_changed_at_yr','state_changed_at_hr','launch_to_state_change_days',
'pledged','staff_pick','backers_count','spotlight','disable_communication',
'id','name','deadline', 'created_at', 'launched_at','usd_pledged','name_len_clean','created_at_yr', 'launched_at_yr'])

### Handle Categorical Variables
cols = ['deadline_weekday','created_at_weekday','launched_at_weekday']
df_test[cols] = df_test[cols].replace(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'],[1,2,3,4,5,6,7])

# dummify the other categorical variables
df_test = pd.get_dummies(df_test, columns = ['country','currency','category'])
# or just drop them
#df = df.drop(columns = ['country','currency','category'])

df_test['state'] = df_test['state'].replace(['successful','failed'],[1,0])


# Testing
#X_grade = df_test[['goal','create_to_launch_days','name_len','launch_to_deadline_days',
#'category_Web','category_Software','category_Plays','category_Festivals']]

#X_grade = df_test[['goal','create_to_launch_days','name_len','launch_to_deadline_days',
#'launched_at_hr','launched_at_day','created_at_day','created_at_hr','deadline_day','blurb_len',
# 'category_Web','created_at_month','deadline_month','created_at_weekday','launched_at_weekday','deadline_yr',
# 'category_Software','static_usd_rate','category_Plays','category_Festivals','category_Hardware','category_Musical']]

# X_grade = df_test[['category_Web','category_Software','category_Plays','name_len','launch_to_deadline_days','deadline_yr']] #,'category_Festivals','category_Musical']]

X_grade = df_test[['category_Web','category_Software','category_Plays','name_len','launch_to_deadline_days','deadline_yr',
 'category_Festivals','category_Musical','category_Shorts','category_Experimental','category_Places',
 'category_Immersive', 'launched_at_hr']]
y_grade = df_test["state"]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_grade = scaler.fit_transform(X_grade)


### PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(X_std)
#X_grade = pca.transform(X_grade)

In [None]:
test_performance = {
    'Logitstic': model_metrics(model_logit,X_grade,y_grade),
    'KNN': model_metrics(model_knn,X_grade,y_grade),

    'DecisionTree': model_metrics(model_dt,X_grade,y_grade),
    'RandomForest': model_metrics(model_rf,X_grade,y_grade),
    'GradientBoosting': model_metrics(model_gbt,X_grade,y_grade),

    'ANN': model_metrics(model_ann,X_grade,y_grade),
    'SVM': model_metrics(model_svm,X_grade,y_grade)
}

pd.DataFrame.from_dict(test_performance, orient='index',columns = ['accuracy','percision','recall','f1_score'])