In [1]:
# Import Dependencies
%matplotlib inline

# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
# 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid' in matplotlib 3.6 and
# the old name was removed in 3.8; use whichever name this installation provides.
plt.style.use('seaborn-whitegrid' if 'seaborn-whitegrid' in plt.style.available
              else 'seaborn-v0_8-whitegrid')

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, label_binarize

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier


In [2]:
# Load the entire dataset from the CSV export into a DataFrame
df = pd.read_csv('data/ks-projects-201801.csv')

# Preview the first five rows
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0


In [3]:
# Check the dtype and non-null count of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378661 entries, 0 to 378660
Data columns (total 15 columns):
ID                  378661 non-null int64
name                378657 non-null object
category            378661 non-null object
main_category       378661 non-null object
currency            378661 non-null object
deadline            378661 non-null object
goal                378661 non-null float64
launched            378661 non-null object
pledged             378661 non-null float64
state               378661 non-null object
backers             378661 non-null int64
country             378661 non-null object
usd pledged         374864 non-null float64
usd_pledged_real    378661 non-null float64
usd_goal_real       378661 non-null float64
dtypes: float64(5), int64(2), object(8)
memory usage: 43.3+ MB


In [4]:
# Count the missing values in each column
df.isnull().sum()

ID                     0
name                   4
category               0
main_category          0
currency               0
deadline               0
goal                   0
launched               0
pledged                0
state                  0
backers                0
country                0
usd pledged         3797
usd_pledged_real       0
usd_goal_real          0
dtype: int64

In [5]:
# Count the projects per value of `state`
df['state'].value_counts()

failed        197719
successful    133956
canceled       38779
undefined       3562
live            2799
suspended       1846
Name: state, dtype: int64

## Data Cleaning and EDA

In [6]:
# Parse the `launched` strings into datetime values.
# `infer_datetime_format` is deprecated in pandas 2.x (strict format inference is
# the default there), so the argument is omitted.
df['launched_timestamp'] = pd.to_datetime(df['launched'])
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03


In [7]:
# Parse the `deadline` strings into datetime values.
# `infer_datetime_format` is deprecated in pandas 2.x (strict format inference is
# the default there), so the argument is omitted.
df['deadline_timestamp'] = pd.to_datetime(df['deadline'])
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp,deadline_timestamp
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28,2015-10-09
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57,2017-11-01
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50,2013-02-26
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11,2012-04-16
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03,2015-08-29


In [8]:
# Duration of each project in whole days (deadline minus launch).
# The vectorised `.dt.days` accessor replaces a Python-level loop over every
# timedelta and keeps the result aligned to df's own index instead of building
# a separate Series with a fresh 0..n-1 index.
df['duration_days'] = (df['deadline_timestamp'] - df['launched_timestamp']).dt.days
df.head()

Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,usd_pledged_real,usd_goal_real,launched_timestamp,deadline_timestamp,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09,1000.0,2015-08-11 12:12:28,0.0,failed,0,GB,0.0,0.0,1533.95,2015-08-11 12:12:28,2015-10-09,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2017-11-01,30000.0,2017-09-02 04:43:57,2421.0,failed,15,US,100.0,2421.0,30000.0,2017-09-02 04:43:57,2017-11-01,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26,45000.0,2013-01-12 00:20:50,220.0,failed,3,US,220.0,220.0,45000.0,2013-01-12 00:20:50,2013-02-26,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16,5000.0,2012-03-17 03:24:11,1.0,failed,1,US,1.0,1.0,5000.0,2012-03-17 03:24:11,2012-04-16,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29,19500.0,2015-07-04 08:35:03,1283.0,canceled,14,US,1283.0,1283.0,19500.0,2015-07-04 08:35:03,2015-08-29,55


In [9]:
# List every column currently in the frame
df.columns.tolist()

['ID',
 'name',
 'category',
 'main_category',
 'currency',
 'deadline',
 'goal',
 'launched',
 'pledged',
 'state',
 'backers',
 'country',
 'usd pledged',
 'usd_pledged_real',
 'usd_goal_real',
 'launched_timestamp',
 'deadline_timestamp',
 'duration_days']

In [10]:
# Drop columns that are no longer needed:
#   - 'goal' ('usd_goal_real' is kept; presumably the USD-normalised goal — confirm)
#   - 'deadline' / 'launched' and their parsed timestamps, now that duration_days
#     has been derived from them
#   - 'usd pledged', because its conversion has errors (per the original author's note)
# NOTE: 'ID' is not in the drop list, so it is still present after this cell.
df.drop(['goal','deadline', 'launched', 'usd pledged', 'launched_timestamp', 'deadline_timestamp'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID,name,category,main_category,currency,pledged,state,backers,country,usd_pledged_real,usd_goal_real,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,0.0,failed,0,GB,0.0,1533.95,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,USD,2421.0,failed,15,US,2421.0,30000.0,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,220.0,failed,3,US,220.0,45000.0,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,1.0,failed,1,US,1.0,5000.0,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,1283.0,canceled,14,US,1283.0,19500.0,55


In [11]:
# Drop every row that still has a missing value (in place)
df.dropna(inplace=True)
# Re-check the null count per column; all should now be zero
df.isnull().sum()

ID                  0
name                0
category            0
main_category       0
currency            0
pledged             0
state               0
backers             0
country             0
usd_pledged_real    0
usd_goal_real       0
duration_days       0
dtype: int64

In [12]:
#df.drop(['name'], axis=1, inplace=True)
#df.head()

In [13]:
# Drop the pledged amounts and the currency code.
# NOTE(review): presumably the pledged columns are removed because they are only
# known once a campaign is over — confirm. `backers` is kept as a feature even
# though it looks like the same kind of after-the-fact information; verify this
# is intended, as it may inflate the reported accuracy.
df.drop(['pledged', 'currency', 'usd_pledged_real'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID,name,category,main_category,state,backers,country,usd_goal_real,duration_days
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,failed,0,GB,1533.95,58
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,failed,15,US,30000.0,59
2,1000004038,Where is Hank?,Narrative Film,Film & Video,failed,3,US,45000.0,44
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,failed,1,US,5000.0,29
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,canceled,14,US,19500.0,55


In [14]:
# Binary target: 1 when the project state is 'successful', 0 for every other state.
# NOTE(review): 'canceled', 'undefined', 'live' and 'suspended' rows are therefore
# labelled 0 together with 'failed' — confirm this is intended.
is_successful = df['state'] == 'successful'
df['success'] = is_successful.astype(int)
df.head()

Unnamed: 0,ID,name,category,main_category,state,backers,country,usd_goal_real,duration_days,success
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,failed,0,GB,1533.95,58,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,failed,15,US,30000.0,59,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,failed,3,US,45000.0,44,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,failed,1,US,5000.0,29,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,canceled,14,US,19500.0,55,0


In [15]:
# The information in `state` is now captured by the binary `success` column, so remove it
df.drop(['state'], axis=1, inplace=True)
df.head()

Unnamed: 0,ID,name,category,main_category,backers,country,usd_goal_real,duration_days,success
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,0,GB,1533.95,58,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,Narrative Film,Film & Video,15,US,30000.0,59,0
2,1000004038,Where is Hank?,Narrative Film,Film & Video,3,US,45000.0,44,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,1,US,5000.0,29,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,14,US,19500.0,55,0


In [16]:
# One-hot encode each categorical column into its own indicator frame; every
# indicator column is named "<prefix>_<value>".
# NOTE(review): the encoded output contains a `country_N,0"` column, which suggests
# a malformed country value in the raw data — confirm and clean if so.
(df_category_onehot,
 df_main_category_onehot,
 df_country_onehot) = (
    pd.get_dummies(df[source_column], prefix=column_prefix)
    for source_column, column_prefix in [('category', 'category'),
                                         ('main_category', 'main_cat'),
                                         ('country', 'country')]
)


In [17]:
# Append the indicator columns to the cleaned frame (column-wise concatenation)
encoded_parts = [df, df_category_onehot, df_main_category_onehot, df_country_onehot]
df_encoded = pd.concat(encoded_parts, axis=1)

# (rows, columns) after encoding
df_encoded.shape

(378657, 206)

In [18]:
# Keep the identifier, project name and label in a separate frame; ID and name are
# removed from df_encoded in the next step
df_orig = df_encoded[['ID', 'name', 'success']]
df_orig.head()

Unnamed: 0,ID,name,success
0,1000002330,The Songs of Adelaide & Abullah,0
1,1000003930,Greeting From Earth: ZGAC Arts Capsule For ET,0
2,1000004038,Where is Hank?,0
3,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,0
4,1000011046,Community Film Project: The Art of Neighborhoo...,0


In [19]:
# Remove the identifier/text columns and the original categorical columns, which
# are now represented by their one-hot indicator columns
df_encoded.drop(['ID', 'name', 'category', 'main_category', 'country'], axis=1, inplace=True)
df_encoded.head()

Unnamed: 0,backers,usd_goal_real,duration_days,success,category_3D Printing,category_Academic,category_Accessories,category_Action,category_Animals,category_Animation,...,country_JP,country_LU,country_MX,"country_N,0""",country_NL,country_NO,country_NZ,country_SE,country_SG,country_US
0,0,1533.95,58,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15,30000.0,59,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,3,45000.0,44,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,5000.0,29,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,14,19500.0,55,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


## Machine Learning

In [23]:
# Alias only (no copy is made): df_cleaned and df_encoded refer to the same DataFrame
df_cleaned = df_encoded

In [24]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
train_set, test_set = train_test_split(df_cleaned, test_size = .2, random_state=42)

In [25]:
# Separate the training split into the label vector and the feature matrix
y_train = train_set['success']                # labels
X_train = train_set.drop('success', axis=1)   # data: every remaining column

In [26]:
# (rows, feature columns) of the training matrix
X_train.shape

(302925, 200)

In [27]:
# Number of training labels; should match the row count of X_train
y_train.shape

(302925,)

In [28]:
def fit_algo(algo, X_train, y_train, cv):
    """Fit an estimator and report its training and cross-validated accuracy.

    `algo.fit` is called on the full training data, so the object passed in is
    fitted when this function returns and can be reused by the caller.

    Parameters
    ----------
    algo : estimator exposing `fit` and `score` (assumed scikit-learn compatible).
    X_train : training features.
    y_train : training labels.
    cv : cross-validation setting forwarded to `cross_val_predict`
        (the notebook passes the number of folds).

    Returns
    -------
    tuple
        (train_pred, acc, acc_cv): the predictions returned by
        `cross_val_predict`, the fitted model's `score` on the training data,
        and the accuracy of the cross-validated predictions. Both scores are
        percentages rounded to two decimals.
    """
    # Score measured on the very data the model was fitted on.
    fitted = algo.fit(X_train, y_train)
    train_score_pct = round(fitted.score(X_train, y_train) * 100, 2)

    # Cross-validated predictions for the same estimator and data, using all cores.
    cv_options = {'cv': cv, 'n_jobs': -1}
    cv_pred = model_selection.cross_val_predict(algo, X_train, y_train, **cv_options)
    cv_score_pct = round(metrics.accuracy_score(y_train, cv_pred) * 100, 2)

    return cv_pred, train_score_pct, cv_score_pct

In [29]:
# Logistic regression classifier with the library's default hyperparameters
lr_model = LogisticRegression()

In [30]:
# Logistic Regression: fit, score and time the whole procedure.
# perf_counter() is a monotonic, high-resolution clock, so the measured duration
# cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()
train_pred_log, acc_log, acc_cv_log = fit_algo(lr_model, X_train, y_train, 10)
log_time = time.perf_counter() - start_time


print("Accuracy: %s" % acc_log)  # score on the training data itself
print("Accuracy CV 10-Fold: %s" % acc_cv_log)
print("Running Time: %s" % datetime.timedelta(seconds=log_time))

Accuracy: 89.22
Accuracy CV 10-Fold: 89.21
Running Time: 0:00:22.221403


In [31]:
# Summary table of training-set accuracy per model (a single entry for now)
models = pd.DataFrame({
    'Model_Name': ['Logistic Regression'],
    'Reg_Accuracy_Score': [
        acc_log,
    ]})
# Banner text corrected: it previously read "Reuglar"
print("---Regular Accuracy Scores---")
models.sort_values(by='Reg_Accuracy_Score', ascending=False)

---Reuglar Accuracy Scores---


Unnamed: 0,Model_Name,Reg_Accuracy_Score
0,Logistic Regression,89.22


In [32]:
# Summary table of cross-validated accuracy per model (a single entry for now)
cv_model_names = ['Logistic Regression']
cv_model_scores = [acc_cv_log]
cv_models = pd.DataFrame({'Model_Name': cv_model_names,
                          'CV_Accuracy_Score': cv_model_scores})
print('---Cross-validation Accuracy Scores---')
cv_models.sort_values(by='CV_Accuracy_Score', ascending=False)

---Cross-validation Accuracy Scores---


Unnamed: 0,Model_Name,CV_Accuracy_Score
0,Logistic Regression,89.21


In [37]:
# Separate the held-out split into the label vector and the feature matrix
y_test = test_set['success']                # labels
X_test = test_set.drop('success', axis=1)   # data: every remaining column

In [38]:
# (rows, feature columns) of the test matrix
X_test.shape

(75732, 200)

In [39]:
# Number of test labels; should match the row count of X_test
y_test.shape

(75732,)

In [40]:
# Predict on the held-out test set. lr_model is already fitted at this point: the
# earlier fit_algo(lr_model, ...) call fits the estimator it is given.
predictions = lr_model.predict(X_test)
# Show a short preview rather than rendering the whole prediction/actual frame
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).head(10)

Unnamed: 0,Prediction,Actual
266682,0,0
179492,1,1
179755,1,1
195374,0,0
161413,1,1
281802,0,0
357031,1,1
66123,0,0
212120,0,0
235566,1,1


In [42]:
# Accuracy on the held-out test set, as a percentage rounded to two decimals
test_accuracy = round(metrics.accuracy_score(y_test, predictions) * 100, 2)
test_accuracy

89.36

In [43]:
# Per-class precision, recall and F1 on the test set
print(metrics.classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.88      0.96      0.92     48790
          1       0.92      0.77      0.84     26942

avg / total       0.90      0.89      0.89     75732

