In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Visualization
import seaborn as sns 
import re
import matplotlib.pyplot as plt
# Datetime
from datetime import datetime
# Sklearn import
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import preprocessing
# Text processing
from textblob import TextBlob
import string 

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

In [None]:
train_data = pd.read_csv("../input/train.csv")

In [None]:
# Overview
train_data.head()

In [None]:
train_data.info()

In [None]:
# Report the number of missing values in every column (one "<column> <count>" line each)
missing_per_column = train_data.isnull().sum()
for column_name, n_missing in missing_per_column.items():
    print(column_name, n_missing)

In [None]:
# Replace missing text fields with a single space.
# Series.fillna returns a new Series rather than modifying the column, so the
# result has to be assigned back for the fill to take effect.
train_data['name'] = train_data['name'].fillna(" ")
train_data['desc'] = train_data['desc'].fillna(" ")

In [None]:
# Convert UNIX epoch seconds to "YYYY-MM-DD HH:MM:SS" strings, always in UTC.
# datetime.fromtimestamp() uses the machine's local timezone, which makes the
# derived month/year/duration features depend on where the notebook runs and can
# shift durations by an hour when launch and deadline straddle a DST change.
date_column = ['deadline', 'state_changed_at', 'created_at', 'launched_at']
for col in date_column:
    # Only convert columns that still hold numeric timestamps, so re-running
    # this cell on already-converted (string) columns is a harmless no-op.
    if pd.api.types.is_numeric_dtype(train_data[col]):
        train_data[col] = pd.to_datetime(train_data[col], unit='s').dt.strftime("%Y-%m-%d %H:%M:%S")

**Preliminary exploration**
- How many projects got funded successfully?
- How much is the goal? Can we/should we decompose the goal into bins or use numerical value?
- Are there any correlations between the currency and the status of funded projects? (ie. Is USD more favorable?)
- Are there any correlations between the country and the status of funded projects? (ie. Is a project in the US more likely to be funded?)
- Distribution of funded/not funded projects over years/months?


In [None]:
# Distribution of funded projects
sns.countplot(x='final_status',data=train_data)
plt.show()

Most goals are less than 20,000,000. There are some outliers which have distorted the goal feature. These outliers might need to be removed. After removing outliers, a few observations:
1. Most projects have very small goal
2. Most funded projects have goal less than 20000

In [None]:
# Distribution of goals
sns.distplot(train_data['goal'], bins=5)
plt.show()

In [None]:
train_data['goal'].describe()

In [None]:
# Remove some of the outliers and replot the histograms.
# P holds the 0th and 95th percentiles of goal. Both comparisons below are strict,
# so rows equal to either bound (including the minimum goal, which is the 0th
# percentile) are dropped along with everything above the 95th percentile.
P = np.percentile(train_data['goal'], [0, 95])
new_goal = train_data[(train_data['goal'] > P[0]) & (train_data['goal'] < P[1])]

In [None]:
sns.distplot(new_goal['goal'], bins=5)
plt.show()

In [None]:
# Log-transform goal without excluding outliers 
sns.distplot(np.log(train_data['goal']), bins=5)
plt.show()

In [None]:
g = sns.FacetGrid(new_goal, col='final_status')
g.map(plt.hist, 'goal', bins = 40)
plt.show()

It does look like most of the projects that got fully funded are the ones asking for less than 20,000. Next, we will see if the goal feature has any multivariate correlation with country or currency.

In [None]:
g = sns.FacetGrid(new_goal, col="final_status",  row="country")
g = g.map(plt.hist, "goal", bins = 40)
plt.show()

In [None]:
# Store the natural log of goal as a new 'log_goal' column, then plot its
# histogram in a grid with one row per country and one column per final_status.
train_data['log_goal'] = np.log(train_data['goal'])
g = sns.FacetGrid(train_data, col="final_status",  row="country")
g = g.map(plt.hist, "log_goal", bins = 40)

It doesn't look like **disable_communication** is a good feature to include, if at all. Most of the projects in this dataset are communication-disabled. However, a chi-square test needs to be done. It might be because of communication that a project is not funded.

In [None]:
# Explore the effect of disable_communication
figure, axes = plt.subplots(1, 2, sharey=True)
sns.countplot(x='disable_communication',data=train_data, hue='final_status', ax = axes[0])
sns.countplot(x='final_status', data= train_data, ax = axes[1])
plt.show()

In [None]:
train_data['disable_communication'].describe()

It does look like the country and currency of a project impact whether or not it got funded. Since they are nominal data, we will need to convert them into a one-hot encoding.

In [None]:
figure, axes = plt.subplots(2)
sns.countplot(x='country',data=train_data, hue='final_status', ax = axes[0])
sns.countplot(x='currency',data=train_data, hue='final_status', ax = axes[1])
plt.show()

In [None]:
figure, axes = plt.subplots(2)
sns.countplot(x='country',data=train_data, ax = axes[0])
sns.countplot(x='currency',data=train_data, ax = axes[1])
plt.show()

From these plots and from the above plots of goal conditioned on currency, we can safely remove SEK, NOK, SKK because they don't add to the prediction. Removing these data points also makes one-hot encoding cleaner.

Next, we investigate the **number of backers** and its relationship to project status. 

In [None]:
# Understand the distribution of backers using box-plot
ax = sns.boxplot(x=train_data["backers_count"])


In [None]:
# Remove some of the outliers and replot the histograms.
# P_backer holds the 0th and 95th percentiles of backers_count. Both comparisons are
# strict, so rows equal to either bound (including the minimum backers_count, which
# is the 0th percentile) are excluded from new_backers.
P_backer = np.percentile(train_data['backers_count'], [0, 95])
new_backers = train_data[(train_data['backers_count'] > P_backer[0]) & (train_data['backers_count'] < P_backer[1])]

In [None]:
ax = sns.boxplot(x=new_backers["backers_count"])

In [None]:
new_backers.shape

Next, we investigate whether there is any correlation between the number of backers and the goals, countries, and finally the project status.

- It looks like the more backers the project has, the more likely it gets funded. 
- It looks like backers are linearly correlated with log(goal). 
- Since backers are very wide spread, we will need to standardize this feature.

In [None]:
# Number of projects at each backers_count value (outlier-trimmed subset),
# split by final_status.
sns.countplot(x='backers_count',data=new_backers, hue='final_status')
plt.xticks([],[])  # hide the x tick labels
plt.show()

In [None]:
g = sns.FacetGrid(new_backers, col="final_status",  row="country")
g = g.map(plt.hist, "backers_count", bins = 40)

In [None]:
plt.scatter(new_backers['backers_count'], np.log(new_backers['goal']), alpha = 0.3)
plt.xlabel('backers count')
plt.ylabel('log goal')
plt.show()

In [None]:
plt.scatter(new_backers[new_backers['final_status'] == 1]['backers_count'], np.log(new_backers[new_backers['final_status'] == 1]['goal']), alpha = 0.3)
plt.xlabel('backers count')
plt.ylabel('log goal')
plt.title("Funded project")
plt.show()

In [None]:
plt.scatter(new_backers[new_backers['final_status'] == 0]['backers_count'], np.log(new_backers[new_backers['final_status'] == 0]['goal']), alpha = 0.3)
plt.xlabel('backers count')
plt.ylabel('log goal')
plt.title("Not funded project")
plt.show()

Then we analyze the launch time and how different features play into this big picture (Thanks Maggie)

In [None]:
# with respect to launched time 
def countQuarter(dt):
    """Return the quarter label ('01'..'04') for a "YYYY-MM-DD ..." timestamp string.

    The month is read from characters 5-6 of ``dt``. Months up to 3 map to '01',
    up to 6 to '02', up to 9 to '03', and anything larger to '04'.
    """
    month_number = int(dt[5:7])
    # (last month of the quarter, label) pairs checked in ascending order
    for last_month, label in ((3, '01'), (6, '02'), (9, '03')):
        if month_number <= last_month:
            return label
    return '04'

# Derive month ("MM"), year ("YYYY") and quarter label from the launched_at string
train_data['launched_month'] = train_data['launched_at'].str[5:7]
train_data['launched_year'] = train_data['launched_at'].str[0:4]
train_data['launched_quarter'] = train_data['launched_at'].apply(countQuarter)

In [None]:
figure, axes = plt.subplots(3)
sns.countplot(x='launched_month',data=train_data, hue='final_status', ax = axes[0])
sns.countplot(x='launched_year',data=train_data, hue='final_status', ax = axes[1])
sns.countplot(x='launched_quarter',data=train_data, hue='final_status', ax = axes[2])
plt.tight_layout()
plt.show()

In [None]:
def measureDuration(dt): # Duration in hours
    """Return the whole number of hours between launch and deadline.

    Parameters
    ----------
    dt : sequence of str
        The launch timestamp followed by the deadline timestamp, both formatted
        as "%Y-%m-%d %H:%M:%S". May be a tuple, a list, or a pandas row (Series).

    Returns
    -------
    int
        Hours from launch to deadline, truncated toward zero.

    Raises
    ------
    ValueError
        If either timestamp does not match the expected format.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    # Read the two values by position. Indexing a label-indexed pandas row with
    # dt[0] / dt[1] relies on a positional fallback that pandas has deprecated;
    # materialising the values first works for rows, tuples and lists alike.
    values = list(dt)
    launch = datetime.strptime(values[0], timestamp_format)
    deadline = datetime.strptime(values[1], timestamp_format)
    elapsed = deadline - launch
    return int(elapsed.total_seconds() / 3600)

# Row-wise: hours between launched_at and deadline
launch_and_deadline = train_data[['launched_at', 'deadline']]
train_data['duration'] = launch_and_deadline.apply(measureDuration, axis=1)

In [None]:
sns.distplot(train_data['duration'], bins=5)
plt.show()

In [None]:
def measureDurationByWeek(dt):
    """Convert a duration in hours to a whole number of weeks.

    Parameters
    ----------
    dt : int or float
        Duration in hours.

    Returns
    -------
    int
        Number of complete weeks, truncated toward zero.
    """
    hours_per_week = 168  # 7 days * 24 hours
    return int(dt / hours_per_week)

# Bucket the hourly duration into whole weeks
train_data['duration_weeks'] = train_data['duration'].apply(measureDurationByWeek)

In [None]:
sns.countplot(x='duration_weeks', data=train_data, hue='final_status')
plt.show()

In [None]:
train_data.head()

In [None]:
train_data.info()

Now, I will attempt to train a logistic regression model on the following features:
- log-transformed goal ('log_goal')
- one-hot encoded countries (with all countries for now) ('country')
- one-hot encoded currency ('currency')
- backers-count (keep raw for now) ('backers_count')
- one-hot encoded launched years ('launched_year')
- duration weeks ('duration_weeks')

In [None]:
def getFeatures(x_features, y_feature, data=None):
    """Select the predictor columns and the target column from a DataFrame.

    Parameters
    ----------
    x_features : list of str
        Names of the predictor columns.
    y_feature : str
        Name of the target column.
    data : DataFrame, optional
        Frame to select from. When omitted, the notebook-level ``train_data``
        is used, which matches the previous behaviour.

    Returns
    -------
    (X, y)
        ``data[x_features]`` and ``data[y_feature]``.
    """
    if data is None:
        # Fall back to the global frame so existing two-argument calls keep working.
        data = train_data
    X = data[x_features]
    y = data[y_feature]
    return X, y

def splitData(X, y, size):
    """One-hot encode ``X`` and split it, together with ``y``, into train and test parts.

    ``X`` is passed through ``pd.get_dummies`` before splitting. ``size`` is
    forwarded as ``test_size`` and the split uses a fixed ``random_state`` of 42.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    encoded_features = pd.get_dummies(X)
    parts = train_test_split(encoded_features, y, test_size=size, random_state=42)
    train_features, test_features, train_target, test_target = parts
    return train_features, test_features, train_target, test_target
    
def makeLogisticRegression(X_train, y_train):
    """Fit a logistic regression (lbfgs solver, multinomial, random_state=0).

    Returns a pair: the estimator object and the value returned by its
    ``fit(X_train, y_train)`` call.

    NOTE(review): recent scikit-learn releases appear to deprecate the
    ``multi_class`` argument -- confirm against the installed version.
    """
    settings = dict(random_state=0, solver='lbfgs', multi_class='multinomial')
    model = LogisticRegression(**settings)
    fitted = model.fit(X_train, y_train)
    return model, fitted

def accuracy(clf, X_train, X_test, y_train, y_test):
    """Return ``(train_score, test_score)`` as reported by ``clf.score``.

    The training split is scored first, then the test split.
    """
    train_result = clf.score(X_train, y_train)
    test_result = clf.score(X_test, y_test)
    return train_result, test_result

In [None]:
x_features = ['log_goal','country','currency', 'backers_count', 'launched_year','duration_weeks']
y_feature = 'final_status'

In [None]:
X, y = getFeatures(x_features, y_feature)
X_train, X_test, y_train, y_test = splitData(X, y, 0.2)
lr, clf = makeLogisticRegression(X_train, y_train)
train_score, test_score = accuracy(clf, X_train, X_test, y_train, y_test)

In [None]:
train_score

In [None]:
test_score

In [None]:
x_features = ['log_goal','country','currency', 'backers_count', 'launched_month', 'launched_year','duration_weeks']
y_feature = 'final_status'

In [None]:
X, y = getFeatures(x_features, y_feature)
X_train, X_test, y_train, y_test = splitData(X, y, 0.2)
lr, clf = makeLogisticRegression(X_train, y_train)
train_score, test_score = accuracy(clf, X_train, X_test, y_train, y_test)

In [None]:
train_score

In [None]:
test_score

In [None]:
lr.coef_

Additional data to parse include:

* Length of project name
* Parse name in alphabetical order
* Parse keywords
* Check for currency appreciation / depreciation
* Holiday season
* Election / Voting season
* World Cup / Olympics
* NBA / MLB / Football season

In [None]:
# Length of Project Name
# Each name is passed through str() before measuring, so a non-string value is
# measured by its string form (e.g. a name that is still NaN here becomes 'nan',
# length 3).
train_data['name_length'] = train_data['name'].apply(lambda name: len(str(name)))

In [None]:
sns.countplot(x='name_length', data=train_data, hue='final_status')
plt.show()

In [None]:
P = np.percentile(train_data['name_length'], [5, 95])
parsed_name = train_data[(train_data['name_length'] > P[0]) & (train_data['name_length'] < P[1])]

sns.distplot(parsed_name['name_length'], bins=10)
plt.show()

In [None]:
# Project name in alphabetical order
def parseName(name):
    """Return the alphabetical bucket for a project name.

    Parameters
    ----------
    name : object
        Project name; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The lower-cased first character when it is an ASCII letter, otherwise
        '*'. Missing names (None / NaN) and empty names also map to '*'.
    """
    # A missing name must not be stringified: str(nan) is 'nan', which would
    # wrongly place it in the 'n' bucket.
    if name is None or (isinstance(name, float) and name != name):
        return '*'
    text = str(name)
    # Guard against empty names, where text[0] would raise IndexError.
    if not text:
        return '*'
    first_char = text[0]
    if first_char in string.ascii_letters:
        return first_char.lower()
    return '*'

train_data['alpha_order'] = train_data['name'].apply(lambda name: parseName(name))
sns.countplot(x='alpha_order', data=train_data, hue='final_status')
plt.show()

In [None]:
x_features = ['log_goal','country', 'currency', 'backers_count', 'launched_year', 'launched_month', 'duration_weeks', 'name_length', 'alpha_order']
y_feature = 'final_status'

In [None]:
X, y = getFeatures(x_features, y_feature)
X_train, X_test, y_train, y_test = splitData(X, y, 0.2)
lr, clf = makeLogisticRegression(X_train, y_train)
train_score, test_score = accuracy(clf, X_train, X_test, y_train, y_test)

In [None]:
train_score

In [None]:
test_score

In [None]:
# Keyword Search 

buzzwords = [
    'app', 'platform', 'technology', 'service', 'solution',
    'data', 'manage', 'market', 'help', 'mobile',
    'users', 'system', 'software', 'customer', 'application',
    'online', 'web', 'create', 'health', 'provider',
    'network', 'cloud', 'social', 'device', 'access',
]

def countBuzzwords(desc):
    """Return the total number of buzzword occurrences in a description.

    ``desc`` is converted with ``str()`` and lower-cased, then every entry of
    ``buzzwords`` is counted as a plain substring (so 'app' is also counted
    inside 'application'). The per-term counts are summed.
    """
    text = str(desc).lower()
    return sum(text.count(term) for term in buzzwords)
    
# Per-project total of buzzword substring hits in the description
train_data['buzzword_count'] = train_data['desc'].apply(countBuzzwords)

In [None]:
sns.countplot(x='buzzword_count', data=train_data, hue='final_status')
plt.show()

In [None]:
x_features = ['log_goal','country', 'currency', 'backers_count', 'launched_year', 'launched_month', 'duration_weeks', 'name_length', 'alpha_order', 'buzzword_count']
y_feature = 'final_status'

In [None]:
X, y = getFeatures(x_features, y_feature)
X_train, X_test, y_train, y_test = splitData(X, y, 0.2)
lr, clf = makeLogisticRegression(X_train, y_train)
train_score, test_score = accuracy(clf, X_train, X_test, y_train, y_test)

In [None]:
train_score

In [None]:
test_score

In [None]:
x_features = ['log_goal','country', 'currency', 'backers_count', 'launched_year', 'launched_month', 'duration_weeks', 'buzzword_count']
y_feature = 'final_status'

In [None]:
X, y = getFeatures(x_features, y_feature)
X_train, X_test, y_train, y_test = splitData(X, y, 0.2)
lr, clf = makeLogisticRegression(X_train, y_train)
train_score, test_score = accuracy(clf, X_train, X_test, y_train, y_test)

In [None]:
train_score

In [None]:
test_score