In [2]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Path of the cleaned dataset CSV read by this notebook.
# Set the KICKSTARTER_CLEAN_DATA environment variable to point at another copy;
# the original local path stays as the fallback so existing runs are unchanged.
data_path = os.environ.get(
    "KICKSTARTER_CLEAN_DATA",
    "C:/Users/saima/Desktop/kickstarter/CrowdFunding/Data/dataset/2-clean-data.csv",
)

In [3]:
# Read the cleaned CSV, then parse the three timestamp columns into datetimes.
df_raw = pd.read_csv(data_path)
for date_col in ("launched_at", "deadline", "state_changed_at"):
    df_raw[date_col] = pd.to_datetime(df_raw[date_col])

In [4]:
# Build the working frame from the raw one: rows containing any NaN are
# removed, and both the old index and the 'Unnamed: 0' column are discarded.
df = (
    df_raw
    .dropna()
    .reset_index()
    .drop(columns=['index', 'Unnamed: 0'])
)

### Blurb

*Description*

Let us use the description (blurb) as a feature:
- We can add the length of the text
- We can perform some basic text processing (tokenization -> lemmatization -> TF-IDF) and include the result as a feature

In [5]:
def get_blurb_length(x):
    """Return ``len(x)``, or NaN when ``x`` has no length.

    Parameters
    ----------
    x : object
        Usually the blurb string; values without ``__len__`` (e.g. ``None``
        or a float NaN) are tolerated.

    Returns
    -------
    int or float
        The length of ``x``, or ``np.nan`` if ``len(x)`` raises ``TypeError``.
    """
    try:
        return len(x)
    except TypeError:
        # Only "object has no len()" is expected here; anything else
        # (KeyboardInterrupt, genuine bugs) must not be silently swallowed.
        return np.nan

In [6]:
# Length of each blurb; NaN where the value has no length.
df['blurb_length'] = df['blurb'].map(get_blurb_length)


In [7]:

# Columns kept for modelling.
cols = ['launched_at', 'deadline', 'status', 'goal',
        'sub_category', 'category', 'blurb_length', 'location_country', 'currency']

# Select the columns, drop incomplete rows and order the projects by launch
# time in a single chain, so nothing is mutated in place on a column subset.
# The original also called `df.reindex()` with no arguments, which returns an
# unchanged copy; it is omitted here and the index labels are left as they are.
df = df[cols].dropna().sort_values("launched_at")



In [8]:
# Parse each timestamp column once, then derive its calendar parts.
launched_ts = pd.to_datetime(df["launched_at"])
deadline_ts = pd.to_datetime(df["deadline"])

df["launched_at"] = launched_ts
df["launched_year"] = launched_ts.dt.year
df['launch_month'] = launched_ts.dt.month

df['deadline'] = deadline_ts
df['deadline_year'] = deadline_ts.dt.year
df['deadline_month'] = deadline_ts.dt.month

In [115]:
def transform(df):
    """Encode the target and the categorical columns for modelling.

    The input frame is left untouched: the original implementation dropped
    'launched_at' and 'deadline' from the caller's frame in place, which broke
    any later use of those columns and any second call on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'launched_at', 'deadline', 'status', 'category',
        'sub_category', 'currency' and 'location_country'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, without the two date columns
        and the four categorical columns, with 'status' binarized and one
        indicator column appended per category value.
    """
    from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

    # drop() without inplace returns a new frame, so the caller's df survives.
    df = df.drop(['launched_at', 'deadline'], axis=1).reset_index(drop=True)

    binarizer = LabelBinarizer()
    df["status"] = binarizer.fit_transform(df["status"])

    # scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the
    # old keyword; fall back so older installations keep working.
    try:
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(sparse=False)

    cat_cols = ['category', 'sub_category', 'currency', 'location_country']
    X_hot = encoder.fit_transform(df[cat_cols])

    # Indicator columns are named after the raw category values.
    # NOTE(review): a value that occurs in two source columns would yield
    # duplicate column labels - confirm this cannot happen in the data.
    onehotcols = [value for values in encoder.categories_ for value in values]

    # The fresh RangeIndex above lines the rows up with X_hot for the concat.
    X_hot = pd.DataFrame(X_hot, columns=onehotcols)
    df = pd.concat([df, X_hot], axis=1)
    return df.drop(cat_cols, axis=1)
    

In [128]:
def get_model_data(df, train_years, valid_years):
    """Split ``df`` into training and validation sets by launch year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'launched_year' and 'status'.
    train_years, valid_years : list-like
        Values of 'launched_year' that select the training / validation rows.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_valid, y_valid)`` where each ``X`` is the
        selected rows without the 'status' column and each ``y`` is the
        'status' column of those rows.
    """
    launch_year = df['launched_year']

    # isin() always yields a boolean mask, even for an empty frame, where the
    # previous apply(lambda ...) mask was not boolean and selected columns instead.
    df_train = df[launch_year.isin(train_years)]
    df_valid = df[launch_year.isin(valid_years)]

    X_train, y_train = df_train.drop("status", axis=1), df_train['status']
    X_valid, y_valid = df_valid.drop("status", axis=1), df_valid['status']

    return X_train, y_train, X_valid, y_valid

In [129]:
from sklearn.ensemble import RandomForestClassifier
import operator
from sklearn.ensemble import GradientBoostingClassifier


def score(X_train, X_test, y_train, y_test):
    """Fit a random forest and report its score and feature importances.

    Parameters
    ----------
    X_train, y_train
        Training features (a DataFrame, its column names label the
        importances) and training target.
    X_test, y_test
        Features and target the fitted model is scored on.

    Returns
    -------
    tuple
        ``(rf, rf_score, rf_fet)``: the fitted classifier, the value of
        ``rf.score(X_test, y_test)``, and a list of ``(feature, importance)``
        pairs sorted by importance, highest first.
    """
    # Fixed random_state so repeated runs give the same forest.
    rf = RandomForestClassifier(n_estimators=100, random_state=13579)
    rf.fit(X_train, y_train)
    rf_score = rf.score(X_test, y_test)

    # Pair every training column with its importance, then rank them.
    importances = dict(zip(X_train.columns.values, rf.feature_importances_))
    rf_fet = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)

    return (rf, rf_score, rf_fet)




In [130]:
# Launch years used for training (2009 through 2017) and for validation (2018).
train_years = list(range(2009, 2018))
valid_years = [2018]

In [23]:
# Run with the blurb-length feature: encode the frame, split it by launch
# year, then fit the forest and print its validation score.
df_transformed = transform(df)
X_train, y_train, X_valid, y_valid = get_model_data(
    df_transformed, train_years=train_years, valid_years=valid_years
)
_, scores, _ = score(X_train=X_train, X_test=X_valid, y_train=y_train, y_test=y_valid)
print(scores)

0.7170204100104086


- As we can see, there is a slight improvement in the accuracy

## Deadline

- We can add features like `days_to_deadline`, the number of days between launch and deadline

In [9]:
# Whole days between launch and deadline.
df['days_to_deadline'] = df['deadline'].sub(df['launched_at']).dt.days

In [8]:
# Re-run the pipeline with `days_to_deadline` included in the frame.
# get_model_data() requires both year lists; the previous call passed only
# the frame, which raises TypeError.
df_transformed = transform(df)
X_train, y_train, X_valid, y_valid = get_model_data(df_transformed, train_years, valid_years)
_, scores, _ = score(X_train, X_valid, y_train, y_valid)
print(scores)

NameError: name 'transform' is not defined

- Accuracy of the model increased by about 2% which is great


**Let's check how well our random forest is doing compared to Logistic Regression**

In [40]:
from sklearn.linear_model import LogisticRegression 

def logistic_score(X_train , y_train , X_valid , y_valid):
    """Fit a default ``LogisticRegression`` and return its validation score.

    The model is trained on ``(X_train, y_train)`` and the value of
    ``model.score(X_valid, y_valid)`` is returned.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return LogisticRegression().fit(X_train, y_train).score(X_valid, y_valid)

**Though 73% is not bad, it's still not 80%, so I guess we've got to revisit our domain knowledge**

- There we can see that the structure of the rewards matters significantly, so let's visit the website and see what we can do (scrape)

<img src="images/scrape.jpg">

- As we can see above, the HTML elements containing the rewards are present in the form of distinct cards, which means they can be scraped (turns out they can be)

- While I was searching for other things I could extract that would add value, I came across delivery dates (the expected month of delivery of the rewards). Putting yourself in the shoes of someone backing the project for its rewards, that would definitely influence your opinion

- In the domain knowledge section we can see social media presence and marketing as an important factor for success (though we can get crazy and start scraping Twitter, Reddit, etc. for the product's digital presence)
- I actually found something which could also be of help: the experience of the creator, in the form of the number of projects backed and the number of projects created

<img src="images/creator_info.jpg">


- Though this cannot be directly attributed to social presence, we can assume that the more experienced the creator, the better his/her marketing skills
- So I also decided to scrape that

**To round up, the following is the information I will be scraping for each project**
- Rewards
- Delivery date
- Number of projects the creator has previously created and has backed

- You can check out the scraping scripts in the scrape_scripts folder

**Let us save the dataset for future use**

In [4]:
# Reload the cleaned CSV and parse the three timestamp columns into datetimes.
df_raw = pd.read_csv(data_path)
for date_col in ("launched_at", "deadline", "state_changed_at"):
    df_raw[date_col] = pd.to_datetime(df_raw[date_col])

# Working frame: rows with any NaN removed, old index and 'Unnamed: 0' discarded.
df = (
    df_raw
    .dropna()
    .reset_index()
    .drop(columns=['index', 'Unnamed: 0'])
)


def get_blurb_length(x):
    """Return ``len(x)``, or NaN when ``x`` has no length.

    Parameters
    ----------
    x : object
        Usually the blurb string; values without ``__len__`` (e.g. ``None``
        or a float NaN) are tolerated.

    Returns
    -------
    int or float
        The length of ``x``, or ``np.nan`` if ``len(x)`` raises ``TypeError``.
    """
    try:
        return len(x)
    except TypeError:
        # Only "object has no len()" is expected here; anything else
        # (KeyboardInterrupt, genuine bugs) must not be silently swallowed.
        return np.nan
# Length of each blurb; NaN where the value has no length.
df['blurb_length'] = df['blurb'].map(get_blurb_length)

# Order the projects by launch time (index labels are kept as they are).
df = df.sort_values("launched_at")

# Parse each timestamp column once, then derive its calendar parts.
launched_ts = pd.to_datetime(df["launched_at"])
deadline_ts = pd.to_datetime(df["deadline"])

df["launched_at"] = launched_ts
df["launch_year"] = launched_ts.dt.year
df['launch_month'] = launched_ts.dt.month

df['deadline'] = deadline_ts
df['deadline_year'] = deadline_ts.dt.year
df['deadline_month'] = deadline_ts.dt.month

# Whole days between launch and deadline.
df['days_to_deadline'] = (deadline_ts - launched_ts).dt.days

In [5]:
# Write the engineered dataset next to the input file, so `data_path` is the
# only location that has to change on another machine. With the original
# `data_path` this resolves to the same directory and file name as before.
# The index is still written, exactly as in the original call.
# NOTE(review): the file-name spelling ('engineerd') is kept as-is - presumably
# other notebooks read this exact name; confirm before renaming.
output_path = Path(data_path).with_name("3_feature_engineerd_data.csv")
df.to_csv(output_path)