In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
import xgboost as xgb
from scipy.sparse import *
from scipy.sparse import hstack

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline      

# machine learning
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import shuffle
from sklearn.cross_validation import KFold



In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
from datetime import datetime

In [4]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
 
punct=['#','*','+',';','<','=', '>', '[','\\', ']','^', '_', '`', '{', '|', '}', '~']
stop_words = stopwords.words('english') + punct #list(punctuation)
 
def tokenize(text):
    words = word_tokenize(text.replace('-',' - ').replace('.',' . ').replace('~',' ~ ').replace('=',' = ').replace('+',' + ').replace('*',' * ').replace("'"," ' ").replace("_"," _ ").replace(":"," : ").replace("|"," | ").replace(u'\u2014',' - ').replace(u'\u2122',' TMK ').replace('/',' / '))
    return [w.lower() for w in words]

In [5]:
df_tot= pd.read_csv('kickstarter_clean.csv')

In [6]:
df_tot['launch']=pd.to_datetime(df_tot['launched_at'], unit='s').dt.dayofyear
df_tot['deadl']=pd.to_datetime(df_tot['deadline'], unit='s').dt.dayofyear

In [7]:
df_tot['len_name']=df_tot['name'].apply(lambda x: len(str(x)))
df_tot['len_blurb']=df_tot['blurb'].apply(lambda x: len(str(x)))
df_tot['dur']=np.mod(df_tot['deadl']-df_tot['launch'],365)

In [8]:
df_tot['tiblu']=df_tot['name']+' '+df_tot['blurb']

In [9]:
df_tot['main_cat'].value_counts()

music           20289
film & video    17522
art             13172
publishing      12456
technology      11514
food            11135
games            7174
fashion          6796
comics           5709
theater          4484
design           4316
photography      3888
crafts           3863
journalism       2656
dance            2398
Name: main_cat, dtype: int64

In [10]:
def main(cat):
    df=df_tot[(df_tot['main_cat']==cat)].reset_index()
    df['sub_cat']=df['sub_cat'].astype('category')
    df['cat_num'] = pd.Categorical(df['sub_cat']).codes
    df['state_num'] = pd.Categorical(df['state'].astype('category')).codes
    df=df.sample(frac=1)
    df=df.reset_index()
    #get extra features
    x_add=np.append(np.array(df['launch']),np.array(df['deadl']))
    x_add=np.append(x_add,np.array(df['len_name']))
    x_add=np.append(x_add,np.array(df['len_blurb']))
    x_add=np.append(x_add,np.array(df['dur']))
    x_add=np.append(x_add,np.array(df['goal']))
    x_add=np.append(x_add,np.array(df['state_num']))
    x_add=np.append(x_add,np.array(df['cat_num'])).reshape(8,len(df))
   
    #get k_fold 
    k_fold = KFold(len(df), n_folds=6)
    
    f1_scores = []
    conf= np.array([[0, 0], [0, 0]])
    
    n=0
    
    for train_i, test_i in k_fold:
        n+=1
        print(n)
         #build corpus
        texts=df['tiblu'].apply(lambda x: str(x).decode('utf-8'))
        # build the vocabulary
        vocabulary=set.union(*[set(tokenize(texts[i])) for i in train_i])
        #tfidf = TfidfVectorizer(stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
        tfidf = TfidfVectorizer(min_df=5, max_df = 1, sublinear_tf=True, use_idf=True,stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
        
        # Fit the TfIdf model
        tfidf.fit([texts[i] for i in train_i])
        
        x_train_tfidf=tfidf.transform([texts[i] for i in train_i])
        x_test_tfidf=tfidf.transform([texts[i] for i in test_i])
        x_add_train=np.transpose(x_add[:,train_i])
        x_add_test=np.transpose(x_add[:,test_i])
        x_train=hstack((x_train_tfidf, x_add_train))
        x_test=hstack((x_test_tfidf, x_add_test))
        
        y_train=df['y'].iloc[train_i]
        y_test=df['y'].iloc[test_i]
        
        
        #fit using XGBoost classifier
        regrxgb =xgb.XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1,min_child_weight=2,colsample_bytree= 0.7, subsample= 0.9)
        regrxgb.fit(x_train, y_train)
        
        y_xgb = regrxgb.predict(x_test)
        
        f1_scores.append(f1_score(y_test, y_xgb))
        
        conf += confusion_matrix(y_test, y_xgb)
    
    return np.round(100*(sum(f1_scores)/6))/100, conf    

In [11]:
def main_ft(cat):
    df=df_tot[(df_tot['main_cat']==cat)].reset_index()
    df['sub_cat']=df['sub_cat'].astype('category')
    df['cat_num'] = pd.Categorical(df['sub_cat']).codes
    df['state_num'] = pd.Categorical(df['state'].astype('category')).codes
    df=df.sample(frac=1)
    df=df.reset_index()
    #get extra features
    x_add=np.append(np.array(df['launch']),np.array(df['deadl']))
    x_add=np.append(x_add,np.array(df['len_name']))
    x_add=np.append(x_add,np.array(df['len_blurb']))
    x_add=np.append(x_add,np.array(df['dur']))
    x_add=np.append(x_add,np.array(df['goal']))
    x_add=np.append(x_add,np.array(df['state_num']))
    x_add=np.append(x_add,np.array(df['cat_num'])).reshape(8,len(df))
   
    #get k_fold 
    k_fold = KFold(len(df), n_folds=6)
    
    f1_scores = []
    conf= np.array([[0, 0], [0, 0]])
    
    n=0
    
    for train_i, test_i in k_fold:
        n+=1
        print(n)
        
        x_train=np.transpose(x_add[:,train_i])
        x_test=np.transpose(x_add[:,test_i])
        
        y_train=df['y'].iloc[train_i]
        y_test=df['y'].iloc[test_i]
        
        #fit using XGBoost classifier
        regrxgb =xgb.XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1,min_child_weight=2,colsample_bytree= 0.7, subsample= 0.9)

        regrxgb.fit(x_train, y_train)
        
        y_pred = regrxgb.predict(x_test)
        
        f1_scores.append(f1_score(y_test, y_pred))
        
        conf += confusion_matrix(y_test, y_pred)
    
    return np.round(100*(sum(f1_scores)/6))/100, conf    

In [12]:
def main_tfidf(cat):
    df=df_tot[(df_tot['main_cat']==cat)].reset_index()
    df=df.sample(frac=1)
    df=df.reset_index()
   
    #get k_fold 
    k_fold = KFold(len(df), n_folds=6)
    
    f1_scores = []
    conf= np.array([[0, 0], [0, 0]])
    
    n=0
    
    for train_i, test_i in k_fold:
        n+=1
        print(n)
         #build corpus
        texts=df['tiblu'].apply(lambda x: str(x).decode('utf-8'))
        # build the vocabulary
        vocabulary=set.union(*[set(tokenize(texts[i])) for i in train_i])
        tfidf = TfidfVectorizer(min_df=5, max_df = 1, sublinear_tf=True, use_idf=True,stop_words=stop_words, tokenizer=tokenize, vocabulary=vocabulary)
        
        # Fit the TfIdf model
        tfidf.fit([texts[i] for i in train_i])
        
        x_train=tfidf.transform([texts[i] for i in train_i])
        x_test=tfidf.transform([texts[i] for i in test_i])
        
        y_train=df['y'].iloc[train_i]
        y_test=df['y'].iloc[test_i]
        
        
        #fit using XGBoost classifier
        regrxgb =xgb.XGBClassifier(n_estimators=500, max_depth=5, learning_rate=0.1,min_child_weight=2,colsample_bytree= 0.7, subsample= 0.9)
        
        regrxgb.fit(x_train, y_train)
        
        y_xgb = regrxgb.predict(x_test)
        
        f1_scores.append(f1_score(y_test, y_xgb))
        
        conf += confusion_matrix(y_test, y_xgb)
    
    return np.round(100*(sum(f1_scores)/6))/100, conf    

In [22]:
def f1s(conf):
    return np.round(200.*conf[1,1]/(2*conf[1,1]+conf[0,1]+conf[1,0]))/100

In [19]:
output_file = open('scores_table.dat','w')

cat_list=sorted(df_tot['main_cat'].unique())

tconf_tfidf=np.array([[0, 0], [0, 0]])
tconf_ft=np.array([[0, 0], [0, 0]])
tconf_tot=np.array([[0, 0], [0, 0]])


for cat in cat_list:
    f1_tfidf, conf_tfidf= main_tfidf(cat)
    f1_ft, conf_ft= main_ft(cat)
    f1_tot, conf_tot= main(cat)
    
    tconf_tfidf+= conf_tfidf
    tconf_ft+= conf_ft
    tconf_tot+= conf_tot

    print >> output_file, cat.title() + ' | ' +str(f1_tfidf) + ' | '+ str(f1_ft) + ' | '+ str(f1_tot) 
    
    
print >> output_file, 'All' + ' | ' +str(f1s(tconf_tfidf)) + ' | '+ str(f1s(tconf_ft)) + ' | '+ str(f1s(tconf_tot)) 
    
output_file.close()

1


  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6

  if diff:



1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3

  if diff:



4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


1


  if diff:


2


  if diff:


3


  if diff:


4


  if diff:


5


  if diff:


6


  if diff:


AttributeError: 'str' object has no attribute 'write'

In [20]:
output_file.close()

In [23]:
print 'All' + ' | ' +str(f1s(tconf_tfidf)) + ' | '+ str(f1s(tconf_ft)) + ' | '+ str(f1s(tconf_tot))

All | 0.74 | 0.79 | 0.81
