In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.feature_extraction import text
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

In [173]:
# Read the three input tables (topics, product posts, users) from ./data.
# read_csv splits on ',' by default, so no explicit separator is passed.
topics = pd.read_csv('./data/AllTopics.csv')
posts = pd.read_csv('./data/PostsForAnalysis.txt')
users = pd.read_csv('./data/UsersForAnalysis.txt')

In [169]:
# Attach the user attributes to every post. The left join keeps each row of
# `posts` even when no user with that user_id exists; column names present in
# both tables receive the '_posts' / '_users' suffix.
df = posts.merge(users, how='left', on='user_id', suffixes=('_posts', '_users'))

In [170]:
# Select the key features (plus votes_count_users, which is used later to
# build the target). An explicit .copy() makes df1 an independent frame:
# later cells write into df1, and pandas otherwise flags writes to the result
# of a column-list selection with SettingWithCopyWarning.
feature_cols = ['day', 'time_of_day', 'product_state', 'num_makers', 'num_topics',
                'collections_count', 'followed_topics_count', 'followers_count',
                'maker_of_count', 'posts_count', 'votes_count_users']
df1 = df[feature_cols].copy()

In [171]:
# Handle missing values, then replace categorical values with integer codes.
#
# The fill is done at the frame level: df1[col].fillna(0, inplace=True) works on
# a temporary Series (chained assignment) and, under pandas copy-on-write, never
# updates df1 itself. Only the user-side count columns listed here are filled.
count_cols = ['votes_count_users', 'collections_count', 'followed_topics_count',
              'followers_count', 'maker_of_count', 'posts_count']
df1 = df1.fillna(dict.fromkeys(count_cols, 0))

# Each categorical column gets its own LabelEncoder. Columns are written with
# df1[col] = ... instead of attribute assignment (df1.day = ...), which is the
# explicit, unambiguous way to set a column.
for cat_col in ['day', 'time_of_day', 'product_state']:
    df1[cat_col] = LabelEncoder().fit_transform(df1[cat_col])


In [163]:
# Define the target: rows whose vote count is at or above the 75th percentile
# are labelled successful (1), all others 0.
# NOTE(review): given the merge suffixes, 'votes_count_users' is the column
# that came from the users table -- confirm that this, rather than the
# posts-side vote count, is the intended measure of product success.
votes_75_perc = np.percentile(df1['votes_count_users'], 75)
df1['product_success'] = np.where(df1['votes_count_users'] >= votes_75_perc, 1, 0)
# Drop the column used to generate the target so it cannot leak into the features.
# The axis is passed by keyword: the positional form drop(label, 1) was
# deprecated and then removed in pandas 2.0.
df1 = df1.drop('votes_count_users', axis=1)

In [164]:
# Split into target and features. X is the same object as df1 (not a copy),
# so popping the target column removes it from df1 as well.
y = df1.pop('product_success')
X = df1

In [165]:
# Support vector classifier with an RBF kernel; per the author, SVM gave the
# best result for this dataset. (The variable is called svr_rbf, but it holds
# a classifier, SVC.)
svr_rbf = SVC(C=1000.0, kernel='rbf', gamma=0.1)

In [166]:
# Report the mean 3-fold cross-validated score for each metric.
# print(...) with one pre-formatted string behaves the same as a Python 2
# print statement and as the Python 3 print function, so the cell no longer
# fails with a SyntaxError on a Python 3 kernel.
for metric in ('accuracy', 'recall'):
    fold_scores = cross_val_score(svr_rbf, X, y, cv=3, scoring=metric)
    print('%s:  %s' % (metric, np.mean(fold_scores)))

accuracy:  0.986099418301
recall:  0.94633689545
