In [1]:
################
################
##read in data##
################
################
import pandas as pd, numpy as np
# Read the three tab-separated files with one shared set of parser options.
# quoting=3 is csv.QUOTE_NONE: quote characters inside the text are kept as-is
# instead of being interpreted as field delimiters.
_tsv_options = dict(header=0, encoding='utf-8', quoting=3, delimiter='\t')
train   = pd.read_csv("labeledTrainData.tsv",   **_tsv_options)
unlabel = pd.read_csv("unlabeledTrainData.tsv", **_tsv_options)
test    = pd.read_csv("testData.tsv",           **_tsv_options)

In [2]:
###################################
###################################
##       bag of words            ##
###################################
###################################
from auxiliary import *


def get_clean_documents(data,text_name,remove_stopwords=False):
    """Return one space-joined string per entry of ``data[text_name]``.

    Each raw document is passed to ``document_to_wordlist`` together with
    ``remove_stopwords``; the tokens it returns are joined with single
    spaces. The result is a list in the same order as ``data[text_name]``.

    Parameters
    ----------
    data : mapping or DataFrame
        Anything supporting ``data[text_name]`` that yields raw documents
        when iterated.
    text_name : str
        Key / column holding the raw text.
    remove_stopwords : bool, optional
        Forwarded unchanged to ``document_to_wordlist`` (default False).
    """
    return [
        " ".join(document_to_wordlist(raw_text, remove_stopwords))
        for raw_text in data[text_name]
    ]
        
        
# Clean the 'review' column of every frame in one pass (train, unlabeled, test,
# in that order); remove_stopwords=True is forwarded to document_to_wordlist.
clean_train_reviews, clean_unlabel_reviews, clean_test_reviews = (
    get_clean_documents(frame, 'review', remove_stopwords=True)
    for frame in (train, unlabel, test)
)

In [3]:
###################################
#               Vectorizing       #
###################################
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word 1- to 4-grams, capped at 500000 features. The vocabulary and
# IDF weights are learned from the labeled and unlabeled reviews together; the
# train and test reviews are then only transformed with that fitted vocabulary.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 4),
    min_df=2,
    max_df=0.95,
    max_features=500000,
    sublinear_tf=True,
)
vectorizer.fit(clean_train_reviews + clean_unlabel_reviews)
X_train_bow, X_test_bow = (
    vectorizer.transform(docs)
    for docs in (clean_train_reviews, clean_test_reviews)
)

In [4]:
# --------- try logistic regression ---------
from sklearn.linear_model import LogisticRegression
# Logistic regression on the TF-IDF matrix. Column 1 of predict_proba (the
# probability of clf.classes_[1]) is written out as the score for each test id.
# NOTE(review): class_weight="auto" was deprecated in scikit-learn 0.17 in
# favour of "balanced" and later removed -- confirm the installed version
# before re-running this cell.
clf = LogisticRegression(class_weight="auto").fit(X_train_bow, train['sentiment'])
p_bow_lr = clf.predict_proba(X_test_bow)[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bow_lr})
output.to_csv('y_bow_lr.csv', index=False, quoting=3)
# 0.96145

In [5]:
# -------- try naive bayes ---------
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes on the TF-IDF matrix with a very small smoothing
# constant (alpha=0.0005); column 1 of predict_proba is saved as the score.
nb = MultinomialNB(alpha=0.0005).fit(X_train_bow, train["sentiment"])
p_bow_nb = nb.predict_proba(X_test_bow)[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bow_nb})
output.to_csv('y_bow_nb.csv', index=False, quoting=3)
# 0.94721

In [6]:
# ---------- try SGD --------
from sklearn.linear_model import SGDClassifier
# Linear model trained by SGD with the modified Huber loss (a loss for which
# SGDClassifier exposes predict_proba), seeded for repeatable shuffling.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 (replaced by
# `max_iter`) and later removed -- confirm the installed version.
sgd = SGDClassifier(
    loss='modified_huber',
    n_iter=100,
    shuffle=True,
    random_state=0,
).fit(X_train_bow, train["sentiment"])
p_bow_sgd = sgd.predict_proba(X_test_bow)[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bow_sgd})
output.to_csv('y_bow_sgd.csv', index=False, quoting=3)
# 0.96789

In [7]:
# --------- adaboost -----------
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost with its default base estimator on the TF-IDF matrix.
# random_state is pinned (matching the seed used by the SGD cells) so that a
# Restart & Run All reproduces the same p_bow_ada, which feeds the hand-weighted
# blends further down.
ada = AdaBoostClassifier(random_state=0)
ada.fit(X_train_bow, train["sentiment"])
p_bow_ada = ada.predict_proba(X_test_bow)[:, 1]
output = pd.DataFrame(data={"id": test["id"], "sentiment": p_bow_ada})
output.to_csv('y_bow_ada.csv', index=False, quoting=3)
# 0.90840 for y, 0.88681 for proba (recorded before the seed was pinned)

In [8]:
# ------- tree based ---------
# Import the estimator class directly. The previous form,
# `from sklearn import tree; tree = tree.DecisionTreeClassifier()`, replaced
# the imported module with the classifier instance, so running the cell a
# second time failed with AttributeError. The fitted classifier is still
# published under the name `tree`, as before.
from sklearn.tree import DecisionTreeClassifier
# random_state pins the per-split feature permutation so re-runs give the same
# tree (same seed as the SGD cells).
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train_bow, train["sentiment"])
p_bow_tree = tree.predict_proba(X_test_bow)[:, 1]
output = pd.DataFrame(data={"id": test["id"], "sentiment": p_bow_tree})
output.to_csv('y_bow_tree.csv', index=False, quoting=3)
# 0.71384 (recorded before the seed was pinned)

In [9]:
########################
#ensembling through bow#
########################
# Hand-weighted linear blend of the five bag-of-words models. The weights are
# not normalised (they sum to 2.78 and one is negative), so the blended column
# is a ranking score rather than a probability.
output = pd.DataFrame({
    "id": test["id"],
    "sentiment": (
        -1.7 * p_bow_lr
        + 3 * p_bow_sgd
        + .25 * p_bow_nb
        + 1.2 * p_bow_ada
        + .03 * p_bow_tree
    ),
})  # 0.97043
output.to_csv('y_bow.csv', index=False, quoting=3)

In [10]:
#####################
#####################
##  doc2vec        ##
#####################
#####################
from gensim.models import Doc2Vec, Word2Vec
from auxiliary import *
# Size passed to getFeatureVecs for each doc2vec model.
n_dim = 3000

# Two previously saved doc2vec models (judging by the variable names, a DM and
# a DBOW variant -- confirm against the training script).
model_d2v_dm = Doc2Vec.load("model_d2v_dm")
model_d2v = Doc2Vec.load("model_d2v")

train_reviews = getCleanLabeledtexts(train, 'review', 'id')
test_reviews = getCleanLabeledtexts(test, 'review', 'id')

# One feature matrix per (split, model) pair, computed in the order
# train/dm, train/dbow, test/dm, test/dbow.
# NOTE(review): the captured output of this cell prints "25000 empty entries"
# once for each of the four calls, and the BoW+d2v logistic-regression and
# naive-Bayes scores recorded below match the BoW-only scores exactly.
# Presumably no document vector is being found for any review -- verify that
# these matrices are not all zeros.
X_train_d2v_dm, X_train_d2v_dbow = (
    getFeatureVecs(train_reviews, d2v_model, n_dim)
    for d2v_model in (model_d2v_dm, model_d2v)
)
X_test_d2v_dm, X_test_d2v_dbow = (
    getFeatureVecs(test_reviews, d2v_model, n_dim)
    for d2v_model in (model_d2v_dm, model_d2v)
)

# Concatenate the two models' features column-wise for each split.
X_train_d2v = np.hstack((X_train_d2v_dm, X_train_d2v_dbow))
X_test_d2v = np.hstack((X_test_d2v_dm, X_test_d2v_dbow))

  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


25000 empty entries
25000 empty entries
25000 empty entries
25000 empty entries


  '"%s" looks like a filename, not markup. You should probably open this file and pass the filehandle into Beautiful Soup.' % markup)


In [11]:
# ------- logistic ---------
from scipy.sparse import hstack
# Logistic regression on the TF-IDF columns with the doc2vec columns appended
# (scipy.sparse.hstack keeps the combined matrix sparse).
clf = LogisticRegression().fit(
    hstack((X_train_bow, X_train_d2v)),
    train["sentiment"],
)
p_bdv_lr = clf.predict_proba(hstack((X_test_bow, X_test_d2v)))[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bdv_lr})
output.to_csv('y_bdv_lr.csv', index=False, quoting=3)
# 0.96145

In [12]:
# -------- try SGD ---------
# Same SGD configuration as the bag-of-words run, fitted on TF-IDF + doc2vec.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 (replaced by
# `max_iter`) and later removed -- confirm the installed version.
clf = SGDClassifier(
    loss='modified_huber',
    n_iter=100,
    shuffle=True,
    random_state=0,
).fit(hstack((X_train_bow, X_train_d2v)), train["sentiment"])
p_bdv_sgd = clf.predict_proba(hstack((X_test_bow, X_test_d2v)))[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bdv_sgd})
output.to_csv('y_bdv_sgd.csv', index=False, quoting=3)
# 0.96798

In [13]:
# -------- try naive bayes ---------
# Multinomial naive Bayes on TF-IDF + doc2vec columns.
# NOTE(review): MultinomialNB is defined for non-negative features, while
# embedding vectors are normally signed -- confirm X_train_d2v / X_test_d2v
# contain no negative values.
clf = MultinomialNB(alpha=0.0005).fit(
    hstack((X_train_bow, X_train_d2v)),
    train["sentiment"],
)
p_bdv_nb = clf.predict_proba(hstack((X_test_bow, X_test_d2v)))[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bdv_nb})
output.to_csv('y_bdv_nb.csv', index=False, quoting=3)
# 0.94721

In [14]:
####################
#ensemble d2v model#
####################
# Hand-weighted blend of the three BoW+doc2vec models; un-normalised weights,
# so the column is a ranking score rather than a probability.
output = pd.DataFrame({
    "id": test["id"],
    "sentiment": (
        -2.5 * p_bdv_lr
        + 5 * p_bdv_sgd
        + .5 * p_bdv_nb
    ),
})  # 0.97013
output.to_csv('y_bdv.csv', index=False, quoting=3)

In [15]:
######################
#ensemble d2v and bow#
######################
# Rebuild the two per-family blends (same weights as the individual ensemble
# cells earlier in the notebook) and mix them 5.5 : 1.
p_bow = (
    -1.7 * p_bow_lr
    + 3 * p_bow_sgd
    + .25 * p_bow_nb
    + 1.2 * p_bow_ada
    + .03 * p_bow_tree
)
p_bdv = (
    -2.5 * p_bdv_lr
    + 5 * p_bdv_sgd
    + .5 * p_bdv_nb
)
output = pd.DataFrame({
    "id": test["id"],
    "sentiment": 5.5 * p_bow + 1 * p_bdv,
})  # 0.97037
output.to_csv('y_bow_bdv.csv', index=False, quoting=3)

In [16]:
#####################
#####################
##  word2vec       ##
#####################
#####################
# Saved word2vec model and the vector size handed to getAvgFeatureVecs.
model = Word2Vec.load("5000features_4minwords_10context")
n_dim = 5000

# Per-review features from getAvgFeatureVecs (reviews cleaned with the default
# remove_stopwords=False), then passed through `scale`.
# NOTE(review): `scale` has no import in the visible cells -- presumably it
# arrives via `from auxiliary import *`; confirm. Train and test are scaled in
# separate calls, so each split is scaled on its own values -- verify that is
# intended.
X_train_w2v = scale(
    getAvgFeatureVecs(
        get_clean_documents(train, 'review'),
        model,
        n_dim,
    )
)
X_test_w2v = scale(
    getAvgFeatureVecs(
        get_clean_documents(test, 'review'),
        model,
        n_dim,
    )
)

# Append the word2vec columns to the TF-IDF matrix (hstack here is
# scipy.sparse.hstack, imported in the BoW+doc2vec logistic-regression cell).
X_train_bwv = hstack([X_train_bow, X_train_w2v])
X_test_bwv = hstack([X_test_bow, X_test_w2v])

Text 0 of 25000
Text 20000 of 25000




Text 0 of 25000
Text 20000 of 25000


In [17]:
# ------- logistic ---------
# Default logistic regression on the TF-IDF + word2vec matrix; column 1 of
# predict_proba is saved as the score for each test id.
clf = LogisticRegression().fit(X_train_bwv, train["sentiment"])
p_bwv_lr = clf.predict_proba(X_test_bwv)[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bwv_lr})
output.to_csv('y_bwv_lr.csv', index=False, quoting=3)
# 0.95714

In [18]:
# -------- try SGD ---------
# Same SGD configuration as the earlier runs, fitted on TF-IDF + word2vec.
# NOTE(review): `n_iter` was deprecated in scikit-learn 0.19 (replaced by
# `max_iter`) and later removed -- confirm the installed version.
clf = SGDClassifier(
    loss='modified_huber',
    n_iter=100,
    shuffle=True,
    random_state=0,
).fit(X_train_bwv, train["sentiment"])
p_bwv_sgd = clf.predict_proba(X_test_bwv)[:, 1]
output = pd.DataFrame({"id": test["id"], "sentiment": p_bwv_sgd})
output.to_csv('y_bwv_sgd.csv', index=False, quoting=3)

In [19]:
####################
#ensemble w2v model#
####################
# Hand-weighted blend of the two BoW+word2vec models (un-normalised weights).
output = pd.DataFrame({
    "id": test["id"],
    "sentiment": 20.5 * p_bwv_lr - 1.5 * p_bwv_sgd,
})  # 0.95696
output.to_csv('y_bwv.csv', index=False, quoting=3)

In [20]:
###########################
#ensemble d2v, w2v and bow#
###########################
# Final blend across the three feature families.
p_bwv = 20.5 * p_bwv_lr - 1.5 * p_bwv_sgd
# NOTE(review): p_bwv is computed above but never used -- the blend below adds
# the raw SGD scores (p_bwv_sgd) instead. The recorded 0.97044 belongs to the
# formula as written, so it is left unchanged; confirm whether p_bwv was meant.
output = pd.DataFrame({
    "id": test["id"],
    "sentiment": 6.5 * p_bow - 1.0 * p_bdv + .1 * p_bwv_sgd,
})  # 0.97044
output.to_csv('y_bow_bdv_w2v.csv', index=False, quoting=3)