In [32]:
import pandas as pd
dataset = pd.read_json("data/preprocessed.json")

In [33]:
dataset.shape

(18627, 28)

In [34]:
dataset.head(1)

Unnamed: 0,database,date_only,day_name,description,external_picture,fanpagelink,fb_angry,fb_haha,fb_id,fb_like,...,message,name,num_comments,page_id,preprocessed_name,preprocessed_stem_stop,shares,time_created,time_only,type
10000,facebook_fanpage_buzzfeed,2015-12-19,Saturday,,https://external.xx.fbcdn.net/safe_image.php?d...,https://www.facebook.com/BuzzFeed/posts/101541...,0,0,21898300328_10154172924370329,6591,...,The cold never bothered them anyway.,A Surprise Snowstorm Turned This Wedding Into ...,120,1,a surprise snowstorm turned this wedding into ...,surpris snowstorm turn wed winter wonderland,158,2015-12-19T12:16:01,12:16:01,link


In [35]:
dataset_xy = dataset[["preprocessed_stem_stop","highest_reaction"]]  # Only 2 columns are needed

In [36]:
import os

output_dir = "output/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
dataset_xy["preprocessed_stem_stop"].to_csv(output_dir + "documents.txt", index=False)

## Feature extraction again, this time using BTM (Biterm Topic Model)

I'm gonna use the titles saved in doc_info.txt

And use the source code from its creator

https://github.com/xiaohuiyan/BTM

And because input docs must be indexed there's 2 files we need to create

`
doc_wids.txt    output docs after indexing, each line is a doc with the format "wordId wordId ..."
voca.txt        output vocabulary file, each line is a word with the format "wordId    word"
`

In [37]:
# translate word into id in documents
import sys

class DocumentIndex():
    
    def __init__(self, output_dir):
        self.output_dir = output_dir
        self.w2id = {}

    def indexFile(self, pt, res_pt):
        print('index file: ', pt)
        wf = open(res_pt, 'w')
        for l in open(pt):
            ws = l.strip().split()
            for w in ws:
                if not w in self.w2id:
                    self.w2id[w] = len(self.w2id)
            wids = [self.w2id[w] for w in ws]        
            wf.write("%s\n" % ' '.join(map(str, wids)))
        print('write file: ', res_pt)

    def write_w2id(self, res_pt):
        print('write:', res_pt)
        wf = open(res_pt, 'w')
        for w, wid in sorted(self.w2id.items(), key=lambda d:d[1]):
            wf.write('%d\t%s\n' % (wid, w))
    
    def create_index(self, doc_pt, dwid_pt, voca_pt):
        self.indexFile(self.output_dir + doc_pt, self.output_dir + dwid_pt)
        print('n(w)=', len(self.w2id))
        self.write_w2id(self.output_dir + voca_pt)

        
    def vocabulary_size(self):
        return len(self.w2id)

In [38]:
output_dir = "output/"
input_vocab = "voca.txt"
input_docs = "doc_wids.txt"

document_index = DocumentIndex(output_dir)
document_index.create_index("documents.txt", input_docs, input_vocab)

index file:  output/documents.txt
write file:  output/doc_wids.txt
n(w)= 11629
write: output/voca.txt


**Parameters**

`
K           int, number of topics
W           int, size of vocabulary
alpha       double, Symmetric Dirichlet prior of P(z), like 1
beta        double, Symmetric Dirichlet prior of P(w|z), like 0.01
n_iter      int, number of iterations of Gibbs sampling
save_step   int, steps to save the results
docs_pt     string, path of training docs
model_dir   string, output directory
`

In [39]:
class BtmFeatureModel():
    
    def __init__(self):
        pass
        
    def make_x(self, model_dir, K, index):
        features_path = model_dir + "k{}.pz_d".format(K)
        features = pd.read_csv(features_path, sep='\s+', header=None)
        features.set_index(index)
        return features
        
    def model_initialization(self, K, W, alpha, beta, niter, save_step,
                             docs_pt, model_dir):
        if not os.path.exists(model_dir):
            os.makedirs(model_dir)

        # TOPIC MODELING
        command_initialize = ("BTM/src/btm est {} {} {} {} {} {} {} {}"). \
                        format(K, W, alpha, beta, niter, save_step,
                               docs_pt, model_dir)
        self.run_command(command_initialize)
        
    def model_inference(self, type_model, K, docs_pt, model_dir):
        # INFERENCE
        command_infer = ("BTM/src/btm inf {} {} {} {}"). \
                            format(type_model, K, docs_pt, model_dir)
        self.run_command(command_infer)
        
    def run_command(self, command):
        print(command)
        try:
            os.system(command)
        except:
            print("exception")
            pass

In [40]:
save_step = 501
input_docs = "doc_wids.txt"
model_dir = output_dir + "model/"
docs_pt = output_dir + input_docs

# Actual parameters
K = 50
W = document_index.vocabulary_size()
alpha = 50 / K
beta = 0.005
niter = 1000

btm_model = BtmFeatureModel()
btm_model.model_initialization(K, W, alpha, beta, niter, save_step,
                               docs_pt, model_dir)

BTM/src/btm est 50 11629 1.0 0.005 1000 501 output/doc_wids.txt output/model/


This just created a K*M matrix for P(w|z)

and a K*1 matrix for P(z)

**Then, P(z|d) is inferred by running the same but using other parameters**

`
K           int, number of topics, like 20
type        string, 4 choices:sum_w, sum_b, lda, mix. sum_b is used in our  paper.
docs_pt     string, path of docs to be inferred
model_dir   string, output directory
`

In [41]:
type_model = 'sum_b'
btm_model.model_inference(type_model, K, docs_pt, model_dir)

BTM/src/btm inf sum_b 50 output/doc_wids.txt output/model/


Now we read that P(z|d) and use it as a features matrix

In [42]:
X = btm_model.make_x(model_dir, K, dataset_xy.index)
X.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,0.000127,0.010011,0.002454,1.2e-05,0.001384,0.001637,8e-05,6.0059e-08,0.0136555,5.83695e-08,...,0.007665,8.39664e-07,0.009744,0.008574,1.19533e-07,2.5e-05,0.218943,9.70775e-08,2.7e-05,2.5e-05
1,0.028168,0.013614,3.2e-05,0.00873,0.014682,0.00423,0.00141,1.85275e-06,2.53104e-08,0.0138072,...,6.7e-05,1.06705e-05,0.004806,0.003968,4.83039e-08,1.5e-05,0.001124,9.59428e-07,0.039269,6e-06


In [79]:
y = dataset_xy["highest_reaction"]
y.head(2)

10000    0
10001    1
Name: highest_reaction, dtype: int64

In [None]:
# def square(x):
#     if x == 0:
#         return 1
#     return x
# y = y.apply(square)

Trying with the Linear SVC

In [None]:
from sklearn import svm

clf_linear_svc = svm.LinearSVC(C=1)

In [None]:
from sklearn import grid_search
from sklearn import cross_validation as cv

param_grid = {
    "C": [ 2**j for j in range(1,4) ]
}

skf = cv.StratifiedKFold(y, n_folds=3)
clf_grid = grid_search.GridSearchCV(clf_linear_svc, 
                                    param_grid = param_grid,
                                    scoring="accuracy",
                                    n_jobs=1,
                                    cv=skf,
                                    verbose=3)
fitted = clf_grid.fit(X, y)

print ("best_params_: {}\n".format(clf_grid.best_params_))
print ("best_score_: {}\n".format(clf_grid.best_score_))

print ("best_score_: {}\n".format(clf_grid.grid_scores_))

## Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=100)

In [54]:
param_grid = {
    "n_estimators": [ 130, 150 ]
}

skf = cv.StratifiedKFold(y, n_folds=3)
clf_grid = grid_search.GridSearchCV(clf_rf, 
                                    param_grid = param_grid,
                                    scoring="accuracy",
                                    n_jobs=1,
                                    cv=skf,
                                    verbose=3)
fitted = clf_grid.fit(X, y)

print ("best_params_: {}\n".format(clf_grid.best_params_))
print ("best_score_: {}\n".format(clf_grid.best_score_))

print ("best_score_: {}\n".format(clf_grid.grid_scores_))

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] n_estimators=130 ................................................
[CV] ....................... n_estimators=130, score=0.507165 -  12.0s
[CV] n_estimators=130 ................................................
[CV] ....................... n_estimators=130, score=0.519568 -  11.0s
[CV] n_estimators=130 ................................................
[CV] ....................... n_estimators=130, score=0.548091 -  10.8s
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=150, score=0.508614 -  12.4s
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=150, score=0.524561 -  12.5s
[CV] n_estimators=150 ................................................
[CV] ....................... n_estimators=150, score=0.545352 -  12.8s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  1.2min finished


best_params_: {'n_estimators': 150}

best_score_: 0.5261716862618779

best_score_: [mean: 0.52494, std: 0.01713, params: {'n_estimators': 130}, mean: 0.52617, std: 0.01504, params: {'n_estimators': 150}]



In [55]:
from sklearn import svm
clf_rbf = svm.SVC( kernel='rbf')

In [56]:
import numpy as np
from sklearn import grid_search
from sklearn import cross_validation as cv

param_grid = {
    "C": [ 2**j for j in range(1,4) ],
    "gamma": np.logspace(-9, 3, 13)
}

skf = cv.StratifiedKFold(y, n_folds=3)
clf_grid = grid_search.GridSearchCV(clf_rbf, 
                                    param_grid = param_grid,
                                    scoring="accuracy",
                                    n_jobs=1,
                                    cv=skf,
                                    verbose=3)
fitted = clf_grid.fit(X, y)

print ("best_params_: {}\n".format(clf_grid.best_params_))
print ("best_score_: {}\n".format(clf_grid.best_score_))

print ("best_score_: {}\n".format(clf_grid.grid_scores_))

Fitting 3 folds for each of 39 candidates, totalling 117 fits
[CV] C=2, gamma=1e-09 ................................................
[CV] ....................... C=2, gamma=1e-09, score=0.402351 -  18.0s
[CV] C=2, gamma=1e-09 ................................................
[CV] ....................... C=2, gamma=1e-09, score=0.402319 -  19.0s
[CV] C=2, gamma=1e-09 ................................................
[CV] ....................... C=2, gamma=1e-09, score=0.402449 -  18.9s
[CV] C=2, gamma=1e-08 ................................................
[CV] ....................... C=2, gamma=1e-08, score=0.402351 -  19.2s
[CV] C=2, gamma=1e-08 ................................................
[CV] ....................... C=2, gamma=1e-08, score=0.402319 -  21.0s
[CV] C=2, gamma=1e-08 ................................................
[CV] ....................... C=2, gamma=1e-08, score=0.402449 -  21.0s
[CV] C=2, gamma=1e-07 ................................................
[CV] ..........

[Parallel(n_jobs=1)]: Done  31 tasks       | elapsed: 11.3min


[CV] ........................ C=2, gamma=10.0, score=0.508133 -  26.3s
[CV] C=2, gamma=10.0 .................................................
[CV] ........................ C=2, gamma=10.0, score=0.536491 -  28.2s
[CV] C=2, gamma=100.0 ................................................
[CV] ....................... C=2, gamma=100.0, score=0.468846 -  57.6s
[CV] C=2, gamma=100.0 ................................................
[CV] ....................... C=2, gamma=100.0, score=0.481237 -  58.5s
[CV] C=2, gamma=100.0 ................................................
[CV] ....................... C=2, gamma=100.0, score=0.503142 -  54.7s
[CV] C=2, gamma=1000.0 ...............................................
[CV] ...................... C=2, gamma=1000.0, score=0.408308 - 1.0min
[CV] C=2, gamma=1000.0 ...............................................
[CV] ...................... C=2, gamma=1000.0, score=0.414560 - 1.0min
[CV] C=2, gamma=1000.0 ...............................................
[CV] .

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/rgap/anaconda/envs/reactions_proj/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-56-c9730ae5a484>", line 17, in <module>
    fitted = clf_grid.fit(X, y)
  File "/Users/rgap/anaconda/envs/reactions_proj/lib/python3.5/site-packages/sklearn/grid_search.py", line 804, in fit
    return self._fit(X, y, ParameterGrid(self.param_grid))
  File "/Users/rgap/anaconda/envs/reactions_proj/lib/python3.5/site-packages/sklearn/grid_search.py", line 553, in _fit
    for parameters in parameter_iterable
  File "/Users/rgap/anaconda/envs/reactions_proj/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 800, in __call__
    while self.dispatch_one_batch(iterator):
  File "/Users/rgap/anaconda/envs/reactions_proj/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 658, in dispatch_one_batch
    self._d

KeyboardInterrupt: 