In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import os
import pickle
import sklearn
import sys

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

from tqdm import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Autoreload mode 1: before each cell runs, reload only the modules that
# were registered with %aimport.
%load_ext autoreload
%autoreload 1

In [None]:
# Make the project's `src` directory importable from this notebook.
# The location is resolved relative to the kernel's current working
# directory (two levels up, then `src`).
src_dir = os.path.join(os.getcwd(), os.pardir, '../src')

# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
%aimport data.delicious_t140
%aimport features.build_features
%aimport helpers.files,helpers.labels

In [None]:
from data.delicious_t140 import load_or_get_from_cache
from helpers.labels import truncate_labels
from features.delicious_t140 import clean_text_delicious

In [None]:
# Root directory of the local copy of the dataset.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
ROOT = "/media/felipe/SAMSUNG/delicious/delicioust140"
# XML file handed to load_taginfo_into_dataframe when no cached frame exists.
TAGINFO = ROOT+"/taginfo.xml"
# Directory for the pickled caches (docs_df.p, sample_df.p). The relative
# path is resolved against the kernel's working directory when this cell runs.
INTERIM_DATA_ROOT = os.path.abspath("../../data/interim/delicious-t140/")
# Vocabulary cap; only referenced by the `parameters` dict (vect__max_features).
MAX_NB_WORDS = 2000
# Threshold passed to truncate_labels — presumably the minimum number of
# documents a tag must occur in to be kept; confirm in helpers.labels.
MIN_LABEL_DF = 2

In [None]:
# Load the tag-info frame from the pickle cache if present; otherwise build
# it from the raw XML and cache it for the next run.
#
# NOTE(review): load_taginfo_into_dataframe is not imported in any visible
# cell, so on a fresh kernel without a cache file the else-branch raises
# NameError — confirm which module provides it and import it.
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were produced by this notebook.
docs_df_cache_path = os.path.join(INTERIM_DATA_ROOT, "docs_df.p")

if os.path.isfile(docs_df_cache_path):
    with open(docs_df_cache_path, "rb") as cache_file:
        docs_df = pickle.load(cache_file)
else:
    docs_df = load_taginfo_into_dataframe(TAGINFO)
    # Write to exactly the path that is checked above, so the next run
    # actually finds the cache (a missing path separator would silently put
    # the file next to, not inside, INTERIM_DATA_ROOT).
    with open(docs_df_cache_path, "wb") as cache_file:
        pickle.dump(docs_df, cache_file)

In [None]:
# Row count of the full tag-info frame.
# NOTE: this name is rebound to the sample size further down; the sampling
# cell relies on the value assigned here.
num_documents = docs_df.shape[0]
num_documents

In [None]:
# Preview the first ten rows of the tag-info frame.
docs_df.head(10)

In [None]:
# TODO: optimize — this currently performs one file read per document.
def load_contents(hash):
    """Return the text of the HTML document identified by ``hash``.

    The file is read from ``<ROOT>/fdocuments/<subdir>/<hash>.html``, where
    ``<subdir>`` is whatever ``get_directory_name_from_hash(hash)`` returns.

    Parameters
    ----------
    hash : str
        Document identifier; it is used verbatim as the file stem. The
        parameter name shadows the builtin ``hash`` but is kept so existing
        callers are unaffected.

    Returns
    -------
    str
        File contents decoded as UTF-8; bytes that cannot be decoded are
        silently dropped (``errors='ignore'``).

    Raises
    ------
    OSError
        If the file does not exist or cannot be opened.
    """
    # NOTE(review): get_directory_name_from_hash is not imported in any
    # visible cell — confirm which module provides it.
    # os.path.join supplies the separator between ROOT and "fdocuments";
    # ROOT itself has no trailing slash.
    file_path = os.path.join(
        ROOT,
        "fdocuments",
        get_directory_name_from_hash(hash),
        hash + ".html",
    )

    with open(file_path, "r", encoding='utf-8', errors='ignore') as f:
        return f.read()

In [None]:
%%time

# Work on a random 1/50 sample of the documents, with their HTML contents
# attached. The sample is cached as a pickle so the expensive per-file reads
# only happen once.
#
# NOTE: pickle.load can execute arbitrary code; only load cache files that
# were produced by this notebook.
sample_df_cache_path = os.path.join(INTERIM_DATA_ROOT, "sample_df.p")

if os.path.isfile(sample_df_cache_path):
    with open(sample_df_cache_path, "rb") as cache_file:
        sample_df = pickle.load(cache_file)
else:
    # Size the sample from docs_df itself rather than from the global
    # `num_documents`, which a later cell rebinds to the sample size.
    sample_size = len(docs_df) // 50
    # NOTE: the draw is unseeded, so deleting the cache produces a different
    # sample on the next run.
    random_indices = np.random.choice(docs_df.index.values, sample_size, replace=False)
    sample_df = docs_df.loc[random_indices]
    sample_df = sample_df.reset_index().drop(['index'],axis=1)
    sample_df['contents'] = sample_df['hash'].map(load_contents)
    with open(sample_df_cache_path, "wb") as cache_file:
        pickle.dump(sample_df, cache_file)

In [None]:
# From this point on num_documents is the size of the sample, not of the
# full tag-info frame.
num_documents = sample_df.shape[0]
num_documents

In [None]:
# Preview the first ten sampled rows.
sample_df.head(10)

In [None]:
# Gather every distinct tag occurring in the sample. Each `unique_tags`
# entry is a single comma-separated string.
tag_sets = sample_df["unique_tags"].values

all_tags = {tag for tag_set in tag_sets for tag in tag_set.split(',')}

In [None]:
# Number of distinct tags found in the sample.
len(all_tags)

In [None]:
def _split_tags(tagstring):
    """Turn a comma-separated tag string into a list of tags."""
    return tagstring.split(",")


# Keep the per-document tag lists as a column next to the raw string, and
# expose them as an array for the label-processing steps.
sample_df["tags_split"] = sample_df["unique_tags"].map(_split_tags)
labels = sample_df["tags_split"].values

In [None]:
# Filter the per-document label lists with the MIN_LABEL_DF threshold.
# NOTE(review): presumably drops tags that occur in fewer than MIN_LABEL_DF
# documents — confirm against helpers.labels.truncate_labels.
truncated_labels = truncate_labels(labels,MIN_LABEL_DF)

In [None]:
# Encode the per-document label collections as a binary indicator matrix:
# one row per document, one column per distinct label (see mlb.classes_).
mlb = MultiLabelBinarizer()
binary_label_data = mlb.fit_transform(truncated_labels)

In [None]:
# (number of documents, number of distinct labels) of the indicator matrix.
binary_label_data.shape

In [None]:
# Document texts used as classifier input. When the sample is built in this
# notebook these are the raw .html file contents read by load_contents.
data = sample_df["contents"].values

In [None]:
# Peek at the first 1000 characters of the first document, before cleaning.
data[0][:1000]

In [None]:
# NOTE(review): absolute, machine-specific path. CLASSPATH is presumably
# read by a Stanford POS tagger wrapper used inside clean_text_delicious —
# confirm in features.delicious_t140.
os.environ["CLASSPATH"]="/home/felipe/auto-tagger/data/stanford-postagger/stanford-postagger-2017-06-09"
# First 1000 characters of the first document after cleaning.
clean_text_delicious(data[0])[:1000]

In [None]:
# Token counts -> TF-IDF weighting -> one linear SVM per label, with the
# per-label fits dispatched over all available cores (n_jobs=-1).
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=-1)),
]
pipeline = Pipeline(steps=pipeline_steps)

In [None]:
# Pipeline overrides reserved for later use. Nothing in the visible cells
# applies them, so the cross-validation below runs with the vectorizer's
# default preprocessor and an uncapped vocabulary.
parameters = dict(
    vect__preprocessor=clean_text_delicious,
    vect__max_features=MAX_NB_WORDS,
)

In [None]:
# 5-fold cross-validation of the whole pipeline, scored with micro-averaged F1.
scores = cross_val_score(
    estimator=pipeline,
    X=data,
    y=binary_label_data,
    scoring='f1_micro',
    cv=5,
    verbose=0,
)

In [None]:
# One micro-F1 value per cross-validation fold.
scores