## ML APIs as preprocessors for scikit-learn

#### Load Dependencies

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

from google.cloud import bigquery
from google.cloud import language
from google.cloud.language import types
from google.cloud.language import enums

import pandas as pd

# Instantiate a BigQuery client with no explicit arguments.
# NOTE(review): the name `client` is rebound to a Natural Language client in a
# later cell, and no visible code calls this object before then -- confirm it
# is still needed.
client = bigquery.Client() 
# (Re)load the IPython extension shipped in google.cloud.bigquery; presumably
# this is what registers the %%bigquery cell magic used below.
%reload_ext google.cloud.bigquery

# Limit DataFrame previews to 6 rows. The full option name is spelled out
# because pandas resolves abbreviated patterns such as "max_r" by regex and
# raises OptionError when the pattern matches more than one option (recent
# pandas has both "display.max_rows" and "styler.render.max_rows").
pd.set_option("display.max_rows", 6)

#### Load results from BigQuery into a pandas DataFrame

In [None]:
%%bigquery df
-- Select the source and title columns of every row, ordered by id.
-- The "df" argument on the magic line presumably names the DataFrame that
-- receives the result (later cells read df.title).
SELECT source, title FROM 
`sgreenberg-project2.misc_ml.hacker_news_stories`
ORDER BY id

#### First attempt: tokenize the raw titles with TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer (sublinear term-frequency scaling, no custom
# tokenizer) on the raw titles, then show the learned vocabulary as a table
# of (term, index) pairs.
vectorizer = TfidfVectorizer(sublinear_tf=True)
vectorizer.fit(df["title"])

vocab = vectorizer.vocabulary_
vocab_df = pd.DataFrame(vocab.items())
vocab_df

#### Use the Natural Language API to tokenize

In [None]:
# Natural Language API client used by tokenize() below.
# NOTE: this rebinds `client`, which previously held the BigQuery client.
client = language.LanguageServiceClient()

def tokenize(title):
    """Return *title* rebuilt from the tokens found by ``analyze_syntax``.

    The title is wrapped in a plain-text ``Document`` and passed to the
    module-level ``client``; the text content of every returned token is
    then joined with single spaces.
    """
    doc = types.Document(
        content=title,
        type=enums.Document.Type.PLAIN_TEXT,
    )
    response = client.analyze_syntax(doc)
    words = (token.text.content for token in response.tokens)
    return " ".join(words)

# Run every title through tokenize() (one analyze_syntax call per row) and
# keep the result in a new column.
df["tokenized_title"] = df["title"].apply(tokenize)

#### Second attempt: build the vocabulary from the API-tokenized titles

In [None]:
# Refit the same vectorizer, this time on the API-tokenized titles, and show
# the resulting vocabulary as a table of (term, index) pairs.
vectorizer.fit(df["tokenized_title"])

vocab = vectorizer.vocabulary_
vocab_df = pd.DataFrame(vocab.items())
vocab_df