In [37]:
from google.cloud import bigquery
# Create a BigQuery client with no explicit arguments.
# Note: the GOOGLE_APPLICATION_CREDENTIALS and GOOGLE_CLOUD_PROJECT environment
# variables were set up beforehand, per
# https://google-cloud-python.readthedocs.io/en/latest/bigquery/usage.html#id3
# NOTE(review): `client` is not referenced by the other cells visible in this
# section (the query is run through pandas_gbq) -- confirm it is still needed.
client = bigquery.Client() 

In [38]:
# Standard-SQL query against the public Hacker News stories table.
#
# Inner SELECT: extract the host part of each story URL, keep only hosts that
# end in a literal ".com" and titles longer than 10 characters, and take the
# host label immediately left of the TLD as `source` (e.g. "nytimes").
# Outer SELECT: keep TechCrunch / NYTimes stories newer than 2015-01-01 and
# replace every title character outside [a-zA-Z0-9 $.-] with a space.
#
# The ".com" filter uses '[.]com$' rather than '.com$': an unescaped '.' is a
# regex wildcard, so hosts such as "intercom" would pass the filter, and a
# dot-less host would then make [OFFSET(1)] fail with an out-of-bounds error.
# A character class is used so that no backslash escaping is needed in either
# the Python string or the SQL string literal.
QUERY = """SELECT
              source,
              REGEXP_REPLACE(title, '[^a-zA-Z0-9 $.-]', ' ') AS title,
              time_ts,
              id
           FROM
            (SELECT
              ARRAY_REVERSE(SPLIT(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), '.'))[OFFSET(1)] AS source,
              title,
              time_ts,
              id
             FROM
              `bigquery-public-data.hacker_news.stories`
             WHERE
              REGEXP_CONTAINS(REGEXP_EXTRACT(url, '.*://(.[^/]+)/'), '[.]com$')
              AND LENGTH(title) > 10
            )
           WHERE (source = 'techcrunch' OR source = 'nytimes') AND time_ts > '2015-01-01'
           """

In [39]:
import pandas as pd
import pandas_gbq

# Google Cloud project passed to pandas_gbq as `project_id`.
PROJECT_ID = 'sgreenberg-project2'

# Run QUERY with the standard-SQL dialect and load the result into a DataFrame.
# The SQL is passed positionally: newer pandas-gbq releases name the first
# parameter `query_or_table`, so the keyword form `query=...` fails there,
# while the positional form works on both old and new releases.
df = pandas_gbq.read_gbq(QUERY, project_id=PROJECT_ID, dialect='standard')

# Adding a space between words

**TODO: Need Hayato's help to create some Japanese text that will show off the NL tokenization.**

In [3]:
#from google.cloud import language
#nl_client = language.Client()

#def wakachi(msg):
#    document = nl_client.document_from_text(msg)
#    response = document.analyze_syntax()
#    tx_wakachi = " ".join([tkn.text_content for tkn in response.tokens])
#    return tx_wakachi

#df['msg_wakachi'] = df.message.apply(wakachi)

In [41]:
# Preview the first five rows of the query result.
df.head(n=5)

Unnamed: 0,source,title,time_ts,id
0,nytimes,4chan Message Board Sold to Founder of 2Chann...,2015-09-21 22:41:41,10255585
1,nytimes,Big Price Increase for Tuberculosis Drug Is Re...,2015-09-21 22:43:57,10255601
2,nytimes,Stop Googling. Let s Talk,2015-09-29 04:50:13,10294707
3,nytimes,NYC Mayor to require computer science in city ...,2015-09-15 23:29:44,10223767
4,nytimes,Rethinking Work,2015-09-01 05:30:35,10150884


# Vectorize all sentences with TfidfVectorizer

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
import numpy as np

# Fit a TF-IDF vocabulary on the story titles and turn every title into a
# sparse feature row (sublinear_tf=True enables sublinear term-frequency scaling).
vectorizer = TfidfVectorizer(sublinear_tf=True)
features = vectorizer.fit_transform(df.title)

# Binary target as a uint8 NumPy array: 1 where the source is 'nytimes', 0 otherwise.
# `Series.as_matrix()` was removed in pandas 1.0, and newer `get_dummies`
# returns bool columns; an explicit comparison cast to uint8 plus `.values`
# gives the same 0/1 labels on both old and new pandas versions.
labels = (df.source == 'nytimes').astype(np.uint8).values

  


# Split dataset into train and test

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)

# Classify test data with Linear SVM

In [44]:
# Build the L2-penalised linear SVM (primal formulation, tol=1e-3) and train
# it on the training split; `fit` returns the estimator itself.
clf = LinearSVC(penalty='l2', dual=False, tol=1e-3).fit(X_train, y_train)

# Predictions for the held-out rows, then the score on the same rows as the
# cell's displayed value.
pred = clf.predict(X_test)
clf.score(X_test, y_test)

0.8441215323645971

# Let's classify the category of a sentence: NYTimes
The correct label is 1 (NYTimes).

In [45]:
# Hand-written headline used as a spot check; the expected class is 1, i.e.
# the 'nytimes' side of the label encoding.
txt_nyt = u'New York State Governor announces new educational vision'

# Use transform (not fit_transform) so the headline is encoded with the
# vocabulary already fitted on the story titles, then predict its class.
feature_nyt = vectorizer.transform([txt_nyt])
clf.predict(feature_nyt)

array([1], dtype=uint8)

# Let's classify the category of a sentence: TechCrunch
The correct label is 0 (TechCrunch).

In [46]:
# Hand-written headline used as a spot check; the expected class is 0, i.e.
# not the 'nytimes' side of the label encoding.
txt_techcrunch = u'YouTube Launches new channel service in partnership with Google Photos'

# Use transform (not fit_transform) so the headline is encoded with the
# vocabulary already fitted on the story titles, then predict its class.
feature_techcrunch = vectorizer.transform([txt_techcrunch])
clf.predict(feature_techcrunch)

array([0], dtype=uint8)