In [1]:
import pickle
from gensim.models import KeyedVectors
# Load the word vectors from the file 'tr_word2vec', read as binary word2vec
# format (binary=True).
word2vec = KeyedVectors.load_word2vec_format('tr_word2vec', binary=True)



In [3]:
# Cache the loaded vectors as a pickle file.
# - The file is only written here, so plain "wb" replaces the read/write "wb+".
# - A binary pickle protocol is requested explicitly: the text protocol that
#   Python 2 uses by default is slow and produces much larger files for big
#   numeric arrays. pickle.load() detects the protocol by itself, so code that
#   reads this file needs no change.
with open("word2vec.pickle", "wb") as w:
    pickle.dump(word2vec, w, pickle.HIGHEST_PROTOCOL)

In [51]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions, SetupOptions
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper
from googledatastore import PropertyFilter
from google.cloud import datastore
from google.cloud.proto.datastore.v1 import query_pb2
import random

In [38]:
import nltk.data
from nltk import wordpunct_tokenize
from nltk.tokenize import WordPunctTokenizer

# Sentence splitter loaded from ./tokenizer/punkt_turkish.pickle, plus a
# word/punctuation tokenizer.
_sentence_tokenizer = nltk.data.load("./tokenizer/punkt_turkish.pickle")
word_tokenizer = WordPunctTokenizer()

# Each line of the abbreviation file contributes the text before its first ':'.
# Surrounding whitespace is stripped so that a line without ':' does not
# register an entry ending in a newline, and blank lines are skipped instead of
# adding an empty entry.
abbreviations = set()
with open("./tokenizer/abbreviations-long.txt") as f:
    for line in f:
        abbreviation = line.split(':')[0].strip()
        if abbreviation:
            abbreviations.add(abbreviation)

# Replace the abbreviation set on the loaded tokenizer's parameters with the
# entries collected above.
_sentence_tokenizer._params.abbrev_types = abbreviations


def sentences_from_text(self=None, text=None):
    """Split `text` into sentences with the module-level sentence tokenizer.

    This is a module-level function, yet it was declared with a leading `self`
    parameter, so the natural call `sentences_from_text(text)` raised TypeError.
    Both parameters are now optional so that every call shape works:

      sentences_from_text(text)
      sentences_from_text(text=text)
      sentences_from_text(anything, text)   # original two-argument form

    Args:
      self: unused placeholder kept for backward compatibility; when `text` is
        not supplied, the value passed here is taken as the text.
      text: the string to split; leading/trailing whitespace is stripped first.
    Returns:
      Whatever `_sentence_tokenizer.tokenize` returns for the stripped text.
    Raises:
      TypeError: if no text was supplied at all.
    """
    if text is None:
        # Single-argument call: the text landed in the placeholder slot.
        text = self
    if text is None:
        raise TypeError("sentences_from_text() requires the text to split")
    return _sentence_tokenizer.tokenize(text.strip())

In [None]:
class SentenceTokenizerDoFn(beam.DoFn):
    """Split the "content" field of a converted news story into sentences."""

    def process(self, element):
        """Return the sentences found in the element's "content" value.

        Args:
          element: a mapping with a "content" key holding the story text.
        Returns:
          The result of sentences_from_text() for that text; Beam emits each
          item of the returned iterable as a separate output element.
        """
        # The previous body instantiated a `Tokenizer` class that is neither
        # defined nor imported in this notebook. The module-level helper does
        # the same job; its first parameter is an unused placeholder, hence None.
        return sentences_from_text(None, element["content"])

In [None]:

class ObjConverter(beam.DoFn):
    """Convert a Cloud Datastore entity into a plain dict of string fields."""

    @staticmethod
    def _string_property(element, name):
        """Return the string_value of property `name`, or "" when it is absent."""
        prop = element.properties.get(name)
        return prop.string_value if prop else ""

    def process(self, element):
        """Emit one dict holding the string fields of a news entity.

        Args:
          element: an entity exposing a `properties` mapping whose values carry
            a `string_value` attribute.
        Yields:
          A single dict with the keys "link", "title", "description", "content"
          and "published". A property missing from the entity maps to "".
        """
        # Two defects are fixed here:
        #  * the description text used to be assigned to `link`, which clobbered
        #    the link and left "description" holding the raw property object;
        #  * the dict was returned directly, but Beam treats the return value of
        #    process() as the iterable of outputs, so the record must be yielded
        #    to be emitted as one element.
        yield {
            "link": self._string_property(element, "link"),
            "title": self._string_property(element, "title"),
            "description": self._string_property(element, "description"),
            # The entity stores the story body under "text"; it is exposed
            # downstream as "content".
            "content": self._string_property(element, "text"),
            "published": self._string_property(element, "published"),
        }

In [56]:
# Build the pipeline options and configure the Google Cloud view of them:
# project, a job name with a random numeric suffix, and the GCS staging/temp
# locations.
options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'news-197916'
google_cloud_options.job_name = 'sso' + str(random.randint(1, 9999))
google_cloud_options.staging_location = 'gs://news-197916.appspot.com/word_count/'
google_cloud_options.temp_location = 'gs://news-197916.appspot.com/df_tmp'
# Select the Dataflow runner.
options.view_as(StandardOptions).runner = 'DataflowRunner'

# Extra package requirements are taken from requirements.txt.
setup_options = options.view_as(SetupOptions)
setup_options.requirements_file = "requirements.txt"
# NOTE(review): the transforms in this notebook reference notebook-level names
# (ObjConverter, sentences_from_text). Confirm whether save_main_session has to
# be enabled for those names to be available on the Dataflow workers.
# setup_options.save_main_session = True

# Pipeline object that the later cells attach transforms to and run.
p = beam.Pipeline(options=options)

In [57]:

# Query selecting entities of kind "News_Entry".
query = query_pb2.Query()
query.kind.add().name = "News_Entry"

# Read the entities and convert each one with ObjConverter.
objs = (p
    | 'Read From Datastore' >> ReadFromDatastore(project = google_cloud_options.project, query=query)
    | 'Convert to Object' >> beam.ParDo(ObjConverter())
       )

# ObjConverter builds plain dicts, so the story text is read with x["content"];
# the former attribute access (x.content) raises AttributeError on a dict.
# The first parameter of sentences_from_text is an unused placeholder, so None
# is passed explicitly and the text goes in the second position.
sentences = (objs
    | 'Sentence Tokenization' >> beam.FlatMap(lambda x: sentences_from_text(None, x["content"]))
            )

# Write the sentences to text output under the GCS prefix below.
results = (sentences
    | 'Write Results' >> beam.io.WriteToText("gs://news-197916.appspot.com/sents.txt")
          )



In [58]:
# Run the pipeline and keep the returned result handle. The wait call below is
# left commented out, so this cell does not call wait_until_finish().
result = p.run()
# result.wait_until_finish()
# Display the result object as the cell output.
result

<DataflowPipelineResult <Job
 createTime: u'2018-05-03T12:21:15.830228Z'
 currentStateTime: u'1970-01-01T00:00:00Z'
 id: u'2018-05-03_05_21_15-192254392226200960'
 location: u'us-central1'
 name: u'sso9484'
 projectId: u'news-197916'
 stageStates: []
 steps: []
 tempFiles: []
 type: TypeValueValuesEnum(JOB_TYPE_BATCH, 1)> at 0xcc2c1d0L>

In [59]:
# Display the pipeline object's repr as the cell output.
p

<apache_beam.pipeline.Pipeline at 0xd2e0fd0>