In [2]:
!mkdir zips

mkdir: cannot create directory ‘zips’: File exists


In [3]:
!wget https://s3.amazonaws.com/data.patentsview.org/download/g_patent_abstract.tsv.zip




In [4]:
!mv g_patent_abstract.tsv.zip zips

In [5]:
!unzip zips/g_patent_abstract.tsv.zip -d data/

Archive:  zips/g_patent_abstract.tsv.zip
  inflating: data/g_patent_abstract.tsv  


In [7]:
!head -n 3 data/g_patent_abstract.tsv

"patent_id"	"patent_abstract"
"10000000"	"A frequency modulated (coherent) laser detection and ranging system includes a read-out integrated circuit formed with a two-dimensional array of detector elements each including a photosensitive region receiving both return light reflected from a target and light from a local oscillator, and local processing circuitry sampling the output of the photosensitive region four times during each sample period clock cycle to obtain quadrature components. A data bus coupled to one or more outputs of each of the detector elements receives the quadrature components from each of the detector elements for each sample period and serializes the received quadrature components. A processor coupled to the data bus receives the serialized quadrature components and determines an amplitude and a phase for at least one interfering frequency corresponding to interference between the return light and the local oscillator light using the quadrature components."
"10000

In [38]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [37]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, Word2Vec, StopWordsRemover
from pyspark.ml.recommendation import ALS
import numpy as np

In [38]:
spark = SparkSession.builder\
.config("spark.executor.memory", "4g") \
.config("spark.executor.instances", "4") \
.config("spark.executor.cores", "2") \
.config("spark.driver.memory", "4g") \
.getOrCreate()

In [39]:
abstracts = spark.read.format("csv").options(inferSchema = "True", header = "True", sep = "\t" )\
            .load("./data/g_patent_abstract.tsv")

                                                                                

In [40]:
abstracts.show()

+---------+--------------------+
|patent_id|     patent_abstract|
+---------+--------------------+
| 10000000|A frequency modul...|
| 10000001|The injection mol...|
| 10000002|The present inven...|
| 10000003|The invention rel...|
| 10000004|The present inven...|
| 10000005|A vacuum forming ...|
| 10000006|A thermoforming m...|
| 10000007|An expanding tool...|
| 10000008|A decorated strip...|
| 10000009|In sterile, addit...|
| 10000010|3-D printing syst...|
| 10000011|To reduce distort...|
| 10000014|The present inven...|
| 10000015|A hermetically se...|
| 10000016|A film edge seali...|
| 10000017|The invention rel...|
| 10000018|A stretch release...|
| 10000019|An installation a...|
| 10000020|A three-dimension...|
| 10000021|There is provided...|
+---------+--------------------+
only showing top 20 rows



In [41]:
abstract = abstracts.withColumn("patent_id", abstracts["patent_id"].cast("int")).limit(10000)

In [42]:
abstract.select("patent_id").printSchema()

root
 |-- patent_id: integer (nullable = true)



In [43]:
abstract.show()

+---------+--------------------+
|patent_id|     patent_abstract|
+---------+--------------------+
| 10000000|A frequency modul...|
| 10000001|The injection mol...|
| 10000002|The present inven...|
| 10000003|The invention rel...|
| 10000004|The present inven...|
| 10000005|A vacuum forming ...|
| 10000006|A thermoforming m...|
| 10000007|An expanding tool...|
| 10000008|A decorated strip...|
| 10000009|In sterile, addit...|
| 10000010|3-D printing syst...|
| 10000011|To reduce distort...|
| 10000014|The present inven...|
| 10000015|A hermetically se...|
| 10000016|A film edge seali...|
| 10000017|The invention rel...|
| 10000018|A stretch release...|
| 10000019|An installation a...|
| 10000020|A three-dimension...|
| 10000021|There is provided...|
+---------+--------------------+
only showing top 20 rows



In [44]:
abstract.count()

10000

In [45]:
tokenizer = Tokenizer(inputCol="patent_abstract", outputCol="words")
wordsData = tokenizer.transform(abstract.dropna())
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
wordsData = remover.transform(wordsData)

In [46]:
def create_vocab(words_data):
    vocab = defaultdict(lambda: len(vocab))
    for row in words_data.collect():
        for word in row['filtered']:
            vocab[word]
    return dict(vocab)

In [47]:
vocab = create_vocab(wordsData)
reverse_vocab = {index: word for word, index in vocab.items()}

                                                                                

In [48]:
window_size = 10

def create_cooccurrence_data(words_data, vocab, window_size):
    cooccurrence_data = []
    for row in words_data.collect():
        words = row['filtered']
        for i, word in enumerate(words):
            word_id = vocab[word]
            start = max(0, i - window_size)
            end = min(len(words), i + window_size + 1)
            for j in range(start, end):
                if i != j:
                    context_word_id = vocab[words[j]]
                    cooccurrence_data.append((word_id, context_word_id, 1))
    return cooccurrence_data

In [49]:
cooccurrence_data = create_cooccurrence_data(wordsData, vocab, window_size)

# Convert to DataFrame
cooccurrence_df = spark.createDataFrame(cooccurrence_data, ["word_id", "context_word_id", "count"])

In [50]:
als = ALS(
    maxIter=15,
    regParam=0.01,
    rank=50,
    userCol="word_id",
    itemCol="context_word_id",
    ratingCol="count",
    implicitPrefs=True
)

In [51]:
model = als.fit(cooccurrence_df)

24/07/19 19:00:07 WARN TaskSetManager: Stage 725 contains a task of very large size (14177 KiB). The maximum recommended task size is 1000 KiB.
24/07/19 19:00:11 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 725 (TID 989): Attempting to kill Python Worker
24/07/19 19:00:11 WARN TaskSetManager: Stage 726 contains a task of very large size (14177 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [52]:
word_factors = model.userFactors
context_factors = model.itemFactors

In [53]:
word_embeddings = {row.id: row.features for row in word_factors.collect()}
context_embeddings = {row.id: row.features for row in context_factors.collect()}

In [54]:
wordsData.printSchema()

root
 |-- patent_id: integer (nullable = true)
 |-- patent_abstract: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [57]:
word2Vec = Word2Vec(vectorSize=10, minCount=0, inputCol="words", outputCol="result")
word2VecModel = word2Vec.fit(wordsData.filter(wordsData["words"].isNotNull()))

                                                                                

In [58]:
word2VecEmbeddings = word2VecModel.getVectors()

In [59]:
def find_similar_words(word, embeddings, vocab, reverse_vocab, top_n=5):
    word_id = vocab.get(word, None)
    if word_id is None:
        return []

    word_vector = np.array(embeddings[word_id])
    similarities = []
    
    for other_word_id, other_word_vector in embeddings.items():
        if other_word_id != word_id:
            other_word_vector = np.array(other_word_vector)
            similarity = np.dot(word_vector, other_word_vector) / (np.linalg.norm(word_vector) * np.linalg.norm(other_word_vector))
            similarities.append((reverse_vocab[other_word_id], similarity))
    
    similarities.sort(key=lambda x: -x[1])
    return similarities[:top_n]

In [60]:
query_word = "nucleus"

In [61]:
similar_words_als = find_similar_words(query_word, word_embeddings, vocab, reverse_vocab)

In [62]:
similar_words_w2v = word2VecModel.findSynonyms(query_word, 5).collect()

In [63]:
print(f"Similar words to '{query_word}' using ALS:")
for word, similarity in similar_words_als:
    print(f"{word}: {similarity}")

Similar words to 'nucleus' using ALS:
searching;: 0.8494426914706388
pooled-data: 0.8428253686397938
number”: 0.8325180771152887
points-of-interest: 0.8321896405497218
“selection: 0.8296246209389496


In [64]:
print(f"\nSimilar words to '{query_word}' using Word2Vec:")
for row in similar_words_w2v:
    print(f"{row['word']}: {row['similarity']}")


Similar words to 'nucleus' using Word2Vec:
spint2,: 0.9273282885551453
(n): 0.9157333374023438
force-fit: 0.9087634086608887
humidity,: 0.8863179087638855
demonstrated: 0.8819639682769775
