In [1]:
import pandas as pd

# Read the first million questions, keeping only the two columns used below.
df = pd.read_csv(
    "data/Questions.csv",
    usecols=["Title", "Id"],
    encoding="ISO-8859-1",
    nrows=1_000_000,
)

# Independent copy of the title column, so later work never touches `df`.
titles = df["Title"].copy()

titles

0         SQLStatement.execute() - multiple queries in o...
1         Good branching and merging tutorials for Torto...
2                                         ASP.NET Site Maps
3                        Function for creating color wheels
4         Adding scripting functionality to .NET applica...
                                ...                        
999995    GStreamer tutorial on Android getting 'Unsuppo...
999996                I am getting an undefined index error
999997    Bootstrapping to estimate the mean of a geomet...
999998                                Ruby: HTTP Put method
999999                          Call Functions in CakePHP 3
Name: Title, Length: 1000000, dtype: object

In [2]:
import random

# Eyeball a random handful of titles (drawn with replacement, as before).
# A dedicated generator with a fixed seed keeps the sample identical across
# kernel restarts, and drawing from a plain list avoids depending on the
# Series' integer labels lining up with positions.
random.Random(42).choices(titles.tolist(), k=20)

['Canceling Threads',
 'Hide a Collection_Select Option',
 'Choose path of folder using HTML or JavaScript',
 'Passing data to a method when a button is tapped',
 'COUNTIFS Statement Multiple Tabs in Google Spreadsheet',
 'Simulating file system access',
 'Boost.Log to file and stdout simultaneously?',
 'What mutation-testing frameworks exist for C/C++?',
 'JAXB binding XBRL element does not work',
 'css table center not working in Firefox',
 'golang: get current scope of function name',
 'Set Tableview background with Image and Label',
 'Sharing config files & database connection strings across enterprise',
 'flex dynamic radio button,how to get the value of selected radio button',
 'Jquery mobile page shows previously viewed data',
 'GtkListStore - how to center text?',
 'How do you add a button to a CGridView?',
 'hyperic jmx monitoring: plugin does not exist in HQ',
 'Compare two Values across two Tables',
 'Is resetting the array a valid way to break from Array.forEach?']

In [3]:
def has_golang(text: str) -> bool:
    """Naive first attempt: look for the lowercase word "go" with a space on both sides.

    The match is case-sensitive and purely textual, so it cannot tell the
    verb "go" apart from the language name.
    """
    needle = " go "
    return needle in text

# Lazy stream of matching titles: nothing is scanned until items are requested.
g = (candidate for candidate in titles if has_golang(candidate))

In [4]:
# Pull the next two matches from the generator; the loop counter itself is unused.
[next(g) for _ in range(2)]

['Where does Console.WriteLine go in ASP.NET?',
 'Should try...catch go inside or outside a loop?']

In [5]:
import en_core_web_sm

# Load the small English pipeline through its own package. The result is not
# assigned to anything, so this cell only displays the loaded object.
en_core_web_sm.load()

<spacy.lang.en.English at 0x106789240>

In [6]:
import spacy

# Load the pipeline by name and keep it as `nlp`; the cells below call it to parse text.
nlp = spacy.load("en_core_web_sm")

In [10]:
# Parsing a sentence yields an iterable of tokens; materialise it to look at them.
list(nlp("My name is Shun."))

[My, name, is, Shun, .]

In [11]:
# Parse the example sentence once and keep the result for the inspection cells below.
doc = nlp("My name is Shun.")

In [12]:
# Take the first token of the parsed sentence and display it.
token = doc[0]
token

My

In [15]:
# NOTE(review): without a trailing underscore this attribute shows an integer ID
# (see the output below), unlike the string-valued `pos_` / `dep_` used in other
# cells; `token.lemma_` is presumably the readable form that was wanted - confirm.
token.lemma

561228191312463089

In [16]:
from spacy import displacy

# Draw the parsed example sentence with displaCy.
displacy.render(doc)

In [24]:
# One line per token: its text, part-of-speech tag and dependency label.
for t in doc:
    print(" ".join([str(t), t.pos_, t.dep_]))

My DET poss
name NOUN nsubj
is VERB ROOT
Shun PROPN attr
. PUNCT punct


In [21]:
# Ask spaCy what the `poss` dependency label stands for.
spacy.explain("poss")

'possession modifier'

In [26]:
# Same token / POS / dependency listing, this time for a title in which "go" is a verb.
for t in nlp("Where does Console.WriteLine go in ASP.NET?"):
    print(f"{t} {t.pos_} {t.dep_}")

Where ADV advmod
does VERB ROOT
Console PROPN nsubj
. PUNCT punct
WriteLine PROPN nsubj
go VERB ROOT
in ADP prep
ASP.NET PROPN pobj
? PUNCT punct


In [29]:
# Reload with a higher row limit than the first cell (2M instead of 1M rows).
df = pd.read_csv(
    "data/Questions.csv",
    nrows=2_000_000,
    encoding="ISO-8859-1",
    usecols=["Title", "Id"],
)

# Cheap pre-filter: keep only titles containing the substring "go" in any
# casing, so the slower spaCy checks run on far fewer rows.
# `na=False` makes a missing title count as "no match"; without it a NaN would
# end up in the boolean mask and `.loc` would refuse to index with it.
mentions_go = df["Title"].str.lower().str.contains("go", na=False)

# Single `.loc[rows, column]` lookup instead of chaining `.loc[...]["Title"]`.
titles = df.loc[mentions_go, "Title"].copy()

titles

1          Good branching and merging tutorials for Torto...
27                               Good STL-like library for C
35                My website got hacked... What should I do?
53                   DVCS Choices - What's good for Windows?
57         Is a "Confirm Email" input good practice when ...
                                 ...                        
1264130        Google & Yandex Crawling but site not opening
1264162    Is it idiomatic in go to handle all returned e...
1264181    d3 chart and google font not visible on github...
1264192    Google Apps Script (Calendar) - Compare if 2 e...
1264196    Django Internal Server 500 Error AWS Elastic-B...
Name: Title, Length: 49589, dtype: object

In [43]:
def has_golang(text: str):
    """Parse `text` with `nlp` and report whether it holds a non-verb "go"/"golang" token.

    A token counts when its lowercase form is "go" or "golang" and its
    part-of-speech tag is anything other than "VERB".
    """
    keywords = ("go", "golang")
    return any(
        tok.lower_ in keywords and tok.pos_ != "VERB"
        for tok in nlp(text)
    )
# Lazily filter the pre-selected titles, then look at the first 25 hits.
g = (candidate for candidate in titles if has_golang(candidate))
[next(g) for _ in range(25)]

['Deploying multiple Java web apps to Glassfish in one go',
 'Removing all event handlers in one go',
 'How to Create a Dropdown List Hyperlink without the GO button?',
 'How do I disable multiple listboxes in one go using jQuery?',
 'Embedding instead of inheritance in Go',
 'Shared library in Go?',
 'multi package makefile example for go',
 "What's the point of having pointers in Go?",
 'Simulate a tcp connection in Go',
 '<canvas> Go/Baduk/Weiqi Game Board',
 "How to listen for iPhone keyboard action/touch (ex, 'GO', 'Search', etc)",
 'SOAPUI & Groovy Scripts, executing multiple SQL statements in one go',
 "What's the simplest way to edit conflicted files in one go when using git and an editor like Vim or textmate?",
 'Import large chunk of data into Google App Engine Data Store at one go',
 'How many records can be loaded into Salesforce using Apex Data Loader in one go?',
 'How can I run multiple inserts with NHibernate in one go?',
 'Convert string to integer type in Go?',
 'Inst

In [44]:
# Inspect the parse of one of the hits listed above, where "GO" appears to be a
# button label rather than the language.
displacy.render(nlp("How to Create a Dropdown List Hyperlink without the GO button"))

In [38]:
# Inspect the parse of a title that spells the language name as "Golang".
displacy.render(nlp("Golang: http server leaving open goroutines"))

In [39]:
# Inspect the parse of a title that ends with "in Go".
displacy.render(nlp("Embedding instead of inheritance in Go"))

In [40]:
# Ask spaCy what the `pobj` dependency label stands for.
spacy.explain("pobj")

'object of preposition'

In [41]:
# Inspect the parse of a title that uses the phrase "in one go".
displacy.render(nlp("Removing all handlers in one go"))

In [54]:
%%time

def has_golang(text: str):
    """Parse `text` with `nlp` and look for "go"/"golang" used as a prepositional object.

    Returns True as soon as one token has lowercase form "go" or "golang",
    a part-of-speech tag other than "VERB" and the dependency label "pobj";
    returns False when no token qualifies.
    """
    for tok in nlp(text):
        if tok.lower_ not in ("go", "golang"):
            continue
        # Skip verb uses and anything that is not the object of a preposition.
        if tok.pos_ == "VERB" or tok.dep_ != "pobj":
            continue
        return True
    return False
    
# Lazily filter the pre-selected titles with the stricter check and show the first five hits.
g = (candidate for candidate in titles if has_golang(candidate))
[next(g) for _ in range(5)]

CPU times: user 13.1 s, sys: 214 ms, total: 13.3 s
Wall time: 15.5 s


['Embedding instead of inheritance in Go',
 'Shared library in Go?',
 'multi package makefile example for go',
 "What's the point of having pointers in Go?",
 'Simulate a tcp connection in Go']

In [51]:
# Parse a title that uses the phrase "in one go" to see how its tokens are labelled.
one_go = nlp("How can I run multiple inserts with NHibernate in one go")

# One line per token: text, part-of-speech tag, dependency label.
for token in one_go:
    print(f"{token} {token.pos_} {token.dep_}")

# Then draw the same parse with displaCy.
displacy.render(one_go)

How ADV advmod
can VERB aux
I PRON nsubj
run VERB ROOT
multiple ADJ amod
inserts NOUN dobj
with ADP prep
NHibernate PROPN pobj
in ADP prep
one NUM pobj
go NOUN advmod


In [57]:
%%time

def has_golang(doc):
    """Report whether an already-parsed `doc` mentions "go"/"golang" as a prepositional object.

    Unlike the earlier text-based versions, this one does no parsing itself:
    it only iterates over the tokens of `doc`. A token qualifies when its
    lowercase form is "go" or "golang", its part-of-speech tag is not "VERB"
    and its dependency label is "pobj".
    """
    return any(
        tok.lower_ in ("go", "golang") and tok.pos_ != "VERB" and tok.dep_ == "pobj"
        for tok in doc
    )
    
# Feed the titles through `nlp.pipe` instead of parsing inside the predicate,
# keep the parsed documents that pass the check, and show the first five.
g = (parsed for parsed in nlp.pipe(titles) if has_golang(parsed))
[next(g) for _ in range(5)]

CPU times: user 3.1 s, sys: 1.98 s, total: 5.08 s
Wall time: 7.01 s


[Embedding instead of inheritance in Go,
 Shared library in Go?,
 multi package makefile example for go,
 What's the point of having pointers in Go?,
 Simulate a tcp connection in Go]

In [60]:
# Tag table; each row read here has at least a `Tag` and an `Id` column.
df_tags = pd.read_csv("data/Tags.csv")

# Ids of the rows whose tag is exactly "go".
is_go_tag = df_tags["Tag"] == "go"
go_ids = df_tags.loc[is_go_tag, "Id"].copy()

go_ids

98267       1724680
98367       1726130
98457       1727250
100482      1757090
101172      1766720
             ...   
3746985    40110670
3747206    40112250
3748186    40120850
3750374    40138660
3750837    40142060
Name: Id, Length: 1858, dtype: int64

In [61]:
def has_go_token(doc):
    """Report whether any token of `doc`, lowercased, is exactly "go" or "golang".

    No grammatical filtering happens here; a verb "go" counts as well.
    """
    lowered = (tok.lower_ for tok in doc)
    return any(word in ("go", "golang") for word in lowered)

In [62]:
# Titles of every loaded question whose Id carries the "go" tag.
all_go_sentences = df.loc[df["Id"].isin(go_ids), "Title"].tolist()

# Tagged titles for which has_go_token returns True after parsing.
detectable = [d.text for d in nlp.pipe(all_go_sentences) if has_go_token(d)]

# Untagged titles that contain the substring "go" in any casing. These are
# only candidates; the token-level check follows below. The variable keeps its
# original spelling so anything else referring to it keeps working.
# `na=False` treats a missing title as "no match" instead of leaving NaN in the
# boolean mask, and one `.loc[rows, column]` lookup replaces the chained
# `.loc[...].loc[...]["Title"]`.
non_dectable = df.loc[
    ~df["Id"].isin(go_ids) & df["Title"].str.lower().str.contains("go", na=False),
    "Title",
].tolist()

# Untagged titles for which has_go_token returns True after parsing.
non_detectable = [d.text for d in nlp.pipe(non_dectable) if has_go_token(d)]

len(all_go_sentences), len(detectable), len(non_detectable)

(1858, 1208, 1696)

In [65]:
# Load the pipeline under a separate name with its "ner" component disabled.
# `model_name` is kept in a variable because the summary string at the end of
# this cell reports it.
model_name = "en_core_web_sm"
model = spacy.load(model_name, disable=["ner"])

def has_go_token(doc):
    """Report whether parsed `doc` holds "go"/"golang" as a non-verb prepositional object.

    This replaces the earlier token-only definition of the same name with the
    stricter rule: lowercase form "go" or "golang", part-of-speech tag other
    than "VERB", dependency label "pobj".
    """
    def is_language_mention(tok):
        # The checks run left to right and stop at the first one that fails.
        return tok.lower_ in ("go", "golang") and tok.pos_ != "VERB" and tok.dep_ == "pobj"

    return any(map(is_language_mention, doc))
    
# Label for this rule; it is reported verbatim in the summary string.
method = "not-verb-but-pboj"

# Titles flagged by the rule: among tagged titles these are hits, among
# untagged titles they are false alarms.
correct = sum(map(has_go_token, model.pipe(detectable)))
wrong = sum(map(has_go_token, model.pipe(non_detectable)))

n_tagged, n_untagged = len(detectable), len(non_detectable)
flagged = correct + wrong
correctly_ignored = n_untagged - wrong

precision = correct / flagged
recall = correct / n_tagged
accuracy = (correct + correctly_ignored) / (n_tagged + n_untagged)

f"precision: {precision}, recall: {recall}, accuracy: {accuracy}, model_name: {model_name}, method: {method}"

'precision: 0.9465811965811965, recall: 0.3667218543046358, accuracy: 0.727961432506887, model_name: en_core_web_sm, method: not-verb-but-pboj'