**This notebook follows the spaCy video tutorials made by spaCy's developers.**

**The tutorial's main purpose is to detect programming languages in Stack Overflow data.**

**Here are the links for the [first](https://youtu.be/WnGPv6HnBok) and [second](https://youtu.be/KL4-Mpgbahw) videos of the tutorial.**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read only the columns needed (question id and title) from the first million rows.
df = pd.read_csv("../input/stacksample/Questions.csv", nrows=1_000_000,
                encoding="ISO-8859-1", usecols=['Title', 'Id'])
# Plain Python list of titles; .tolist() matches how the rest of the notebook
# converts columns and avoids using `_` as a real loop variable.
titles = df['Title'].tolist()
df.head()


In [None]:
import random

# Peek at 20 titles picked at random (random.choices draws with replacement).
random.choices(titles, k=20)

In [None]:
import spacy

# Load spaCy's small English pipeline; `nlp` is called on text in the cells below.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Parse a short example sentence and keep its first token for inspection.
doc = nlp("Show all sensors.")
token = doc[0]

In [None]:
# Tip: type `token.` and press Tab to list the attributes available on the object.
# Here we look at one of them, `is_stop`.
token.is_stop

In [None]:
from spacy import displacy

# Render a visualisation of the parsed document with displacy.
displacy.render(doc)

In [None]:
# What do labels such as "det" or "advmod" in the graph mean?
# spacy.explain looks up a description for a label.
spacy.explain("det")
# spacy.explain("advmod")

In [None]:
# The information shown by displacy.render can also be printed as text:
# one line per token with its part-of-speech tag and dependency label.
for tok in doc:
    print(tok, tok.pos_, tok.dep_)

In [None]:
# Now let's try to find appearances of "go" in the titles.

# Re-read the questions, this time taking the first two million rows.
df = (pd.read_csv("../input/stacksample/Questions.csv", nrows=2_000_000, 
                  encoding="ISO-8859-1", usecols=['Title', 'Id']))

# Cheap substring pre-filter: keep only titles containing "go" (case-insensitive)
# so the spaCy pipeline has far fewer texts to parse.
titles = df.loc[lambda d: d['Title'].str.lower().str.contains("go")]['Title'].tolist()

# Load the model once, with NER disabled for better performance.
# (The previous version loaded the full model first and immediately replaced it.)
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [None]:
%%time

def has_golang(doc):
    """Tell whether ``doc`` mentions Go as a noun.

    Args:
        doc: An iterable of tokens exposing ``lower_`` and ``pos_``.

    Returns:
        True if any token is "go" or "golang" (case-insensitive via
        ``lower_``) and its ``pos_`` is "NOUN"; False otherwise.
    """
    return any(
        tok.lower_ in ("go", "golang") and tok.pos_ == "NOUN"
        for tok in doc
    )

# Lazily parse the titles and keep only the docs flagged by has_golang.
g = (doc for doc in nlp.pipe(titles) if has_golang(doc))
# Show at most 30 matches. zip stops as soon as range(30) is exhausted (without
# pulling a 31st doc from `g`) or when `g` runs dry, so having fewer than 30
# matches gives a shorter list instead of an uncaught StopIteration.
[hit for _, hit in zip(range(30), g)]

In [None]:
# Load the question tags and collect the ids of questions tagged exactly 'go'.
df_tags = pd.read_csv("../input/stacksample/Tags.csv")
go_ids = df_tags.loc[df_tags['Tag'].eq('go'), 'Id']

def has_go_token(doc):
    """Tell whether ``doc`` contains a non-verb "go"/"golang" token.

    Args:
        doc: An iterable of tokens exposing ``lower_`` and ``pos_``.

    Returns:
        True if any token is "go" or "golang" (compared via ``lower_``) and
        its ``pos_`` is anything other than "VERB"; False otherwise.
    """
    return any(
        tok.lower_ in ('go', 'golang') and tok.pos_ != 'VERB'
        for tok in doc
    )

# Titles of questions tagged 'go', and the subset of them that has_go_token flags.
all_go_sentences = df.loc[df['Id'].isin(go_ids), 'Title'].tolist()
detectable = [parsed.text for parsed in nlp.pipe(all_go_sentences) if has_go_token(parsed)]

# Titles of questions NOT tagged 'go' that still contain the substring "go" ...
non_detectable = df.loc[~df['Id'].isin(go_ids)]
non_detectable = non_detectable.loc[
    non_detectable['Title'].str.lower().str.contains("go"), 'Title'
].tolist()

# ... narrowed down to the ones has_go_token flags anyway.
non_detectable = [parsed.text for parsed in nlp.pipe(non_detectable) if has_go_token(parsed)]

len(all_go_sentences), len(detectable), len(non_detectable)

In [None]:
# Keep the model name in a variable so it can be reported alongside the scores below.
model_name = "en_core_web_sm"
# Load the pipeline under test with the NER component disabled.
model = spacy.load(model_name, disable=["ner"])

def has_go_token(doc):
    """Heuristic under benchmark: a non-verb "go"/"golang" that is a ``pobj``.

    The benchmark run in this cell is logged under the method name
    "not-verb-but-pobj", so the dependency-label check is part of the rule.
    Without it the predicate is the same one used to build the evaluation
    lists and every score is trivially determined.

    Args:
        doc: An iterable of tokens exposing ``lower_``, ``pos_`` and ``dep_``.

    Returns:
        True if any token is "go" or "golang" (compared via ``lower_``), its
        ``pos_`` is not "VERB" and its ``dep_`` is "pobj"; False otherwise.
    """
    for t in doc:
        if t.lower_ in ["go", "golang"]:
            if t.pos_ != "VERB" and t.dep_ == "pobj":
                return True
    return False

# Label stored with the scores so runs of different heuristics can be told apart.
method = "not-verb-but-pobj"

# `detectable` is used as the positive set and `non_detectable` as the negative set:
# a flagged positive counts as correct, a flagged negative counts as wrong.
correct = sum(has_go_token(doc) for doc in model.pipe(detectable))
wrong = sum(has_go_token(doc) for doc in model.pipe(non_detectable))

# Guard each ratio so that a heuristic which flags nothing (or an empty
# evaluation list) reports 0.0 instead of raising ZeroDivisionError.
n_flagged = correct + wrong
n_total = len(detectable) + len(non_detectable)
precision = correct/n_flagged if n_flagged else 0.0
recall = correct/len(detectable) if detectable else 0.0
accuracy = (correct + len(non_detectable) - wrong)/n_total if n_total else 0.0

f"{precision},{recall},{accuracy},{model_name},{method}" # this is logged

In [None]:
# Part 2 of the tutorial.
# "go" can now be detected as a programming language; let's find other languages as well.

# Load the pipeline again, this time without disabling any component.
nlp = spacy.load("en_core_web_sm")

def has_go_token(doc):
    """Tell whether ``doc`` contains a non-verb token naming a known language.

    The name is kept from the earlier Go-only version; the token list now also
    covers python, ruby and objective-c.

    Args:
        doc: An iterable of tokens exposing ``lower_`` and ``pos_``.

    Returns:
        True if any token's ``lower_`` is one of the listed names and its
        ``pos_`` is not "VERB"; False otherwise.
    """
    language_names = ('go', 'golang', 'python', 'ruby', 'objective-c')
    for tok in doc:
        if tok.lower_ not in language_names:
            continue
        if tok.pos_ != 'VERB':
            return True
    return False

In [None]:
doc = nlp("i am an iOS dev and I like to code in objective-c")

# "objective-c" is the tricky case: it is split into three tokens (objective, -, c),
# so a single-token comparison cannot find it. A Matcher pattern is used for it below.
list(doc)

In [None]:
from spacy.matcher import Matcher

# Token patterns for spaCy's Matcher; each pattern is a list with one dict per token.

# "objective-c" / "objective c": the punctuation token in the middle is optional.
obj_c_pattern1 = [
    {"LOWER": "objective"},
    {"IS_PUNCT": True, "OP": "?"},
    {"LOWER": "c"},
]
# "objectivec" written as a single token.
obj_c_pattern2 = [{"LOWER": "objectivec"}]

# "golang" always counts; a bare "go" only when it is not tagged as a verb.
golang_pattern1 = [{"LOWER": "golang"}]
golang_pattern2 = [{"LOWER": "go", "POS": {"NOT_IN": ["VERB"]}}]

# Single-token language names.
python_pattern = [{"LOWER": "python"}]
ruby_pattern = [{"LOWER": "ruby"}]
js_pattern = [{"LOWER": {"IN": ["js", "javascript"]}}]

In [None]:
# Build a matcher on the pipeline's vocabulary; validate=True checks the patterns when they are added.
matcher = Matcher(nlp.vocab, validate=True)
# Register both Objective-C patterns under one key. The patterns are passed as a
# single list: the older `add(key, None, *patterns)` form is rejected by spaCy v3,
# while the list form is accepted by spaCy v2.2.2+ and v3.
matcher.add("OBJ_C_LANG", [obj_c_pattern1, obj_c_pattern2])

In [None]:
# Calling the matcher returns (match_id, start, end) tuples, where start/end are token indices.
matches = matcher(doc)
# Show the matched spans by slicing the doc with the reported indices rather than
# hard-coded ones (previously the matcher result was discarded and doc[11:14] was shown).
[doc[start:end] for match_id, start, end in matches]

In [None]:
# Register the remaining language patterns, one key per language. Patterns are passed
# as a list: the older `add(key, None, *patterns)` form is rejected by spaCy v3,
# while the list form is accepted by spaCy v2.2.2+ and v3.
matcher.add("PYTHON_LANG", [python_pattern])
matcher.add("GO_LANG", [golang_pattern1, golang_pattern2])
matcher.add("JS_LANG", [js_pattern])
matcher.add("RUBY_LANG", [ruby_pattern])

In [None]:
# Try the matcher on a sentence that mentions several languages and print each matched span.
doc = nlp("I am an iOS dev who codes in both python, go/golang as well as objective-c")
for _key, begin, stop in matcher(doc):
    print(doc[begin:stop])

In [None]:
# Problem case: here the model takes "go" as a verb, so the GO_LANG pattern skips it.
doc = nlp("I've done some js and ruby and go programming")
for _key, begin, stop in matcher(doc):
    print(doc[begin:stop])

In [None]:
# Quick benchmark: among titles that contain "python", print the ones the matcher misses.

# Lazy stream of titles containing "python" (case-insensitive).
titles = (title for title in df['Title'] if "python" in title.lower())

# Check at most 200 titles. zip stops once range(200) is exhausted or the stream
# runs dry, so fewer than 200 candidates no longer ends in an uncaught StopIteration.
for _, title in zip(range(200), titles):
    doc = nlp(title)
    if len(matcher(doc)) == 0:
        print(doc)