In [143]:
import matplotlib
%matplotlib inline
import sys
import tarfile
from pathlib import Path
import matplotlib.pyplot as plt
from IPython.display import SVG, display

import numpy as np
import pandas as pd

import spacy
from spacy import displacy
import textacy
from textacy.extract import ngrams, entities

In [144]:
import spacy
nlp = spacy.load('en_core_web_sm')
# nlp2 = spacy.load('en_core_web_md')
# nlp3 = spacy.load('en_core_web_lg')

In [145]:
type(nlp)

spacy.lang.en.English

In [146]:
nlp.lang

'en'

---
# Spacy
---

In [147]:
sample_text = 'Apple is looking at buying U.K. startup for $1 billion'
doc = nlp(sample_text)

In [24]:
pd.DataFrame([[t.text, t.lemma_, t.pos_, t.tag_, t.dep_, t.shape_,
t.is_alpha, t.is_stop] for t in doc],
columns=['text', 'lemma', 'pos', 'tag', 'dep', 'shape',
'is_alpha', 'is_stop'])

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop
0,Apple,Apple,PROPN,NNP,nsubj,Xxxxx,True,False
1,is,be,AUX,VBZ,aux,xx,True,True
2,looking,look,VERB,VBG,ROOT,xxxx,True,False
3,at,at,ADP,IN,prep,xx,True,True
4,buying,buy,VERB,VBG,pcomp,xxxx,True,False
5,U.K.,U.K.,PROPN,NNP,compound,X.X.,False,False
6,startup,startup,NOUN,NN,dobj,xxxx,True,False
7,for,for,ADP,IN,prep,xxx,True,True
8,$,$,SYM,$,quantmod,$,False,False
9,1,1,NUM,CD,compound,d,False,False


### loaing the data

In [28]:
DATA_DIR = Path('..', 'data')

In [29]:
files = (DATA_DIR / 'bbc').glob('**/*.txt')
bbc_articles = []
for i, file in enumerate(sorted(list(files))):
    with file.open(encoding='latin1') as f:
        lines = f.readlines()
        body = ' '.join([l.strip() for l in lines[1:]]).strip()
        bbc_articles.append(body)

In [30]:
len(bbc_articles)

2226

In [31]:

bbc_articles[0]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL\'s existing customers for high-

## sentences

In [33]:
doc = nlp(bbc_articles[0])
sentences = [s for s in doc.sents]
sentences[:3]

[Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.  ,
 The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.,
 TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.]

## POS

In [34]:
pd.DataFrame([[t.text, t.pos_, spacy.explain(t.pos_)] for t in sentences[0]], 
             columns=['Token', 'POS Tag', 'Meaning']).head(15)

Unnamed: 0,Token,POS Tag,Meaning
0,Quarterly,ADJ,adjective
1,profits,NOUN,noun
2,at,ADP,adposition
3,US,PROPN,proper noun
4,media,NOUN,noun
5,giant,NOUN,noun
6,TimeWarner,PROPN,proper noun
7,jumped,VERB,verb
8,76,NUM,numeral
9,%,NOUN,noun


## Entity

In [148]:
pd.DataFrame([[t.text, t.ent_type_, spacy.explain(t.ent_type_)] for t in sentences[0]], 
             columns=['Token', 'entity', 'Meaning']).head(30).dropna()

Unnamed: 0,Token,entity,Meaning
0,Quarterly,DATE,Absolute or relative dates or periods
3,US,GPE,"Countries, cities, states"
6,TimeWarner,ORG,"Companies, agencies, institutions, etc."
8,76,PERCENT,"Percentage, including ""%"""
9,%,PERCENT,"Percentage, including ""%"""
12,1.13bn,MONEY,"Monetary values, including unit"
18,the,DATE,Absolute or relative dates or periods
19,three,DATE,Absolute or relative dates or periods
20,months,DATE,Absolute or relative dates or periods
21,to,DATE,Absolute or relative dates or periods


In [39]:
displacy.render(sentences[0].as_doc(), style='ent', jupyter=True)

In [40]:
entities = [e.text for e in entities(doc)]
pd.Series(entities).value_counts().head()

TimeWarner        7
AOL               5
fourth quarter    3
Google            2
US                2
dtype: int64

In [69]:
len(bbc_articles)
str_=" ".join(bbc_articles[0:500])
doc=nlp(str_)

#### using textacy

In [72]:
entities_ = [e.text for e in entities(doc)]
pd.Series(entities_).value_counts().head(10)

US       681
UK       188
2004     178
Yukos    175
China    159
two      143
2005     141
one      129
year     127
India    126
dtype: int64

#### using spacy

In [119]:
df=pd.DataFrame([[t.lemma_, t.ent_type_, spacy.explain(t.ent_type_)] for t in doc], 
             columns=['Token', 'entity','meaning']).dropna()

In [150]:
df.dropna().head(5)

Unnamed: 0,Token,entity,meaning
0,quarterly,DATE,Absolute or relative dates or periods
3,US,GPE,"Countries, cities, states"
6,TimeWarner,ORG,"Companies, agencies, institutions, etc."
8,76,PERCENT,"Percentage, including ""%"""
9,%,PERCENT,"Percentage, including ""%"""


In [151]:
df[(df['entity']!="DATE") & (df['entity']!="PERCENT")
  & (df['entity']!="MONEY")].head(5)

Unnamed: 0,Token,entity,meaning
3,US,GPE,"Countries, cities, states"
6,TimeWarner,ORG,"Companies, agencies, institutions, etc."
39,one,CARDINAL,Numerals that do not fall under another type
45,Google,ORG,"Companies, agencies, institutions, etc."
61,TimeWarner,ORG,"Companies, agencies, institutions, etc."


In [152]:
df[(df['entity']!="DATE") & (df['entity']!="PERCENT")
  & (df['entity']!="MONEY" )].Token.value_counts().head(5)

US       732
the      682
's       220
China    197
Bank     194
Name: Token, dtype: int64

## Ngrams using textacy

In [136]:
pd.Series([n.text for n in ngrams(doc, n=2, min_freq=2)]).value_counts().head()

chief executive    110
stock market        70
economic growth     69
Deutsche Boerse     63
New York            62
dtype: int64

In [137]:
pd.Series([n.text for n in ngrams(doc, n=3, min_freq=2)]).value_counts().head()

Bank of England            35
world's biggest            29
London Stock Exchange      22
told the BBC               21
Securities and Exchange    19
dtype: int64

In [138]:
pd.Series([n.text for n in ngrams(doc, n=4, min_freq=2)]).value_counts().head()

said in a statement                   31
Securities and Exchange Commission    19
fraud and tax evasion                 11
President George W Bush               10
chairman and chief executive           9
dtype: int64

In [139]:
pd.Series([n.text for n in ngrams(doc, n=5, min_freq=2)]).value_counts().head()

political ambitions of its founder    9
slowdown in the housing market        9
ambitions of its founder Mikhail      6
according to the Financial Times      5
hundreds of millions of dollars       5
dtype: int64

---
# NTLK
---

In [156]:

%matplotlib inline
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# spacy, textblob and nltk for language processing
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer

# sklearn for feature extraction & modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB         # Naive Bayes
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# from sklearn.externals import joblib
import joblib

In [157]:
path = Path('..', 'data', 'bbc')
files = sorted(list(path.glob('**/*.txt')))
doc_list = []
for i, file in enumerate(files):
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()
    body = ' '.join([l.strip() for l in article[1:]]).strip()
    doc_list.append([topic, heading, body])

In [159]:
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])
docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226 entries, 0 to 2225
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    2226 non-null   object
 1   heading  2226 non-null   object
 2   body     2226 non-null   object
dtypes: object(3)
memory usage: 52.3+ KB


In [161]:
docs.head()

Unnamed: 0,topic,heading,body
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarner...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against t...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuko...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices fo...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Domec...


In [162]:
article = docs.sample(1).squeeze()

In [163]:
article

topic                                               business
heading                              German economy rebounds
body       Germany's economy, the biggest among the 12 co...
Name: 438, dtype: object

In [164]:
parsed_body = TextBlob(article.body)

### Tokenization

In [165]:
parsed_body.words

WordList(['Germany', "'s", 'economy', 'the', 'biggest', 'among', 'the', '12', 'countries', 'sharing', 'the', 'euro', 'grew', 'at', 'its', 'fastest', 'rate', 'in', 'four', 'years', 'during', '2004', 'driven', 'by', 'strong', 'exports', 'Gross', 'domestic', 'product', 'GDP', 'rose', 'by', '1.7', 'last', 'year', 'the', 'statistical', 'office', 'said', 'The', 'economy', 'contracted', 'in', '2003', 'Foreign', 'sales', 'increased', 'by', '8.2', 'last', 'year', 'compared', 'with', 'a', '0.3', 'slide', 'in', 'private', 'consumption', 'Concerns', 'remain', 'however', 'over', 'the', 'strength', 'of', 'the', 'euro', 'weak', 'domestic', 'demand', 'and', 'a', 'sluggish', 'labour', 'market', 'The', 'European', 'Central', 'Bank', 'ECB', 'left', 'its', 'benchmark', 'interest', 'rate', 'unchanged', 'at', '2', 'on', 'Thursday', 'It', 'is', 'the', 'nineteenth', 'month', 'in', 'a', 'row', 'that', 'the', 'ECB', 'has', 'not', 'moved', 'borrowing', 'costs', 'Economists', 'predict', 'that', 'an', 'increase', 

### sentence

In [167]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ashamsa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [168]:
parsed_body.sentences

[Sentence("Germany's economy, the biggest among the 12 countries sharing the euro, grew at its fastest rate in four years during 2004, driven by strong exports."),
 Sentence("Gross domestic product (GDP) rose by 1.7% last year, the statistical office said."),
 Sentence("The economy contracted in 2003."),
 Sentence("Foreign sales increased by 8.2% last year, compared with a 0.3% slide in private consumption."),
 Sentence("Concerns remain, however, over the strength of the euro, weak domestic demand and a sluggish labour market."),
 Sentence("The European Central Bank (ECB) left its benchmark interest rate unchanged at 2% on Thursday."),
 Sentence("It is the nineteenth month in a row that the ECB has not moved borrowing costs."),
 Sentence("Economists predict that an increase is unlikely to come until the second half of 2005, with growth set to sputter rather than ignite."),
 Sentence(""During 2004 we profited from the fact that the world economy was strong," said Stefan Schilbe, analyst

### stemming

In [170]:
stemmer = SnowballStemmer('english')
[(word, stemmer.stem(word)) for i, word in enumerate(parsed_body.words) 
 if word.lower() != stemmer.stem(parsed_body.words[i])]

[('Germany', 'germani'),
 ('economy', 'economi'),
 ('countries', 'countri'),
 ('sharing', 'share'),
 ('its', 'it'),
 ('years', 'year'),
 ('during', 'dure'),
 ('exports', 'export'),
 ('domestic', 'domest'),
 ('statistical', 'statist'),
 ('office', 'offic'),
 ('economy', 'economi'),
 ('contracted', 'contract'),
 ('sales', 'sale'),
 ('increased', 'increas'),
 ('compared', 'compar'),
 ('private', 'privat'),
 ('consumption', 'consumpt'),
 ('Concerns', 'concern'),
 ('however', 'howev'),
 ('domestic', 'domest'),
 ('its', 'it'),
 ('unchanged', 'unchang'),
 ('moved', 'move'),
 ('borrowing', 'borrow'),
 ('costs', 'cost'),
 ('Economists', 'economist'),
 ('increase', 'increas'),
 ('unlikely', 'unlik'),
 ('ignite', 'ignit'),
 ('During', 'dure'),
 ('profited', 'profit'),
 ('economy', 'economi'),
 ('Schilbe', 'schilb'),
 ('exports', 'export'),
 ('domestic', 'domest'),
 ('remains', 'remain'),
 ('Many', 'mani'),
 ('consumers', 'consum'),
 ('spooked', 'spook'),
 ('unsettled', 'unsettl'),
 ('government',

### Lemmatization

In [171]:
import nltk
nltk.download('wordnet')
[(word, word.lemmatize()) for i, word in enumerate(parsed_body.words) 
 if word != parsed_body.words[i].lemmatize()]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ashamsa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('countries', 'country'),
 ('its', 'it'),
 ('years', 'year'),
 ('exports', 'export'),
 ('sales', 'sale'),
 ('its', 'it'),
 ('has', 'ha'),
 ('costs', 'cost'),
 ('was', 'wa'),
 ('exports', 'export'),
 ('consumers', 'consumer'),
 ('attempts', 'attempt'),
 ('companies', 'company'),
 ('talks', 'talk'),
 ('unions', 'union'),
 ('jobs', 'job'),
 ('costs', 'cost'),
 ('measures', 'measure')]

In [172]:
[(word, word.lemmatize(pos='v')) for i, word in enumerate(parsed_body.words) 
 if word != parsed_body.words[i].lemmatize(pos='v')]

[('sharing', 'share'),
 ('grew', 'grow'),
 ('driven', 'drive'),
 ('exports', 'export'),
 ('rose', 'rise'),
 ('said', 'say'),
 ('contracted', 'contract'),
 ('increased', 'increase'),
 ('compared', 'compare'),
 ('left', 'leave'),
 ('is', 'be'),
 ('has', 'have'),
 ('moved', 'move'),
 ('borrowing', 'borrow'),
 ('costs', 'cost'),
 ('is', 'be'),
 ('profited', 'profit'),
 ('was', 'be'),
 ('said', 'say'),
 ('exports', 'export'),
 ('remains', 'remain'),
 ('been', 'be'),
 ('spooked', 'spook'),
 ('unsettled', 'unsettle'),
 ('attempts', 'attempt'),
 ('companies', 'company'),
 ('including', 'include'),
 ('spent', 'spend'),
 ('talks', 'talk'),
 ('trimming', 'trim'),
 ('jobs', 'job'),
 ('costs', 'cost'),
 ('warned', 'warn'),
 ('are', 'be'),
 ('cutting', 'cut'),
 ('measures', 'measure')]

### sentiment

In [173]:
parsed_body.sentiment

Sentiment(polarity=0.009434865900383142, subjectivity=0.31810344827586207)

In [174]:
parsed_body.sentiment_assessments

Sentiment(polarity=0.009434865900383142, subjectivity=0.31810344827586207, assessments=[(['strong'], 0.4333333333333333, 0.7333333333333333, None), (['gross'], 0.0, 0.0, None), (['domestic'], 0.0, 0.1, None), (['rose'], 0.6, 0.95, None), (['last'], 0.0, 0.06666666666666667, None), (['foreign'], -0.125, 0.125, None), (['last'], 0.0, 0.06666666666666667, None), (['private'], 0.0, 0.375, None), (['weak'], -0.375, 0.625, None), (['domestic'], 0.0, 0.1, None), (['european'], 0.0, 0.0, None), (['central'], 0.0, 0.25, None), (['left'], 0.0, 0.0, None), (['unlikely'], -0.5, 0.5, None), (['second'], 0.0, 0.0, None), (['half'], -0.16666666666666666, 0.16666666666666666, None), (['strong'], 0.4333333333333333, 0.7333333333333333, None), (['domestic'], 0.0, 0.1, None), (['poor'], -0.4, 0.6, None), (['much'], 0.2, 0.2, None), (['many'], 0.5, 0.5, None), (['german'], 0.0, 0.0, None), (['corporate'], 0.0, 0.0, None), (['major'], 0.0625, 0.5, None), (['spent'], -0.1, 0.1, None), (['much'], 0.2, 0.2, N