## Importing Relavant Packages

In [98]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import re
import itertools

import spacy
import en_core_web_sm

nlp = en_core_web_sm.load()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ramya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### We had several text files as articles in a specific path, so each textfile can be pushed as single entity in a dataframe

In [99]:
import os

directory = r'C:\Users\ramya\Downloads\bbc-fulltext\bbc\business'

num=0
business={}
for filename in os.listdir(directory):
    if filename.endswith(".txt") :
        num=num+1
        x_file = open(os.path.join(directory, filename), "r")

        words=x_file.read()
        business[num]=words   

### Example- A typical article

In [100]:
print (business[4])

High fuel prices hit BA's profits

British Airways has blamed high fuel prices for a 40% drop in profits.

Reporting its results for the three months to 31 December 2004, the airline made a pre-tax profit of Â£75m ($141m) compared with Â£125m a year earlier. Rod Eddington, BA's chief executive, said the results were "respectable" in a third quarter when fuel costs rose by Â£106m or 47.3%. BA's profits were still better than market expectation of Â£59m, and it expects a rise in full-year revenues.

To help offset the increased price of aviation fuel, BA last year introduced a fuel surcharge for passengers.

In October, it increased this from Â£6 to Â£10 one-way for all long-haul flights, while the short-haul surcharge was raised from Â£2.50 to Â£4 a leg. Yet aviation analyst Mike Powell of Dresdner Kleinwort Wasserstein says BA's estimated annual surcharge revenues - Â£160m - will still be way short of its additional fuel costs - a predicted extra Â£250m. Turnover for the quarter was up

In [101]:
pd.set_option('display.min_rows', 500)

In [102]:
df = DataFrame(list(business.items()),columns = ['filename','article'])
df.set_index("filename",inplace=True)

# Cleaning

### Removing Punctuations and Symbols

In [103]:
df["article"]=df["article"].apply(lambda x: re.sub("\W+"," ",x))

In [104]:
df["article"][4]

'High fuel prices hit BA s profits British Airways has blamed high fuel prices for a 40 drop in profits Reporting its results for the three months to 31 December 2004 the airline made a pre tax profit of Â 75m 141m compared with Â 125m a year earlier Rod Eddington BA s chief executive said the results were respectable in a third quarter when fuel costs rose by Â 106m or 47 3 BA s profits were still better than market expectation of Â 59m and it expects a rise in full year revenues To help offset the increased price of aviation fuel BA last year introduced a fuel surcharge for passengers In October it increased this from Â 6 to Â 10 one way for all long haul flights while the short haul surcharge was raised from Â 2 50 to Â 4 a leg Yet aviation analyst Mike Powell of Dresdner Kleinwort Wasserstein says BA s estimated annual surcharge revenues Â 160m will still be way short of its additional fuel costs a predicted extra Â 250m Turnover for the quarter was up 4 3 to Â 1 97bn further benef

In [105]:
df.head()

Unnamed: 0_level_0,article
filename,Unnamed: 1_level_1
1,Ad sales boost Time Warner profit Quarterly pr...
2,Dollar gains on Greenspan speech The dollar ha...
3,Yukos unit buyer faces loan claim The owners o...
4,High fuel prices hit BA s profits British Airw...
5,Pernod takeover talk lifts Domecq Shares in UK...


### Defining G20 Countries and Nationalities

In [106]:
G20=["Argentina", "Australia", "Brazil", "Canada", "China", "Germany", "France", "India", "Indonesia",\
      "Japan", "Mexico", "Russia", \
     "Saudi Arabia", "Italy","South Africa", "Korea", "Turkey", "UK","US","USA","United States","United Kingdom","America",\
     ]
Europe=["Austria","Belgium","Latvia",\
"Bulgaria","Lithuania","Croatia","Luxembourg","Cyprus","Malta","Czechia","Netherlands",\
"Denmark","Poland","Estonia","Portugal","Finland","Romania","France","Slovakia","Slovenia",\
"Greece","Spain","Hungary","Sweden","Europe",\
            "Italian","British","Hungary","Finland","Finnish","Sweden",
            "European","Belgian","Latvian","Polish","Czech","Swedish","Swiss","Spanish",\
             "greek","Dutch","Croatian"]
UK=["England", "Scotland", "Wales","Ireland","Scottish","Irish"]

In [107]:
Nationality=["Chinese","Russian","South African","Korean","Argentine","Argentinean","Australian","Brazilian","Canadian",\
               "Japanese","Mexican","American","Turkish","Indian","Indonesian","German","Italian"]

In [108]:
Nationality_dict={"Chinese":"China","Russian":"Russia",\
                  "South African":"South African","Korean":"Korea",\
                  "Argentine":"Argentina","Argentinean":"Argentina",\
                  "Australian":"Australia","Brazilian":"Brazil","Canadian":"Canada",\
               "Japanese":"Japan","Mexican":"Mexico",\
                  "American":"US","Turkish":"Turkey","Indian":"India","Indonesian":"Indonesia",\
                  "German":"Germany","Italian":"Italy","America":"US","American":"US","United States":"US",\
                  "United Kingdom":"UK","South African":"South Africa","USA":"US"}

### Removing stopwords, numbers

In [109]:
all_stopwords = nlp.Defaults.stop_words

In [110]:
all_stopwords.add("The")
all_stopwords.add("It")
all_stopwords.add("In")
all_stopwords.add("But")
all_stopwords.add("said")
all_stopwords.add("Mr")
all_stopwords.add("year")

In [111]:


def removestop(article):
    
    text_tokens = word_tokenize(article)
    tokens_without_sw= [word for word in text_tokens if not word.lower() in all_stopwords and word.isalpha() and len(word)>1]

    return (" ".join(tokens_without_sw))

In [112]:
df["article"]=df["article"].apply(lambda x: removestop(x))

### Entity Recognition using Spacy NLP

In [113]:
def entityrec(article):
    return list({(ent.text) for ent in nlp(article).ents if ent.label_ not in ("DATE","MONEY","TIME","PERCENT",\
                                                                                     "QUANTITY","ORDINAL","CARDINAL","PERSON")})
    

### Example- For the above published article 4, the recognised entities are as follows

In [114]:
entityrec(df.loc[4][0])

['Airbus',
 'Club World',
 'United States BA',
 'BA',
 'British Airways',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

In [115]:
df["entities"]=df["article"].apply(lambda x : " ".join(entityrec(x)))

In [116]:
df["entities_list"]=df["article"].apply(entityrec)

In [117]:
df.head()

Unnamed: 0_level_0,article,entities,entities_list
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,Securities Exchange Commission SEC AOL Time Wa...,"[Securities Exchange Commission, SEC, AOL Time..."
2,Dollar gains Greenspan speech dollar hit highe...,Friday Federal Reserve Bank America China Fede...,"[Friday Federal Reserve, Bank America, China, ..."
3,Yukos unit buyer faces loan claim owners embat...,Legal Russian State Russia Yukos Menatep Group...,"[Legal, Russian, State, Russia, Yukos, Menatep..."
4,High fuel prices hit BA profits British Airway...,Airbus Club World United States BA BA British ...,"[Airbus, Club World, United States BA, BA, Bri..."
5,Pernod takeover talk lifts Domecq Shares UK dr...,Scotland Allied Chivas Regal Scotch Paris Fren...,"[Scotland, Allied, Chivas Regal Scotch, Paris,..."


In [118]:
df["entities"][4]

'Airbus Club World United States BA BA British Airways Dresdner Kleinwort Wasserstein Brul BNP Paribas'

In [119]:
df["entities_list"][4]

['Airbus',
 'Club World',
 'United States BA',
 'BA',
 'British Airways',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

### Function for Comparing Nationality and entity 

In [120]:
def directcountries(entities):
    newlist=[]
    x=" ".join(entities)
    for i in itertools.chain(G20,UK,Europe,Nationality):
        
        if i in x:
            newlist.append(i)
    return newlist

### Example- in first article these are the countries directly mentioned

In [123]:
df.loc[4][0]

'High fuel prices hit BA profits British Airways blamed high fuel prices drop profits Reporting results months December airline pre tax profit compared earlier Rod Eddington BA chief executive results respectable quarter fuel costs rose BA profits better market expectation expects rise revenues help offset increased price aviation fuel BA introduced fuel surcharge passengers October increased way long haul flights short haul surcharge raised leg aviation analyst Mike Powell Dresdner Kleinwort Wasserstein says BA estimated annual surcharge revenues way short additional fuel costs predicted extra Turnover quarter benefiting rise cargo revenue Looking ahead results March BA warned yields average revenues passenger expected decline continues lower prices face competition low cost carriers sales better previously forecast March total revenue outlook slightly better previous guidance improvement anticipated BA chairman Martin Broughton BA previously forecast rise revenue reported Friday pass

In [25]:
directcountries(df["entities_set"][4])


['United States', 'British']

In [124]:
df["direct"]=df["entities_list"].apply(lambda x : directcountries(x))

In [125]:
df["direct"].head()

filename
1                               [Europe, German]
2    [China, America, Europe, European, Chinese]
3                              [Russia, Russian]
4                       [United States, British]
5                 [France, UK, Scotland, France]
Name: direct, dtype: object

### Creating a new column by Removing directly implied countries from the original entities and then now search for indirectly implied countries

In [126]:
df["entities_list"][4]

['Airbus',
 'Club World',
 'United States BA',
 'BA',
 'British Airways',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

In [138]:
df["direct"][4]

['United States', 'British']

In [139]:
def remove_direct_countries(x):
    entities_list = x['entities_list']
    direct = x['direct']
    
    for entity in entities_list:
        for country in direct:
            if country in entity:
                entities_list.remove(entity)
    return entities_list


In [142]:
remove_direct_countries(df.loc[4])

['Airbus',
 'Club World',
 'BA',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

In [143]:
df["entities_list"]=remove_direct_countries(df)

In [144]:
df["entities_list"][4]

['Airbus',
 'Club World',
 'BA',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

In [145]:
df.loc[4][3]

['United States', 'British']

In [146]:
df.head()

Unnamed: 0_level_0,article,entities,entities_list,direct
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,Securities Exchange Commission SEC AOL Time Wa...,"[Securities Exchange Commission, SEC, AOL Time...","[Europe, German]"
2,Dollar gains Greenspan speech dollar hit highe...,Friday Federal Reserve Bank America China Fede...,"[Friday Federal Reserve, Bank America, China, ...","[China, America, Europe, European, Chinese]"
3,Yukos unit buyer faces loan claim owners embat...,Legal Russian State Russia Yukos Menatep Group...,"[Legal, Russian, State, Russia, Yukos, Menatep...","[Russia, Russian]"
4,High fuel prices hit BA profits British Airway...,Airbus Club World United States BA BA British ...,"[Airbus, Club World, BA, Dresdner Kleinwort Wa...","[United States, British]"
5,Pernod takeover talk lifts Domecq Shares UK dr...,Scotland Allied Chivas Regal Scotch Paris Fren...,"[Scotland, Allied, Chivas Regal Scotch, Paris,...","[France, UK, Scotland, France]"


In [148]:
df["entities_list"][4]

['Airbus',
 'Club World',
 'BA',
 'Dresdner Kleinwort Wasserstein',
 'Brul BNP Paribas']

### Using Wikipedia Web pages summary for extracting country

In [149]:
import wikipedia
from pprint import pprint

### Examples- the number of wikipages with Trump as a keyword

In [150]:
wikipedia.search("trump")

['Donald Trump',
 'Melania Trump',
 'Trump',
 'Ivanka Trump',
 'Fred Trump',
 'Tiffany Trump',
 'The Trump Organization',
 'Ivana Trump',
 'Eric Trump',
 'Family of Donald Trump']

In [151]:
(wikipedia.search("Labour party"))

['Labour Party (UK)',
 'Labour Party',
 '2020 Labour Party leadership election',
 'Leader of the Labour Party (UK)',
 'Labour Party (Norway)',
 'Labour Party (Ireland)',
 'New Zealand Labour Party',
 'Deputy leader of the Labour Party (UK)',
 'Labour Party (Malta)',
 'Labour Party (Netherlands)']

In [153]:
re.sub(r'\([^)].*\)', '', wikipedia.page((wikipedia.search("Airbus"))[0]).summary[:200])

"Airbus SE  is a European multinational aerospace corporation. In 2019, Airbus was the world's largest airliner manufacturer"

### Defining a function for knowing the nationality of an entity from wikipedia's first few lines of summary

In [154]:
def getting_relavant_wikipedia_pages_and_country(x):
    result=" "
    for i in x:
        if (len(wikipedia.search(i))!=0):
            try:
                info = wikipedia.page((wikipedia.search(i))[0]).summary[:200]
                first_page_summary = re.sub(r'\([^)].*\)', '', info)
                
                result=result+(first_page_summary)
            except:
                continue
    
    k= [(ent.text) for ent in nlp(result).ents if ent.label_ in ("GPE","LOC") ]
    country=[]
    
    
    for i in itertools.chain(G20,UK,Europe,Nationality):

        if i in k:
            country.append(i)
    return country
    
        

In [156]:
df.head()

Unnamed: 0_level_0,article,entities,entities_list,direct
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,Securities Exchange Commission SEC AOL Time Wa...,"[Securities Exchange Commission, SEC, AOL Time...","[Europe, German]"
2,Dollar gains Greenspan speech dollar hit highe...,Friday Federal Reserve Bank America China Fede...,"[Friday Federal Reserve, Bank America, China, ...","[China, America, Europe, European, Chinese]"
3,Yukos unit buyer faces loan claim owners embat...,Legal Russian State Russia Yukos Menatep Group...,"[Legal, Russian, State, Russia, Yukos, Menatep...","[Russia, Russian]"
4,High fuel prices hit BA profits British Airway...,Airbus Club World United States BA BA British ...,"[Airbus, Club World, BA, Dresdner Kleinwort Wa...","[United States, British]"
5,Pernod takeover talk lifts Domecq Shares UK dr...,Scotland Allied Chivas Regal Scotch Paris Fren...,"[Scotland, Allied, Chivas Regal Scotch, Paris,...","[France, UK, Scotland, France]"


In [158]:
getting_relavant_wikipedia_pages_and_country(df["entities_list"].loc[4])

['England', 'Belgium']

In [43]:
df["indirect"]=df["entities_list"].apply(lambda x : getting_relavant_wikipedia_pages_and_country(x))

### Defining a total column for directly and indirectly implied countries

In [45]:
df["total"]=df.apply(lambda x : list(set(x["direct"]+x["indirect"])),axis=1)

In [46]:
df.head()

Unnamed: 0_level_0,article,entities,entities_set,direct,indirect,total
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,SEC AOL Europe AOL Time Warner Google Time War...,"[SEC, AOL Europe, AOL Time Warner, Google, Tim...","[US, Europe, German]",[Germany],"[Europe, German, US, Germany]"
2,Dollar gains Greenspan speech dollar hit highe...,White House Friday Federal Reserve Bank Americ...,"[White House, Friday Federal Reserve, Bank Ame...","[China, US, America, Europe, European, Chinese]",[England],"[America, China, England, Europe, Chinese, Eur..."
3,Yukos unit buyer faces loan claim owners embat...,Legal Russian State Russia Yukos Reuters Yugan...,"[Legal, Russian, State, Russia, Yukos, Reuters...","[Russia, US, Russian]",[Russia],"[US, Russian, Russia]"
4,High fuel prices hit BA profits British Airway...,Airbus Club World United States BA BA British ...,"[Airbus, Club World, United States BA, BA, Bri...","[United States, British]","[England, Belgium]","[United States, Belgium, British, England]"
5,Pernod takeover talk lifts Domecq Shares UK dr...,Scotland Allied Chivas Regal Scotch Paris Fren...,"[Scotland, Allied, Chivas Regal Scotch, Paris,...","[France, UK, US, Scotland, France]","[England, Scotland]","[Scotland, France, England, UK, US]"


In [47]:
df.to_csv('business1.csv', index=False)

In [48]:
df1=df[["article","total"]]

In [49]:
df1.head()

Unnamed: 0_level_0,article,total
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,"[Europe, German, US, Germany]"
2,Dollar gains Greenspan speech dollar hit highe...,"[America, China, England, Europe, Chinese, Eur..."
3,Yukos unit buyer faces loan claim owners embat...,"[US, Russian, Russia]"
4,High fuel prices hit BA profits British Airway...,"[United States, Belgium, British, England]"
5,Pernod takeover talk lifts Domecq Shares UK dr...,"[Scotland, France, England, UK, US]"


### Defining a function for dealing with similar words like Chinese and Chine, both refer to China

In [50]:
df1.total[2]

['America', 'China', 'England', 'Europe', 'Chinese', 'European', 'US']

In [75]:
def countries_Nationalities(x):
    final_list=[]
    for i in x:
              
        if i in list(Nationality_dict.keys()):
            final_list.append(Nationality_dict[i])
        elif i in G20:
            final_list.append(i)  
        elif i in Europe:
            final_list.append("Europe")
        elif i in UK:
            final_list.append("UK")
    return list(set(final_list))

In [76]:
countries_Nationalities(df1.total[2])

['Europe', 'US', 'China', 'UK']

In [77]:
df1["final_total"]=df1["total"].apply(lambda x : countries_Nationalities(x))

### 2 countries were referred in 104 articles, 1 country was referred in 99 articles and so on

In [78]:
df1["final_total"].apply(lambda x : len(x)).value_counts()

1    151
2    135
3     99
4     44
5     33
0     27
7     10
6      7
8      3
9      1
Name: final_total, dtype: int64

In [79]:
df1.head()

Unnamed: 0_level_0,article,total,final_total
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Ad sales boost Time Warner profit Quarterly pr...,"[Europe, German, US, Germany]","[Europe, US, Germany]"
2,Dollar gains Greenspan speech dollar hit highe...,"[America, China, England, Europe, Chinese, Eur...","[Europe, US, China, UK]"
3,Yukos unit buyer faces loan claim owners embat...,"[US, Russian, Russia]","[US, Russia]"
4,High fuel prices hit BA profits British Airway...,"[United States, Belgium, British, England]","[Europe, US, UK]"
5,Pernod takeover talk lifts Domecq Shares UK dr...,"[Scotland, France, England, UK, US]","[US, France, UK]"


In [80]:
df2=pd.get_dummies(df1.final_total.apply(pd.Series).stack()).sum(level=0)

In [81]:
df2.head()

Unnamed: 0_level_0,Argentina,Australia,Brazil,Canada,China,Europe,France,Germany,India,Indonesia,Italy,Japan,Korea,Mexico,Russia,Saudi Arabia,South Africa,Turkey,UK,US
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1
5,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1


In [82]:
df2.shape

(483, 20)

### No of times each country was implied in total

In [83]:
df2.sum(axis = 0, skipna = True)

Argentina         8
Australia        27
Brazil           14
Canada           16
China            71
Europe          211
France           41
Germany          87
India            67
Indonesia        16
Italy            25
Japan            61
Korea            15
Mexico            6
Russia           49
Saudi Arabia     10
South Africa     16
Turkey           11
UK              170
US              283
dtype: int64

### First article has references to 3 countries, 2nd article to 4 countries and so on

In [89]:
df2.sum(axis=1).head()

filename
1    3
2    4
3    2
4    3
5    3
dtype: int64

### Topic Extraction

In [59]:
df_topic=df["article"]

In [60]:
df_topic

filename
1      Ad sales boost Time Warner profit Quarterly pr...
2      Dollar gains Greenspan speech dollar hit highe...
3      Yukos unit buyer faces loan claim owners embat...
4      High fuel prices hit BA profits British Airway...
5      Pernod takeover talk lifts Domecq Shares UK dr...
6      Japan narrowly escapes recession Japan economy...
7      Jobs growth slow US US created fewer jobs expe...
8      India calls fair trade rules India attends mee...
9      Ethiopia crop production Ethiopia produced mil...
10     Court rejects tobacco case US government claim...
11     Ask Jeeves tips online ad revival Ask Jeeves l...
12     Indonesians face fuel price rise Indonesia gov...
13     Peugeot deal boosts Mitsubishi Struggling Japa...
14     Telegraph newspapers axe jobs Daily Sunday Tel...
15     Air passengers win new EU rights Air passenger...
16     China keeps tight rein credit China efforts st...
17     Parmalat boasts doubled profits Parmalat Itali...
18     India rupee hit

In [94]:
from nltk.tokenize import word_tokenize, sent_tokenize
from functools import reduce

def tokenizer(text):
    
    tokens = [word_tokenize(sent) for sent in sent_tokenize(text)]
    tokens = list(reduce(lambda x,y: x+y, tokens))

    return tokens

In [95]:
df.loc[3][0]

'Yukos unit buyer faces loan claim owners embattled Russian oil giant Yukos ask buyer production unit pay loan State owned Rosneft bought Yugansk unit sale forced Russia settle tax claim Yukos Yukos owner Menatep Group says ask Rosneft repay loan Yugansk secured assets Rosneft faces similar repayment demand foreign banks Legal experts Rosneft purchase Yugansk include obligations pledged assets Rosneft pay real money creditors avoid seizure Yugansk assets Moscow based US lawyer Jamie Firestone connected case Menatep Group managing director Tim Osborne told Reuters news agency If default fight rule law exists international arbitration clauses credit Rosneft officials unavailable comment company intends action Menatep recover tax claims debts owed Yugansk Yukos filed bankruptcy protection US court attempt prevent forced sale main production arm sale went ahead December Yugansk sold little known shell company turn bought Rosneft Yukos claims downfall punishment political ambitions founder 

In [96]:
tokenizer(df.loc[3][0])

['Yukos',
 'unit',
 'buyer',
 'faces',
 'loan',
 'claim',
 'owners',
 'embattled',
 'Russian',
 'oil',
 'giant',
 'Yukos',
 'ask',
 'buyer',
 'production',
 'unit',
 'pay',
 'loan',
 'State',
 'owned',
 'Rosneft',
 'bought',
 'Yugansk',
 'unit',
 'sale',
 'forced',
 'Russia',
 'settle',
 'tax',
 'claim',
 'Yukos',
 'Yukos',
 'owner',
 'Menatep',
 'Group',
 'says',
 'ask',
 'Rosneft',
 'repay',
 'loan',
 'Yugansk',
 'secured',
 'assets',
 'Rosneft',
 'faces',
 'similar',
 'repayment',
 'demand',
 'foreign',
 'banks',
 'Legal',
 'experts',
 'Rosneft',
 'purchase',
 'Yugansk',
 'include',
 'obligations',
 'pledged',
 'assets',
 'Rosneft',
 'pay',
 'real',
 'money',
 'creditors',
 'avoid',
 'seizure',
 'Yugansk',
 'assets',
 'Moscow',
 'based',
 'US',
 'lawyer',
 'Jamie',
 'Firestone',
 'connected',
 'case',
 'Menatep',
 'Group',
 'managing',
 'director',
 'Tim',
 'Osborne',
 'told',
 'Reuters',
 'news',
 'agency',
 'If',
 'default',
 'fight',
 'rule',
 'law',
 'exists',
 'international',


In [63]:
df["tokens"]=df["article"].apply(tokenizer)

In [64]:
tokenized_words=reduce(lambda x, y: x + y, df["tokens"])

In [65]:
from collections import Counter
counter = Counter(tokenized_words)
counter.most_common(10)

[('US', 804),
 ('market', 427),
 ('company', 410),
 ('growth', 363),
 ('firm', 362),
 ('economy', 350),
 ('government', 329),
 ('new', 326),
 ('sales', 292),
 ('economic', 278)]

In [97]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=10, analyzer='word', ngram_range=(1, 2), stop_words='english')
vz = vectorizer.fit_transform(df["article"])

print(vz.shape)

(510, 1605)


In [67]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.metrics import silhouette_score

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

### Dividing the articles into 10 Clusters

In [68]:
num_clusters = 10
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, random_state=42,                       
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000, )
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)

In [69]:
for (i, desc) in (enumerate(df.article)):
    if(i < 5):
        print("Cluster " + str(kmeans_clusters[i]) + ": " + desc + 
              "(distance: " + str(kmeans_distances[i][kmeans_clusters[i]]) + ")")
        print('---')

Cluster 2: Ad sales boost Time Warner profit Quarterly profits US media giant TimeWarner jumped months December earlier firm biggest investors Google benefited sales high speed internet connections higher advert sales TimeWarner fourth quarter sales rose Its profits buoyed gains offset profit dip Warner Bros users AOL Time Warner Friday owns search engine Google internet business AOL mixed fortunes lost subscribers fourth quarter profits lower preceding quarters However company AOL underlying profit exceptional items rose stronger internet advertising revenues hopes increase subscribers offering online service free TimeWarner internet customers try sign AOL existing customers high speed broadband TimeWarner restate results following probe US Securities Exchange Commission SEC close concluding Time Warner fourth quarter profits slightly better analysts expectations film division saw profits slump helped box office flops Alexander Catwoman sharp contrast earlier final film Lord Rings tri

### Getting the Hot keywords in those Clusters

In [70]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
all_keywords = []
for i in range(num_clusters):
    topic_keywords = []
    for j in sorted_centroids[i, :15]:
        topic_keywords.append(terms[j])
    all_keywords.append(topic_keywords)

keywords_df = pd.DataFrame(index=['topic_{0}'.format(i) for i in range(num_clusters)], 
                           columns=['keyword_{0}'.format(i) for i in range(15)],
                           data=all_keywords)
keywords_df

Unnamed: 0,keyword_0,keyword_1,keyword_2,keyword_3,keyword_4,keyword_5,keyword_6,keyword_7,keyword_8,keyword_9,keyword_10,keyword_11,keyword_12,keyword_13,keyword_14
topic_0,prices,crude,oil,house,mortgage,barrel,price,housing,house prices,market,lending,figures,uk,oil prices,rose
topic_1,yukos,russian,gazprom,yugansk,oil,rosneft,russia,court,khodorkovsky,tax,auction,bankruptcy,yuganskneftegas,sale,unit
topic_2,sales,profits,gm,car,euros,profit,growth,retail,company,executive,quarter,christmas,chief executive,strong,stores
topic_3,firm,company,shares,deal,offer,takeover,new,bid,worldcom,club,ebbers,financial,group,uk,euros
topic_4,economy,growth,rate,rates,economic,consumer,manufacturing,bank,spending,figures,quarter,unemployment,jobs,rise,january
topic_5,insurance,stock,bank,sec,market,sri,sri lanka,lanka,stock market,shares,firms,indonesia,tsunami,disaster,damage
topic_6,dollar,eu,deficit,budget,economic,trade,countries,economy,euro,bush,european,currency,imf,growth,president
topic_7,airline,boeing,airlines,air,airbus,aircraft,planes,fuel,costs,passengers,carrier,staff,india,euros,company
topic_8,india,oil,indian,gas,company,rupees,foreign,car,government,market,production,investment,deal,country,firm
topic_9,china,chinese,yuan,beijing,projects,banking,government,trade,economy,state,growth,oil,bank,prices,environmental


### 10 Most common Topics

Topic-0
Oil prices, house prices

Topic-1
Russia's Oil Production

Topic-2
Europe Automotive industry results 

Topic-3
UK company shares and bidding

Topic-4
Growth in Economy, Unemployement and Jobs

Topic-5
South East Asia market 

Topic-6
Trade and Budget deficits

Topic-7
Airlines, costs and passengers

Topic-8
India Oil and Gas production

Topic-9
China government,trade and economy