In [1]:
# import dependencies
import numpy as np
import pandas as pd
!pip install gensim
from sklearn import utils 
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split




In [2]:
# Read data set
df = pd.read_csv('uci-news-aggregator.csv') 
df.head(3)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
2,3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550


In [3]:
a=df['TITLE'][1]
print(a)

Fed's Charles Plosser sees high bar for change in pace of tapering


In [4]:
df.columns

Index(['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME',
       'TIMESTAMP'],
      dtype='object')

In [5]:
df['PUBLISHER'].value_counts() #frequency of each Publisher in the datset

Reuters                            3902
Huffington Post                    2455
Businessweek                       2395
Contactmusic.com                   2334
Daily Mail                         2254
                                   ... 
Broadway Blog \(blog\)                1
Breckenridge American                 1
SanFranciscoSentinel.com              1
Wicked Local North Attleborough       1
Journeyman Weekly                     1
Name: PUBLISHER, Length: 10985, dtype: int64

In [6]:
news=df[['TITLE','PUBLISHER']] #news datset containing only article text(TITLE) and publisher name
print(news.head())

                                               TITLE          PUBLISHER
0  Fed official says weak data caused by weather,...  Los Angeles Times
1  Fed's Charles Plosser sees high bar for change...           Livemint
2  US open: Stocks fall after Fed official hints ...       IFA Magazine
3  Fed risks falling 'behind the curve', Charles ...       IFA Magazine
4  Fed's Plosser: Nasty Weather Has Curbed Job Gr...          Moneynews


In [7]:
#filtering articles of more frequent publishers
filtered_publishers=(news['PUBLISHER'].value_counts()>2000 )& (news['PUBLISHER'].value_counts()<3000)

In [8]:
filtered_publishers[filtered_publishers]

Huffington Post     True
Businessweek        True
Contactmusic.com    True
Daily Mail          True
NASDAQ              True
Examiner.com        True
Name: PUBLISHER, dtype: bool

In [9]:
required_publishers=filtered_publishers[filtered_publishers].index
news_articles=news[news['PUBLISHER'].isin(required_publishers)]

In [10]:
news_articles.sample(20)

Unnamed: 0,TITLE,PUBLISHER
236419,"Angelina Jolie On Motherhood, Future Collabora...",Contactmusic.com
416011,Fugitive 'who violated his parole' is captured...,Daily Mail
398941,Watch Baby Loggerhead Sea Turtles Hatch On Flo...,Huffington Post
104653,4 Ways You May Be Able to Prepare and File You...,NASDAQ
316724,Good News on Health-Care Spending Is Making US...,Businessweek
144389,Moderate Gains for Technology Shares; Cbeyond ...,NASDAQ
305555,Puts Shrink Amid Longest Run of Market Calm Si...,Businessweek
33766,Anna Wintour - Anna Wintour Defends Kim Kardas...,Contactmusic.com
51225,Obamacare's 6-Million Target Hit as Exchange S...,Businessweek
411010,Movie review: 'Guardians of the Galaxy' a retu...,Examiner.com


In [11]:
news_articles['PUBLISHER'].value_counts()

Huffington Post     2455
Businessweek        2395
Contactmusic.com    2334
Daily Mail          2254
NASDAQ              2228
Examiner.com        2085
Name: PUBLISHER, dtype: int64

In [12]:
news_articles

Unnamed: 0,TITLE,PUBLISHER
5,Plosser: Fed May Have to Accelerate Tapering Pace,NASDAQ
15,Forex - Pound drops to one-month lows against ...,NASDAQ
19,"Euro Anxieties Wane as Bunds Top Treasuries, S...",Businessweek
20,Noyer Says Strong Euro Creates Unwarranted Eco...,Businessweek
63,EBay CEO Donahoe's Pay Drops by 53% to $13.8 M...,Businessweek
...,...,...
422198,Overweight kids are bullying targets,Examiner.com
422277,"Deadly Ebola could affect up to 20000 people, ...",Daily Mail
422300,UN: Ebola Could Eventually Infect 20000 People,Huffington Post
422309,"Ebola Cases May Surpass 20000, WHO Says in Upd...",Businessweek


In [13]:
def label_text(corpus, publisher_name) :
    doc = []
    text =enumerate(corpus)
    label = publisher_name
    doc.append(TaggedDocument(text.split(), [label]))
    return doc

In [14]:
news_articles['TITLE'].iloc[0]

'Plosser: Fed May Have to Accelerate Tapering Pace'

In [15]:
for index, row in news_articles.iterrows():
    row["TITLE"]=  row["TITLE"].lower().split()

In [16]:
news_articles.head()

Unnamed: 0,TITLE,PUBLISHER
5,"[plosser:, fed, may, have, to, accelerate, tap...",NASDAQ
15,"[forex, -, pound, drops, to, one-month, lows, ...",NASDAQ
19,"[euro, anxieties, wane, as, bunds, top, treasu...",Businessweek
20,"[noyer, says, strong, euro, creates, unwarrant...",Businessweek
63,"[ebay, ceo, donahoe's, pay, drops, by, 53%, to...",Businessweek


In [17]:
#Tokeniizing the article text and adding label
documents = []
for index, row in news_articles.iterrows():

    label = row["PUBLISHER"]
    documents.append(TaggedDocument(row["TITLE"], [label]))

In [18]:
documents[:10]

[TaggedDocument(words=['plosser:', 'fed', 'may', 'have', 'to', 'accelerate', 'tapering', 'pace'], tags=['NASDAQ']),
 TaggedDocument(words=['forex', '-', 'pound', 'drops', 'to', 'one-month', 'lows', 'against', 'euro'], tags=['NASDAQ']),
 TaggedDocument(words=['euro', 'anxieties', 'wane', 'as', 'bunds', 'top', 'treasuries,', 'spain', 'debt', 'rallies'], tags=['Businessweek']),
 TaggedDocument(words=['noyer', 'says', 'strong', 'euro', 'creates', 'unwarranted', 'economic', 'pressure', '(1)'], tags=['Businessweek']),
 TaggedDocument(words=['ebay', 'ceo', "donahoe's", 'pay', 'drops', 'by', '53%', 'to', '$13.8', 'million', 'for', '2013'], tags=['Businessweek']),
 TaggedDocument(words=["mcdonald's", 'blames', 'weak', 'sales', 'on', 'weather'], tags=['Huffington Post']),
 TaggedDocument(words=['stock', 'market', 'news', 'for', 'march', '10,', '2014', '-', 'market', 'news'], tags=['NASDAQ']),
 TaggedDocument(words=['u.s.', 'stocks', 'open', 'lower', 'in', 'subdued', 'trade;', 'dow', 'jones', 'do

In [19]:
len(documents)

13751

In [20]:

#Defining Doc2vec model and building vocabulary

model_dbow = Doc2Vec(dm=0, vector_size=3000, negative=5, min_count=1, alpha=0.09, min_alpha=0.09)
model_dbow.build_vocab(documents)

for epoch in range(30):
    model_dbow.train(utils.shuffle(documents), total_examples=len(documents), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

In [21]:
def D2V(news):
    # input_news=str(input('Please paste yur article here: '));
    input_news=str(news);
    vector_of_input_news= model_dbow.infer_vector(input_news.split())
    source_guess,source_similarity=model_dbow.dv.most_similar(vector_of_input_news, topn = 1)[0];
    return source_guess

In [22]:
# test_news=input('Please paste yur article here: ');
test_news='Rishi Sunak Wins Tory Leadership Race And Will Become Prime Minister'
result=D2V(test_news)
print('Predicted source of this input by our model is: '+ result)

Predicted source of this input by our model is: Examiner.com
