Import statements

In [16]:
#import statements
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import PunktSentenceTokenizer
import pickle
from nltk.stem import *
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk.text

# Relative locations of datasets
Give the number of rows (examples) wished to extract

In [17]:
AP_NEWS_DIR = 'datasets/apnews'  
ALL_NEWS_DIR = '~/.kaggle/datasets/snapcrack/all-the-news' #https://www.kaggle.com/snapcrack/all-the-news/data
NUM_ROWS_EXTRACT = 50000

Loading Punctuation for english language. You might need to dowload using nltk.download()

In [18]:
english_sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

__Description__:<br>
This method removes the location from the beginning of the news article.<br> 
__Input__ :<br>
News Article Content<br> 
__Output__:<br>
Article with location removed<br>
__Example__:<br>
Washington - President announces ..." is changed to "President announces ..."

In [19]:
def remove_location_from_news(text):
    ts = text.split('—')
    if(len(ts[0])< 35):
        #print(ts[0])
        return '—'.join(ts[1:])
    return text
        

__Description__:<br>
This method splits the article into meaningful sentences, lemmatizes the words and removes puntuation from the input text.<br>
__Input__ :<br>
>text: News Article Content or title<br>
use_lemmatizer: Whether to use lemmatizer or not<br>
use_stemmer : Whether to use lemmatizer or not<br>
interval: Number of rows(interval) to print status of processing

__Output__:<br>
Processed News Article Content or title<br>

In [20]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
def tokenize_text(text, use_lemmatizer = True, use_stemmer = False,interval=1000):
    global error_count, run_count
    text = remove_location_from_news(text)
    run_count+=1
    if(run_count%interval==1):
        print(run_count)
    try:
        #print(text)
        sent_l = english_sent_tokenizer.tokenize(text)
        if(use_lemmatizer):
            sent_l = [' '.join([w_al for w_al in [lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(sent)] if w_al.isalnum()]) for sent in sent_l]
        if(use_stemmer):
            sent_l = [' '.join([w_al for w_al in [stemmer.stem(w) for w in nltk.wordpunct_tokenize(sent)] if w_al.isalnum()]) for sent in sent_l]
        #print(sent_l)
        return sent_l
    except Exception as e:
        print(e)
        print("Couldn't tokenize :")
        error_count+=1
        #print((text))
        return [text]

__Description__:<br>
This method is a wrapper that preprocesses the title and content of the news dataframe.<br> 
__Input__ :<br>
News data frame with 'content' and 'title' columns<br> 
__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [21]:
def parse_dataframe(df):
    df['content']=df['content'].apply(lambda x : tokenize_text(x))
    df['title']=df['title'].apply(lambda x : tokenize_text(x))
    return df

**Description:**<br>
This method converts a dataframe into a pickle format<br>
**Input:**<br>
Dataframe<br>
**Output:**<br>
Dumped pickle file containing heads, desc, and keywords (not used when training)<br>

In [22]:
def tuple2pickle(df,nr):
    heads, desc = [], []
    for index, row in df.iterrows():
        if(len(row['title'])>=1):
            heads.append(row['title'])
            desc.append(row['content'])
    with open('pickles/all-the-news_'+nr+'.pickle', 'wb') as f:
        pickle.dump([heads, desc, None], f, pickle.HIGHEST_PROTOCOL)
    print('Extracting rows into ', 'pickles/all-the-news_'+nr+'.pickle')

__Description__:<br>
This method loads, cleans, preprocesses and returns the "all the news" dataset as a dataframe with 'content' and 'title' columns.<br> 
__Input__ :<br>
>partial: Whether to load partial data or complete data<br>
rows: Number of rows to be processed if partial is True

__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [23]:
error_count = 0
run_count = 0 
def get_all_news_df(partial= True,rows = 5000):
    global error_count, run_count
    error_count = 0
    run_count = 0 
    df = pd.read_csv(ALL_NEWS_DIR+'/articles1.csv')
    df = df.append(pd.read_csv(ALL_NEWS_DIR+'/articles2.csv'))
    df = df.append(pd.read_csv(ALL_NEWS_DIR+'/articles3.csv'))
    df2 = df[['title','content']]
    if(partial):
        df3 = parse_dataframe(df2.head(rows))
        tuple2pickle(df3,str(rows))
    else: 
        df3 = parse_dataframe(df2)
        tuple2pickle(df3,'all')
    return df3

In [24]:
df3 = get_all_news_df(True, NUM_ROWS_EXTRACT)

1
1001
2001
3001
4001
5001
6001
7001
8001
9001
10001
11001
12001
13001
14001
15001
16001
17001
18001
19001
20001
21001
22001
23001
24001
25001
26001
27001
28001
29001
30001
31001
32001
33001
34001
35001
36001
37001
38001
39001
40001
41001
42001
43001
44001
45001
46001
47001
48001
49001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


50001
51001
52001
53001
54001
55001
56001
57001
58001
59001
60001
61001
62001
63001
64001
65001
66001
67001
68001
69001
70001
71001
72001
73001
74001
75001
76001
77001
78001
79001
80001
81001
82001
83001
84001
85001
86001
87001
88001
89001
90001
91001
92001
93001
94001
95001
96001
97001
98001
99001


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Extracting rows into  pickles/all-the-news_50000.pickle


In [25]:
df3.head()

Unnamed: 0,title,content
0,[House Republicans Fret About Winning Their He...,[Congressional Republicans have a new fear whe...
1,[Rift Between Officers and Residents a Killing...,[After the bullet shell get counted the blood ...
2,[Tyrus Wong Bambi Artist Thwarted by Racial Bi...,[When Walt Disney s Bambi opened in 1942 criti...
3,[Among Deaths in 2016 a Heavy Toll in Pop Musi...,[Death may be the great equalizer but it isn t...
4,[Kim Jong un Says North Korea Is Preparing to ...,[North Korea s leader Kim said on Sunday that ...
