Import statements

In [1]:
#import statements
import csv
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import PunktSentenceTokenizer
import pickle
from nltk.stem import *
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk.text

In [2]:
# Python 2 only: make UTF-8 the interpreter-wide default encoding.
# On Python 3 `reload` is not a builtin and `sys.setdefaultencoding` does not
# exist, so the hack is skipped there instead of raising NameError.
import sys

if sys.version_info[0] < 3:
    # reload() restores sys.setdefaultencoding, which site.py deletes at startup.
    reload(sys)  # noqa: F821 - builtin on Python 2 only
    sys.setdefaultencoding('utf8')

Relative locations of datasets

In [3]:
# Relative paths of the dataset directories used by this notebook.
AP_NEWS_DIR = "datasets/apnews"
# Source of the "all the news" data: https://www.kaggle.com/snapcrack/all-the-news/data
ALL_NEWS_DIR = "datasets/all-the-news"

Loading the Punkt sentence tokenizer for the English language. You might need to download it first using nltk.download()

In [4]:
# Load the English Punkt sentence tokenizer through NLTK's data loader.
# NOTE(review): this needs the Punkt resource to be present locally - presumably
# fetched with nltk.download('punkt'); confirm the resource name/path against
# the installed NLTK version.
english_sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

__Description__:<br>
This method removes the location from the beginning of the news article.<br> 
__Input__ :<br>
News Article Content<br> 
__Output__:<br>
Article with location removed<br>
__Example__:<br>
"Washington — President announces ..." is changed to " President announces ..." (the text is split on the em dash "—"; whitespace after the dash is kept)

In [5]:
def remove_location_from_news(text, max_location_len=35):
    """Strip a leading dateline (e.g. ``"WASHINGTON — "``) from an article.

    The text is cut at the first em dash. If the part before it is shorter
    than ``max_location_len`` characters it is treated as a location prefix
    and dropped together with the dash; everything after the dash (including
    any leading whitespace and later dashes) is returned unchanged.

    Parameters
    ----------
    text : str
        News article content or title.
    max_location_len : int, optional
        Prefixes at or above this length are not considered a location and
        the text is returned untouched. Defaults to 35.

    Returns
    -------
    str
        The text without its location prefix, or the original text when no
        em dash is present or the prefix is too long.
    """
    prefix, dash, remainder = text.partition('—')
    # Require the dash to actually be present: otherwise a short text without
    # any dateline (e.g. a short title) would be reduced to an empty string.
    if dash and len(prefix) < max_location_len:
        return remainder
    return text
        

__Description__:<br>
This method splits the article into meaningful sentences, lemmatizes the words and removes punctuation from the input text.<br>
__Input__ :<br>
>text: News Article Content or title<br>
use_lemmatizer: Whether to use lemmatizer or not<br>
use_stemmer: Whether to use stemmer or not<br>
interval: Number of rows(interval) to print status of processing

__Output__:<br>
Processed News Article Content or title<br>

In [6]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Progress / failure counters shared with tokenize_text. They are initialised
# here so the function can be called as soon as this cell has run.
error_count = 0
run_count = 0


def _normalize_sentence(sentence, normalize_word):
    """Tokenize one sentence, normalize every token and drop non-alphanumeric ones."""
    tokens = (normalize_word(tok) for tok in nltk.wordpunct_tokenize(sentence))
    return ' '.join(tok for tok in tokens if tok.isalnum())


def tokenize_text(text, use_lemmatizer=True, use_stemmer=False, interval=1000):
    """Split a news text into cleaned sentences.

    The location prefix is removed, the text is split into sentences with the
    English sentence tokenizer, and each sentence is re-built from its
    lemmatized and/or stemmed tokens, keeping only alphanumeric tokens. When
    both flags are False the raw sentences are returned.

    Parameters
    ----------
    text : str
        News article content or title.
    use_lemmatizer : bool
        Lemmatize every token with the WordNet lemmatizer.
    use_stemmer : bool
        Stem every token with the Porter stemmer (applied after lemmatizing
        when both flags are set).
    interval : int
        Print the running call count on the 1st call and then every
        ``interval`` calls; a falsy value disables the progress output.

    Returns
    -------
    list
        The processed sentences. If anything fails (including non-string
        input), the error is printed, the global ``error_count`` is
        incremented and ``[text]`` is returned as a best-effort fallback.
    """
    global error_count, run_count
    run_count += 1
    # Same cadence as `run_count % interval == 1` for interval > 1, but also
    # works for interval == 1 and does not divide by zero.
    if interval and (run_count - 1) % interval == 0:
        print(run_count)
    try:
        # Inside the try block so that non-string rows (e.g. NaN) take the
        # fallback path instead of aborting the whole run.
        text = remove_location_from_news(text)
        sent_l = english_sent_tokenizer.tokenize(text)
        if use_lemmatizer:
            sent_l = [_normalize_sentence(sent, lemmatizer.lemmatize) for sent in sent_l]
        if use_stemmer:
            sent_l = [_normalize_sentence(sent, stemmer.stem) for sent in sent_l]
        return sent_l
    except Exception as e:
        print(e)
        print("Couldn't tokenize :")
        error_count += 1
        return [text]

__Description__:<br>
This method is a wrapper that preprocesses the title and content of the news dataframe.<br> 
__Input__ :<br>
News data frame with 'content' and 'title' columns<br> 
__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [7]:
def parse_dataframe(df):
    """Tokenize the 'content' and 'title' columns of a news data frame.

    Every cell of both columns is passed through ``tokenize_text`` with its
    default options ('content' first, then 'title').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'content' and 'title' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with both columns replaced by the ``tokenize_text``
        results. The input frame is left untouched, which also avoids
        pandas' SettingWithCopyWarning when a slice is passed in.
    """
    return df.assign(
        content=df['content'].apply(tokenize_text),
        title=df['title'].apply(tokenize_text),
    )

**Description:**<br>
This method converts a dataframe into a pickle format<br>
**Input:**<br>
Dataframe<br>
**Output:**<br>
Dumped pickle file containing heads, desc, and keywords (not used when training)<br>

In [8]:
def tuple2pickle(df, path='pickles/all-the-news-2.pickle'):
    """Dump the processed titles and contents of a data frame to a pickle.

    The per-row sentence lists of the 'title' and 'content' columns are
    flattened into two lists and written as ``[heads, desc, None]`` (the third
    slot is the unused keywords entry).

    NOTE(review): because both columns are flattened sentence by sentence,
    ``heads`` and ``desc`` generally have different lengths and are not
    aligned per article - confirm that the consumer of the pickle expects this.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose 'title' and 'content' cells are lists of sentences.
    path : str, optional
        Destination file. Defaults to the original hard-coded location; the
        parent directory is created when it does not exist.
    """
    import os

    heads, desc = [], []
    for title_sentences in df['title']:
        heads.extend(title_sentences)
    for content_sentences in df['content']:
        desc.extend(content_sentences)

    # Create the target directory up front so open() cannot fail on a fresh checkout.
    out_dir = os.path.dirname(path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    with open(path, 'wb') as f:
        pickle.dump([heads, desc, None], f, pickle.HIGHEST_PROTOCOL)

__Description__:<br>
This method loads, cleans, preprocesses and returns the "all the news" dataset as a dataframe with 'content' and 'title' columns.<br> 
__Input__ :<br>
>partial: Whether to load partial data or complete data<br>
rows: Number of rows to be processed if partial is True

__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [9]:
# Global counters updated by tokenize_text while a data frame is processed.
error_count = 0
run_count = 0


def get_all_news_df(partial=True, rows=5000):
    """Load, preprocess and pickle the "all the news" dataset.

    Reads articles1.csv, articles2.csv and articles3.csv from ALL_NEWS_DIR,
    keeps the 'title' and 'content' columns, runs them through
    ``parse_dataframe`` and writes the result with ``tuple2pickle``. The
    global ``error_count`` / ``run_count`` counters are reset on every call.

    Parameters
    ----------
    partial : bool
        When True only the first ``rows`` rows are processed.
    rows : int
        Number of rows to process if ``partial`` is True.

    Returns
    -------
    pandas.DataFrame
        Processed frame with 'title' and 'content' columns. The original
        per-file row labels are kept, so labels repeat across the three files.
    """
    global error_count, run_count
    error_count = 0
    run_count = 0
    # DataFrame.append was removed in pandas 2.0; pd.concat gives the same
    # result and builds the combined frame in one step.
    parts = [
        pd.read_csv(ALL_NEWS_DIR + '/' + file_name)
        for file_name in ('articles1.csv', 'articles2.csv', 'articles3.csv')
    ]
    news = pd.concat(parts)[['title', 'content']]
    if partial:
        news = news.head(rows)
    # Work on an explicit copy so downstream column assignment never touches
    # a slice of the raw frame.
    processed = parse_dataframe(news.copy())
    tuple2pickle(processed)
    return processed

In [10]:
# Run the pipeline with its defaults (partial=True, rows=5000); this also
# writes the pickle file through tuple2pickle.
df3 = get_all_news_df()

In [11]:
# Preview the first rows of the processed frame.
df3.head()

Unnamed: 0,title,content
0,[House Republicans Fret About Winning Their He...,[ Congressional Republicans have a new fear ...
1,[Rift Between Officers and Residents as Killin...,"[After the bullet shells get counted, the bloo..."
2,"[Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial...","[When Walt Disney’s “Bambi” opened in 1942, cr..."
3,"[Among Deaths in 2016, a Heavy Toll in Pop Mus...","[Death may be the great equalizer, but it isn’..."
4,[Kim Jong-un Says North Korea Is Preparing to ...,"[ North Korea’s leader, Kim said on Sunday..."
