Import statements

In [5]:
#import statements
import csv
import os
import pandas as pd
import numpy as np
import nltk
import json
from nltk.tokenize import PunktSentenceTokenizer
import pickle
from nltk.stem import *
from nltk import word_tokenize
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import nltk.text
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/satyavasanthreddytumati/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Relative locations of datasets
The number of rows (examples) to extract is passed to the extraction functions further below.

In [6]:
# Relative paths of the two raw datasets used in this notebook.
# Directory holding the "all the news" CSV files (articles1.csv .. articles3.csv are read later).
ALL_NEWS_DIR = 'datasets/all-the-news' #https://www.kaggle.com/snapcrack/all-the-news/data
# JSON-lines file; the code below reads the "title" and "content" keys of each line.
JSONL_DATA = "datasets/signalmedia-1m.jsonl"

Load the Punkt sentence tokenizer for the English language. You might need to download it first using nltk.download('punkt').

In [7]:
# English Punkt sentence tokenizer loaded from the NLTK data directory;
# tokenize_text uses it to split an article into sentences.
english_sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')

__Description__:<br>
This method removes the location from the beginning of the news article.<br> 
__Input__ :<br>
News Article Content<br> 
__Output__:<br>
Article with location removed<br>
__Example__:<br>
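"Washington — President announces ..." is changed to " President announces ..." (everything up to and including the first em dash is dropped, provided the part before it is shorter than 35 characters).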

In [8]:
def remove_location_from_news(text):
    """Drop a short leading dateline such as ``"WASHINGTON —"`` from an article.

    The text is split at the first em dash ('—'). If the part before it is
    shorter than 35 characters it is treated as a location prefix and
    everything after that first dash is returned (later dashes and any
    leading whitespace are kept as they are).

    Parameters
    ----------
    text : str
        Article content.

    Returns
    -------
    str
        The text after the first em dash, or ``text`` unchanged when there is
        no em dash or the prefix is 35 characters or longer.
    """
    prefix, dash, remainder = text.partition('—')
    # `dash` is empty when the text has no em dash at all; without this check
    # any short dash-less text (e.g. a headline) would be reduced to ''.
    if dash and len(prefix) < 35:
        return remainder
    return text
        

__Description__:<br>
This method splits the article into meaningful sentences, lemmatizes the words and removes punctuation from the input text.<br>
__Input__ :<br>
>text: News Article Content or title<br>
use_lemmatizer: Whether to use lemmatizer or not<br>
use_stemmer : Whether to use the stemmer or not<br>
interval: Print a progress message every `interval` calls<br>
remove_location: Whether to first strip the leading location with `remove_location_from_news`

__Output__:<br>
Processed News Article Content or title<br>

In [9]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Counters updated by tokenize_text. Defined here so the function can be
# called as soon as this cell has run; parse_and_store resets both to 0.
error_count = 0
run_count = 0


def _normalize_sentences(sentences, normalize_word):
    """Apply ``normalize_word`` to every token of each sentence and re-join the alphanumeric results with spaces."""
    return [
        ' '.join(
            token
            for token in (normalize_word(raw) for raw in nltk.wordpunct_tokenize(sentence))
            if token.isalnum()
        )
        for sentence in sentences
    ]


def tokenize_text(text, use_lemmatizer = True, use_stemmer = False,interval=10000, remove_location = False):
    """Split ``text`` into sentences and normalise the words of each sentence.

    Each sentence found by ``english_sent_tokenizer`` is tokenized with
    ``nltk.wordpunct_tokenize``; tokens are lemmatized and/or stemmed and only
    tokens that are alphanumeric after that step are kept, which drops
    punctuation. When both options are on, stemming runs on the already
    lemmatized sentences.

    Parameters
    ----------
    text : str
        Article content or title.
    use_lemmatizer : bool
        Lemmatize every token with the module-level ``lemmatizer``.
    use_stemmer : bool
        Stem every token with the module-level ``stemmer``.
    interval : int
        A progress line is printed on call 1, interval + 1, 2 * interval + 1, ...
        Values <= 0 disable the progress line.
    remove_location : bool
        Strip a leading location with ``remove_location_from_news`` first.

    Returns
    -------
    list
        One processed string per sentence. If anything fails (for example the
        input is not a string), the error is printed, the global
        ``error_count`` is incremented and ``[text]`` is returned instead.

    Side effects
    ------------
    Increments the global ``run_count`` on every call.
    """
    global error_count, run_count
    run_count+=1
    # `1 % interval` equals 1 for interval > 1 (the original cadence) and 0 for
    # interval == 1, so an interval of 1 reports every call instead of never.
    if(interval > 0 and run_count%interval == 1%interval):
        print("tokenizer",run_count)
    try:
        # Inside the try block so that non-string cells (e.g. NaN read from a
        # CSV) are counted as errors rather than aborting the whole run.
        if(remove_location):
            text = remove_location_from_news(text)
        sent_l = english_sent_tokenizer.tokenize(text)
        if(use_lemmatizer):
            sent_l = _normalize_sentences(sent_l, lemmatizer.lemmatize)
        if(use_stemmer):
            sent_l = _normalize_sentences(sent_l, stemmer.stem)
        return sent_l
    except Exception as e:
        # Deliberate best effort: one bad row must not stop a long batch.
        print(e)
        print("Couldn't tokenize :")
        error_count+=1
        return [text]

__Description__:<br>
This method is a wrapper that preprocesses the title and content of the news dataframe.<br> 
__Input__ :<br>
News data frame with 'content' and 'title' columns<br> 
__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [10]:
def parse_dataframe(df):
    """Run tokenize_text over the 'content' and 'title' columns of ``df``.

    Both columns are overwritten on the frame that was passed in ('content'
    first, then 'title'); every cell becomes whatever tokenize_text returns
    for it with default options. The same frame is returned.
    """
    for column in ('content', 'title'):
        df[column] = df[column].apply(tokenize_text)
    return df

In [11]:
def tuple2pickle(df,nr):
    """Pickle the titles and contents of ``df`` as ``[heads, desc, None]``.

    Rows whose 'title' value has length 0 are skipped. The result is written
    to ``pickles/all-the-news_<nr>.pickle``; the ``pickles`` directory is
    created if it does not exist yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'title' and 'content' columns whose title cells support ``len()``.
    nr : str or int
        Suffix used in the output file name (converted with ``str``).
    """
    heads, desc = [], []
    # Walk both columns in lockstep instead of building a Series per row.
    for title, content in zip(df['title'], df['content']):
        if len(title) >= 1:
            heads.append(title)
            desc.append(content)
    out_path = 'pickles/all-the-news_'+str(nr)+'.pickle'
    os.makedirs('pickles', exist_ok=True)
    with open(out_path, 'wb') as f:
        pickle.dump([heads, desc, None], f, pickle.HIGHEST_PROTOCOL)
    print('Extracting rows into ', out_path)

In [12]:
def convert_jsonl_tocsv(path,rows=1000000, out_path="temp_store.csv", max_words=200):
    """Copy the title and truncated content of JSON-lines records into a CSV file.

    The output file is overwritten and starts with a ``title,content`` header.
    Each input line is parsed as JSON; its "content" is split on whitespace,
    cut to the first ``max_words`` words and re-joined with single spaces.

    Parameters
    ----------
    path : str
        JSON-lines input file; every record needs "title" and "content" keys.
    rows : int
        Stop after this many records. If the file has fewer records, all of
        them are written. Values <= 0 never match and so read the whole file.
    out_path : str
        CSV file to write (default "temp_store.csv").
    max_words : int
        Maximum number of words kept per article (default 200).
    """
    written = 0
    # newline='' is what the csv module expects for files it writes; UTF-8 is
    # stated explicitly so the result does not depend on the platform locale.
    with open(path, encoding='utf-8') as infile, \
            open(out_path, 'w', newline='', encoding='utf-8') as temp_out:
        csv_out = csv.writer(temp_out)
        csv_out.writerow(['title','content'])
        for line in infile:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            record = json.loads(line)
            content = " ".join(record["content"].split()[:max_words])
            # Rows are written as they are read, so a file that ends before
            # `rows` is reached keeps its final records.
            csv_out.writerow((record["title"], content))
            written += 1
            if written == rows:
                break
            if written % 10000 == 0:
                print("extract", written)


          

In [13]:
# Extract only the first 10 JSONL records into temp_store.csv.
convert_jsonl_tocsv(JSONL_DATA,10)

__Description__:<br>
This method loads, cleans, preprocesses and returns the "all the news" dataset as a dataframe with 'content' and 'title' columns.<br> 
__Input__ :<br>
>rows: Number of rows to extract (applied to the JSONL dataset; for "all the news" it is only used in the output file name)<br>
PATH: Dataset to process, either `ALL_NEWS_DIR` or `JSONL_DATA`

__Output__:<br>
Processed News data frame with 'content' and 'title' columns<br>

In [14]:
error_count = 0
run_count = 0
def parse_and_store(rows, PATH):
    """Load one of the two datasets, tokenize it and pickle the resulting frame.

    Parameters
    ----------
    rows : int
        For ``JSONL_DATA``: number of records converted to CSV and processed.
        For ``ALL_NEWS_DIR``: only used in the pickle file name.
        NOTE(review): the "all the news" branch processes every row regardless
        of ``rows`` - confirm whether a row limit was intended there.
    PATH : str
        Either ``ALL_NEWS_DIR`` or ``JSONL_DATA``.

    Returns
    -------
    pandas.DataFrame or None
        The frame returned by parse_dataframe ('title' and 'content' columns),
        or None when ``PATH`` is neither of the two known datasets.

    Side effects
    ------------
    Resets the global ``error_count`` and ``run_count`` to 0, writes
    ``pickles/all_news<rows>.pickle`` or ``pickles/json_news_<rows>.pickle``
    (creating ``pickles`` if needed) and, for the JSONL dataset, rewrites
    ``temp_store.csv``.
    """
    global error_count, run_count
    error_count = 0
    run_count = 0
    if(PATH == ALL_NEWS_DIR):
        # pd.concat replaces the chained DataFrame.append calls (append was
        # removed in pandas 2.0); original row indices are kept, as before.
        parts = [pd.read_csv(ALL_NEWS_DIR+'/articles'+str(part)+'.csv') for part in (1, 2, 3)]
        articles = pd.concat(parts)
        # .copy() so parse_dataframe overwrites an independent frame rather
        # than a column slice of `articles`.
        selected = articles[['title','content']].copy()
        out_path = 'pickles/all_news'+str(rows)+'.pickle'
    elif(PATH == JSONL_DATA) :
        convert_jsonl_tocsv(PATH,rows)
        selected = pd.read_csv("temp_store.csv")
        out_path = 'pickles/json_news_'+str(rows)+'.pickle'
    else:
        # Unknown dataset: nothing is loaded or written (unchanged behaviour).
        return None
    processed = parse_dataframe(selected)
    os.makedirs('pickles', exist_ok=True)
    with open(out_path, 'wb') as fp:
        pickle.dump(processed, fp)
    return processed

        
        

In [18]:
# Process the first 1000 JSONL records and pickle the result to pickles/json_news_1000.pickle.
parse_and_store(1000,JSONL_DATA)

tokenizer 1
