In [1]:
import pandas as pd
# None disables column-width truncation. The legacy sentinel -1 was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
from wordcloud import WordCloud

In [2]:
# Load the source DataFrame from a pickle file.
# NOTE(review): this is an absolute path on one specific machine, so the notebook will not run
# elsewhere as-is. Only unpickle files you trust: loading a pickle can execute arbitrary code.
df = pd.read_pickle('/Users/f75k8bx/Documents/MyGitHub/NLP_using_News_API/test.pkl')

In [3]:
# Keep only the rows for India. The explicit .copy() makes df_Ind an independent frame, so the
# columns added to it further down do not trigger pandas' SettingWithCopyWarning.
df_Ind = df[df.country == 'India'].copy()

In [4]:
# (rows, columns) of the India subset
df_Ind.shape

In [5]:
# Preview the first rows of the India subset
df_Ind.head()

In [6]:
# Column dtypes and non-null counts. `null_counts` was deprecated in pandas 1.2 and removed in
# pandas 2.0; `show_counts` is its replacement.
df_Ind.info(verbose=True, show_counts=True)

## Format Date Time 

In [7]:
# Inspect the raw timestamp strings before parsing them
df_Ind['publishedAt'][:5]

In [8]:
# Parse timestamps of the form YYYY-MM-DDTHH:MM:SSZ and keep only the calendar date.
df_Ind['published_at'] = (
    pd.to_datetime(df_Ind['publishedAt'], format='%Y-%m-%dT%H:%M:%SZ')
    .dt.date
)
df_Ind['published_at'].head(5)

### On average, about 1.5 to 2 sentences are available in addition to the first line.

## NLP FUNCTIONS

### Tokenizer

In [9]:
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string

STOPWORDS = set(stopwords.words('english'))

def text_tokenizer(text):
    """Split ``text`` into sentences and word tokens, dropping punctuation and stopwords.

    A token is dropped when it consists solely of punctuation characters (this also
    covers multi-character tokens such as ``''``, ``--`` or ``...``) or when its
    lower-cased form is in ``STOPWORDS`` (so capitalized stopwords like "The" are
    removed as well).

    Parameters
    ----------
    text : str or None
        Raw text. Empty strings and non-string values (``None``, NaN, ...) yield ``None``.

    Returns
    -------
    list of list of str, or None
        One token list per sentence, or ``None`` when there is nothing to tokenize or
        tokenization fails.
    """
    # Missing values show up as None or float NaN when this is applied over a DataFrame column.
    if not isinstance(text, str) or not text:
        return None
    try:
        return [
            [
                token
                for token in word_tokenize(sentence)
                if not all(char in string.punctuation for char in token)
                and token.lower() not in STOPWORDS
            ]
            for sentence in sent_tokenize(text)
        ]
    except Exception:
        # Best effort: report the offending text and carry on with the remaining rows.
        # `Exception` (not a bare except) keeps the run interruptible with Ctrl-C / kernel interrupt.
        print(text)
        return None

### Initializing POS tagger

In [10]:
import os
from nltk.tag import StanfordPOSTagger

# The Stanford POS tagger files are looked up in ./stanford-postagger/ under the directory
# the notebook is run from.
path = os.getcwd()
path_to_stnfrd_core_nlp = path + '/stanford-postagger/'

# Tagger model and the Java archive that runs it
model = path_to_stnfrd_core_nlp + 'models/english-bidirectional-distsim.tagger'
jar = path_to_stnfrd_core_nlp + 'stanford-postagger.jar'

st = StanfordPOSTagger(model_filename=model, path_to_jar=jar, encoding='utf8')

### Lemmatization

In [11]:
from nltk.corpus.reader.wordnet import VERB, NOUN, ADJ, ADV
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

lm = WordNetLemmatizer()

# Maps the first two characters of a POS tag (Penn Treebank style, e.g. NNS, VBD) to the
# WordNet part-of-speech constant passed to the lemmatizer.
dict_pos_map = {
    # All noun tags begin with NN
    'NN': NOUN,
    # All verb tags begin with VB
    'VB':VERB,
    # All adjective tags begin with JJ
    'JJ' : ADJ,
    # All adverb tags begin with RB
    'RB':ADV  
}

def get_lemma(no_punct_seq_tokens):
    """POS-tag tokenized sentences and lemmatize their tokens.

    Proper nouns (tags ``NNP`` / ``NNPS``) are dropped. Tokens whose two-character tag
    prefix is a key of ``dict_pos_map`` are lower-cased and lemmatized with the mapped
    WordNet POS; all other tokens are kept unchanged.

    Parameters
    ----------
    no_punct_seq_tokens : list of list of str, or None
        One token list per sentence, as produced by ``text_tokenizer``.

    Returns
    -------
    list of list of str, or None
        Normalized tokens per sentence, or ``None`` for empty/missing input or when
        tagging/lemmatization fails.
    """
    if not no_punct_seq_tokens:
        return None
    try:
        normalized_sequence = []
        for tagged_sentence in st.tag_sents(sentences=no_punct_seq_tokens):
            normalized_tokens = []
            for token, tag in tagged_sentence:
                # Skip proper nouns entirely
                if tag in ("NNP", "NNPS"):
                    continue
                wordnet_pos = dict_pos_map.get(tag[:2])
                if wordnet_pos is not None:
                    token = lm.lemmatize(token.lower(), pos=wordnet_pos)
                normalized_tokens.append(token)
            normalized_sequence.append(normalized_tokens)
        return normalized_sequence
    except Exception:
        # Best effort: report the offending input and carry on with the remaining rows.
        # `Exception` (not a bare except) lets a kernel interrupt stop the long tagging run.
        print(no_punct_seq_tokens)
        return None

In [12]:
def pre_process(df, col_list):
    """Tokenize and lemmatize text columns of ``df``, storing the results as new columns.

    For every ``col`` in ``col_list`` two columns are written to ``df`` in place:
    ``<col>_tokenized`` (output of ``text_tokenizer``) and ``<col>_normalized``
    (output of ``get_lemma`` applied to the tokenized column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns; it is modified in place.
    col_list : iterable of str
        Names of the columns to process.

    Returns
    -------
    None
    """
    for col in col_list:
        print('Begin: ' + col)
        col_tokenized = col + '_tokenized'
        col_normalized = col + '_normalized'
        # text_tokenizer already removes punctuation and stopwords
        df[col_tokenized] = df[col].apply(text_tokenizer)
        df[col_normalized] = df[col_tokenized].apply(get_lemma)
        print('End: ' + col)

# Execution

In [None]:
from datetime import datetime

# Log wall-clock timestamps around the pre-processing run so its duration can be read off.
print("Start: " + str(datetime.now()))

# Adds <col>_tokenized and <col>_normalized columns to df_Ind for each listed column.
pre_process(df_Ind, ['content', 'title', 'description'])

print("End: " + str(datetime.now()))

## Word Cloud Generation

In [None]:
def _flatten_tokens(items):
    """Yield every string found in an arbitrarily nested list/tuple structure."""
    for item in items:
        if isinstance(item, str):
            yield item
        elif isinstance(item, (list, tuple)):
            yield from _flatten_tokens(item)
        # anything else (e.g. None for a row that could not be pre-processed) is skipped


def generate_wordcloud(token_sequences):
    """Render an 800x800, 20-word word cloud on a white background and show it.

    Parameters
    ----------
    token_sequences : str or iterable
        Either the text to visualize, or an iterable of tokens / nested token lists
        (the shape stored in the ``*_normalized`` columns). Non-string input is
        flattened and joined with single spaces, because ``WordCloud.generate``
        only accepts a string.

    Returns
    -------
    None
    """
    if isinstance(token_sequences, str):
        text = token_sequences
    else:
        text = ' '.join(_flatten_tokens(token_sequences))

    wordcloud = WordCloud(width=800, height=800, max_words=20, normalize_plurals=True,
                          background_color='white').generate(text)

    # plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

### Checkpoint - Save the DF to save time

In [None]:
# Persist the pre-processed India frame (path is relative to the current working directory)
# so the pre-processing step does not have to be re-run.
df_Ind.to_pickle('pre_processed_df_ind.pkl')

### Load pkl

In [None]:
# Uncomment to resume from the checkpoint instead of re-running the pre-processing:
# df_Ind = pd.read_pickle('pre_processed_df_ind.pkl')

In [None]:
# Preview the pre-processed India frame. `df` is the raw all-country frame and lacks the
# *_tokenized / *_normalized columns used by the following cells.
df_Ind.head(5)

In [None]:
# Flatten the per-title, per-sentence token lists into a single list of tokens. Rows whose
# normalization produced None (empty or failed titles) are skipped instead of raising TypeError.
list_title = [
    token
    for sentences in df_Ind['title_normalized']
    if isinstance(sentences, list)
    for sentence in sentences
    for token in sentence
]

In [None]:
# Word cloud over all normalized title tokens.
# NOTE: this rebinds the name `stopwords`, which the tokenizer cell imported from nltk.corpus.
stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    width=1200,
    height=800,
    max_words=200,
    normalize_plurals=True,
    background_color='white',
    relative_scaling=0.5,
    collocations=False,
    include_numbers=True,
    stopwords=stopwords,
)
# generate() fills the cloud in place and returns the same WordCloud object
wordcloud.generate(' '.join(list_title))

# plot the WordCloud image
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

In [None]:
# Words kept in the cloud together with their associated frequencies
wordcloud.words_

In [None]:
def runwordcloud(image, desc):
    """Build a word cloud masked and coloured by an image and save it as a PNG.

    Reads ``images/<image>.jpeg`` as the mask, generates the cloud from the
    ``text_clean`` column of the frame returned by ``cleandata`` and writes the figure
    to ``output/wordcloud_<image>_<desc>.png``.

    Parameters
    ----------
    image : str
        Base name (without extension) of the JPEG inside ``images/``.
    desc : str
        Suffix appended to the output file name.

    NOTE(review): ``get_lines``, ``cleandata``, ``np``, ``Image`` and
    ``ImageColorGenerator`` are not defined or imported in the notebook cells visible
    here -- confirm they are provided elsewhere before calling this function.
    """

    # based on example
    # https://github.com/keyonvafa/inaugural-wordclouds/blob/master/create_wordclouds.py
    
    # get data and clean
    print('Getting data...')
    # Local `df`: shadows the notebook-level `df` inside this function only.
    df = pd.DataFrame(get_lines(), columns=['text_raw'])
    
    print('Cleaning data...')
    df = cleandata(df)
    print(df.head(25).to_string())

    # Word count = number of space-separated pieces summed over all cleaned rows
    print('Number of words', df['text_clean'].apply(lambda x: len(x.split(' '))).sum())

    # import image
    image_mask = np.array(Image.open("images/"+image+".jpeg"))
    image_colors = ImageColorGenerator(image_mask)
    # generate wordcloud
    print('Generating word cloud....')
    wc = WordCloud(background_color="black", width=400, height=400, max_words=2000, #contour_width=1, contour_color='red', 
    mask=image_mask, random_state=1).generate(' '.join(df['text_clean']))
    
    print('Making plot')
    plt.figure(figsize=(20,10))
    # y coordinate (in image data units) at which the caption text is drawn
    ypos = 650
    
    # NOTE(review): the style is switched after the figure has already been created -- confirm
    # this ordering is intended.
    plt.style.use('dark_background')
    #plt.imshow(wc.recolor(color_func=grey_color_func))
    
    # recolour the words with the colours taken from the mask image
    # (the cloud itself was generated with a black background)
    plt.imshow(wc.recolor(color_func=image_colors))

    plt.text(0, ypos, "Moi namesake")
    plt.axis("off")
    plt.savefig('output/wordcloud_'+image+'_'+desc+'.png', dpi=200)