# Bengali News Classifier using Pretrained Embeddings

In this notebook, we build a TensorFlow model that predicts the class of a news article from its headline, using word embeddings that we trained on news articles.

First, we import all the required packages.

In [1]:
import numpy as np
import os
import re
import json
import string
from collections import Counter
from tqdm import tqdm

import pandas as pd

In [2]:
# Ids reserved for the two special tokens; these are merged into the
# id -> word table once the embeddings have been loaded.
symbols = dict(enumerate(('PAD', 'UNK')))

# Token list; it is rebuilt from the headlines in the data-preparation cell.
vocab = list()

In [3]:
def load_embeddings(filepath, vocab, dim):
    """Load the pretrained vectors of the words in ``vocab`` from a text file.

    The first line of the file is always skipped (treated as a header). Every
    other non-blank line is read as a word followed by its whitespace-separated
    vector components.

    Args:
        filepath: Path of the embedding text file (read as UTF-8).
        vocab: Iterable of words to keep; all other file entries are ignored.
        dim: Number of components per vector.

    Returns:
        dict with two entries:
          * 'word_vocab': list of words. Index 0 is 'PAD', index 1 is 'UNK',
            followed by the kept words in file order.
          * 'embedding_matrix': float32 array of shape (len(word_vocab), dim).
            Rows 0 and 1 are drawn uniformly from [-1, 1); the remaining rows
            come from the file.
    """
    # A set gives O(1) membership tests; the file is scanned line by line, so a
    # list lookup here would cost O(len(vocab)) for every line.
    wanted = set(vocab)

    word_vocab = ['PAD', 'UNK']
    embedding_matrix = [
        np.random.uniform(-1.0, 1.0, (1, dim))[0],
        np.random.uniform(-1.0, 1.0, (1, dim))[0],
    ]

    # Explicit encoding: the words are Bengali, and the platform default
    # encoding is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse (previously an IndexError).
                continue
            word = fields[0]
            if word in wanted:
                word_vocab.append(word)
                embedding_matrix.append(np.asarray(fields[1:], dtype='float32'))

    return {
        'word_vocab': word_vocab,
        'embedding_matrix': np.reshape(embedding_matrix, [-1, dim]).astype(np.float32),
    }

In [4]:
def extract_text(filenames):
    """Collect headlines and labels from JSON article dumps stored under ``data/``.

    Each file must contain a JSON object whose 'articles' entry is a list of
    records with 'title' and 'label' strings.

    Args:
        filenames: File names relative to the ``data`` directory.

    Returns:
        (titles, labels): two parallel lists of whitespace-stripped strings,
        in file order and then article order.
    """
    titles, labels = [], []

    for name in filenames:
        path = os.path.join('data', name)
        with open(path, 'r') as handle:
            payload = json.load(handle)

        records = payload['articles']
        titles += [record['title'].strip() for record in records]
        labels += [record['label'].strip() for record in records]

    return titles, labels

In [5]:
def read_data(filename):
    """Read ``label||text`` records from ``data/<filename>.txt``.

    Each line is split on '||'. The first field is the label and the second is
    the text; any further fields are ignored. Both values are stripped of
    surrounding whitespace, so the text no longer carries the line's trailing
    newline and labels match the stripped labels from the JSON dumps. Lines
    without a '||' separator (for example blank lines) are skipped.

    Args:
        filename: File name without the '.txt' extension, relative to ``data``.

    Returns:
        (cls, text): two parallel lists of strings.
    """
    cls = []
    text = []

    with open(os.path.join('data', filename + '.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('||')
            if len(fields) < 2:
                # Blank or malformed line: previously an IndexError.
                continue
            cls.append(fields[0].strip())
            text.append(fields[1].strip())

    return cls, text

In [6]:
def remove_punc(sentences):
    """Return a copy of ``sentences`` with punctuation characters deleted.

    The characters removed are everything in ``string.punctuation`` plus the
    curly single quotes and the em dash. Nothing is inserted in their place.

    Args:
        sentences: Iterable of strings.

    Returns:
        List of cleaned strings, in the same order as the input.
    """
    banned_chars = string.punctuation + "’" + "‘" + "—"
    # Mapping every banned character to None makes translate() delete it.
    deletion_table = str.maketrans('', '', banned_chars)
    return [sentence.translate(deletion_table) for sentence in sentences]

In [7]:
def replace_strings(texts, replace):
    """Normalise raw headline strings.

    For every text, in this order:
      1. apply each ``(old, new)`` substitution from ``replace``;
      2. delete characters matched by the emoji pattern;
      3. delete runs of ASCII letters and digits;
      4. collapse whitespace runs to a single space and strip both ends.

    Args:
        texts: Iterable of strings (wrapped in a tqdm progress bar).
        replace: Iterable of pairs; element 0 is replaced by element 1.

    Returns:
        List of cleaned strings, in input order.
    """
    # NOTE(review): the last range (U+24C2..U+1F251) is very wide and matches
    # far more than emoji; confirm this is acceptable for the corpus.
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    english_pattern=re.compile('[a-zA-Z0-9]+', flags=re.I)

    cleaned = []
    for raw in tqdm(texts):
        current = raw
        for pair in replace:
            current = current.replace(pair[0], pair[1])
        without_emoji = emoji_pattern.sub(r'', current)
        without_english = english_pattern.sub(r'', without_emoji)
        cleaned.append(re.sub(r'\s+', ' ', without_english).strip())

    return cleaned

In [8]:
def split(df, test_size=0.2, random_state=None):
    """Split ``df`` into shuffled train and test frames, class by class.

    Each distinct value of ``df.cls`` is split separately so that both parts
    keep roughly the same class proportions.

    Args:
        df: DataFrame with a ``cls`` column.
        test_size: Passed to ``train_test_split`` for every class (default 0.2,
            the value that was previously hard-coded).
        random_state: Optional seed forwarded to ``train_test_split`` and
            ``shuffle``. The default ``None`` keeps the previous unseeded
            behaviour; pass an int for a reproducible split.

    Returns:
        (train, test): two shuffled DataFrames.
    """
    from sklearn.model_selection import train_test_split
    # Imported explicitly instead of relying on ``import sklearn`` exposing
    # ``sklearn.utils`` as a side effect of another import.
    from sklearn.utils import shuffle

    train_parts = []
    test_parts = []

    for label in df.cls.drop_duplicates().values:
        class_rows = df[df.cls == label]
        train_rows, test_rows = train_test_split(
            class_rows, test_size=test_size, random_state=random_state
        )
        train_parts.append(train_rows)
        test_parts.append(test_rows)

    if not train_parts:
        # No classes at all: return two empty frames with the input's columns.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    # Concatenate once at the end. Growing a frame inside the loop re-copies
    # the data each time and starts from an empty, column-less DataFrame.
    train = pd.concat(train_parts)
    test = pd.concat(test_parts)

    return shuffle(train, random_state=random_state), shuffle(test, random_state=random_state)

In [9]:
def remove_common(sentences, unique_tokens, top):
    """Remove the ``top`` most frequent tokens from every sentence.

    Args:
        sentences: Iterable of strings; each is split on whitespace.
        unique_tokens: Iterable of tokens whose frequencies are counted.
            Despite the name, repeats are expected, since they are what makes
            a token "common".
        top: Number of most frequent tokens to drop. With ``top == 0`` or an
            empty ``unique_tokens`` nothing is removed.

    Returns:
        List of strings with the remaining words joined by single spaces
        (wrapped in a tqdm progress bar while processing).
    """
    # A set keeps the per-word membership test O(1). Building it with a
    # comprehension also copes with an empty most_common() result, where
    # indexing ``zip(*common)`` used to raise IndexError.
    common = {token for token, _count in Counter(unique_tokens).most_common(top)}

    new_sentences = []
    for sentence in tqdm(sentences):
        words = [word for word in sentence.split() if word not in common]
        new_sentences.append(' '.join(words))

    return new_sentences

In [10]:
# --- Headlines and labels from the scraped article dumps -------------------
titles, cls=extract_text(['zeenews_articles.txt', 'anandabazar_articles.txt', 'ebala_articles.txt'])
print(titles[:10])
print(cls[:10])

# Zero-width joiners, non-breaking spaces and line breaks all become spaces.
replace=[('\u200c', ' '),
         ('\u200d', ' '),
        ('\xa0', ' '),
        ('\n', ' '),
        ('\r', ' ')]

titles=remove_punc(titles)
titles=replace_strings(titles, replace)
print(titles[:10])


# --- Headlines and labels from the pre-labelled classification files -------
labels, text=read_data('classification')

yo=read_data('anandabazar_classification')
labels.extend(yo[0])
text.extend(yo[1])

yo=read_data('ebala_classification')
labels.extend(yo[0])
text.extend(yo[1])

# NOTE(review): these texts only get punctuation removal, not replace_strings();
# confirm the classification files are already normalised.
text=remove_punc(text)

cls.extend(labels)
titles.extend(text)

# De-duplicate identical (label, headline) pairs across all sources.
yo=list(set(zip(cls, titles)))
print(len(yo))

# Unify label spellings, then drop the 'travel' and 'world' classes because
# they have too few examples.
df=pd.DataFrame(yo, columns=['cls', 'titles'])
df=df.replace(['international', 'sport', 'nation'], ['world', 'sports', 'national'])
df=df[df.cls!='travel']
df=df[df.cls!='world']

titles=df.titles.values
cls=df.cls.values

print(Counter(df.cls.values))

# Every lower-cased token occurrence, duplicates included, so that token
# frequencies can be counted below.
vocab=[word.lower() for sentence in titles for word in sentence.split()]

# Work on an explicit copy: ``df`` is a filtered slice at this point and
# assigning a column to it directly can trigger SettingWithCopyWarning.
df=df.copy()
df['titles']=remove_common(titles, vocab, 20)

df=df[df['titles'].str.len()>2]
df=df.drop_duplicates()

# Keep the 50,000 most frequent distinct tokens. The counting has to happen
# on the raw occurrence list: de-duplicating first makes every count 1, so
# most_common() would return an arbitrary subset instead of the frequent words.
common=Counter(vocab).most_common(50000)
vocab=[word for word, _count in common]

embeddings=load_embeddings('expanded_news_vec.txt', vocab, 200)

['বিশ্বসাথে যোগে যেথায় বিহারো', 'কফি হাউসে ধূমপান নিয়ে বচসায় গ্রাহককে মারধরের অভিযোগ কর্মীর বিরুদ্ধে', 'ফের বদলি আমলা, এবার কোপে সুন্দরবন উন্নয়ন দফতরের সচিব', 'বৈশাখী নন, অভিষেকের কারণেই মন্ত্রিত্ব ছেড়েছেন শোভন: মুকুল', 'বড়দিনে উধাও শীত, পারদ ঊর্ধ্বমুখী', 'স্বাদে স্বাধীনতা', 'অথ সিন্ডিকেট কথা...', 'জিসটিতে জোর ধাক্কা বাংলার মিষ্টি শিল্পে', 'অসমাপ্ত গল্প', 'দেশকে বাঁচানোর দায়িত্ব নিতে হবে ছাত্রসমাজকেই: মমতা বন্দ্যোপাধ্যায়']
['kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata', 'kolkata']


100%|██████████| 14205/14205 [00:00<00:00, 77585.31it/s]


['বিশ্বসাথে যোগে যেথায় বিহারো', 'কফি হাউসে ধূমপান নিয়ে বচসায় গ্রাহককে মারধরের অভিযোগ কর্মীর বিরুদ্ধে', 'ফের বদলি আমলা এবার কোপে সুন্দরবন উন্নয়ন দফতরের সচিব', 'বৈশাখী নন অভিষেকের কারণেই মন্ত্রিত্ব ছেড়েছেন শোভন মুকুল', 'বড়দিনে উধাও শীত পারদ ঊর্ধ্বমুখী', 'স্বাদে স্বাধীনতা', 'অথ সিন্ডিকেট কথা', 'জিসটিতে জোর ধাক্কা বাংলার মিষ্টি শিল্পে', 'অসমাপ্ত গল্প', 'দেশকে বাঁচানোর দায়িত্ব নিতে হবে ছাত্রসমাজকেই মমতা বন্দ্যোপাধ্যায়']
55794
Counter({'state': 14274, 'sports': 12780, 'kolkata': 10133, 'entertainment': 7737, 'national': 7569})


100%|██████████| 52493/52493 [00:00<00:00, 161397.67it/s]


In [11]:
# Unpack the loader result: the vector matrix and the word list that indexes it.
load_embedding_matrix = embeddings['embedding_matrix']
shape_word_vocab = embeddings['word_vocab']

# id -> word, following the row order of the embedding matrix; the reserved
# symbol ids are then written on top.
int_to_vocab = dict(enumerate(shape_word_vocab))
int_to_vocab.update(symbols)

# word -> id (if a word occurs more than once, its last id wins).
vocab_to_int = dict(zip(int_to_vocab.values(), int_to_vocab.keys()))

# Sanity check: both numbers should match.
print(len(shape_word_vocab))

print(len(load_embedding_matrix))

36005
36005


In [12]:
def encode_data(titles):
    """Turn each headline into a list of vocabulary ids.

    Words are lower-cased before the lookup in the module-level
    ``vocab_to_int`` table; words missing from it get the id of 'UNK'.

    Args:
        titles: Iterable of strings; each is split on whitespace.

    Returns:
        List of lists of ids, one inner list per headline (empty for a
        headline with no words).
    """
    def to_id(token):
        key = token.lower()
        return vocab_to_int[key if key in vocab_to_int else 'UNK']

    return [[to_id(token) for token in sentence.split()] for sentence in titles]

In [13]:
# Per-class train/test split of the cleaned headlines.
train_df, test_df = split(df)

# One-hot label matrices, one column per class present in each part.
# NOTE(review): the two matrices only line up if every class occurs in both
# parts; confirm for small classes.
train_label, test_label = (
    pd.Series(part.cls).str.get_dummies().values for part in (train_df, test_df)
)

# Headlines as lists of vocabulary ids.
train_encoded_data, test_encoded_data = (
    encode_data(part.titles.values) for part in (train_df, test_df)
)

['state' 'state' 'sports' 'sports' 'state' 'entertainment' 'national'
 'state' 'state' 'sports']


In [17]:
import tensorflow as tf
tf.reset_default_graph()

class LSTM:
    """Bidirectional LSTM headline classifier (TensorFlow 1.x graph mode).

    Building an instance adds the whole model to the default graph:
    embedding lookup -> bidirectional LSTM -> three ReLU dense layers ->
    5-way logits, with a softmax cross-entropy loss and an Adam training op.

    Attributes:
        placeholders: dict with 'sentence' (int32 ids, [batch, time]) and
            'cls' (int32 one-hot labels, [batch, 5]).
        acc: dict of evaluation tensors: 'accuracy', 'cls' (true class ids)
            and 'cost'.
        output: dict of tensors for inspection: 'loss', 'accuracy', 'logits',
            'check1' (embedded inputs) and 'check2' (RNN outputs).
        train: the optimizer op that minimises the loss.
    """

    def __init__(self, train):
        """Build the graph.

        Args:
            train: Whether the embedding matrix is trainable (fine-tuned) or
                kept frozen at its pretrained values.
        """
        sentence = tf.placeholder(name='input_sentence', shape=[None, None], dtype=tf.int32)
        cls_ = tf.placeholder(name='cls', shape=[None, 5], dtype=tf.int32)
        y_true_cls = tf.argmax(cls_, axis=1)

        self.placeholders = {'sentence': sentence, 'cls': cls_}

        # Shape the variable after the matrix that initialises it. Using
        # len(vocab) allocated rows that no id can ever reference, and would
        # fail outright if vocab were smaller than the loaded matrix.
        embedding_init = np.asarray(load_embedding_matrix, dtype=np.float32)
        Word_embedding = tf.get_variable(name="Word_embedding",
                                         shape=embedding_init.shape,
                                         initializer=tf.constant_initializer(embedding_init),
                                         trainable=train)

        embedding_lookup = tf.nn.embedding_lookup(Word_embedding, sentence)

        with tf.variable_scope('forward'):
            fr_cell = tf.contrib.rnn.LSTMCell(num_units=128)

        with tf.variable_scope('backward'):
            bw_cell = tf.contrib.rnn.LSTMCell(num_units=128)

        with tf.variable_scope('encoder') as scope:
            # No sequence_length: every row is run over the full padded
            # length. That is what the previous hard-coded np.zeros(32)+15 did
            # for 32x15 batches, but this form works for any batch size and
            # length. (Per-row lengths from a non-zero count would only be
            # correct if the padding came after the tokens.)
            model, last_state = tf.nn.bidirectional_dynamic_rnn(fr_cell, bw_cell,
                                                                inputs=embedding_lookup,
                                                                dtype=tf.float32)

        # Sentence representation: final cell states of both directions.
        concat_output = tf.concat([last_state[0].c, last_state[1].c], axis=-1)

        net = tf.layers.dense(inputs=concat_output, name='layer_fc1',
                              units=1024, activation=tf.nn.relu)
        net = tf.layers.dense(inputs=net, name='layer_fc2',
                              units=512, activation=tf.nn.relu)
        net = tf.layers.dense(inputs=net, name='layer_fc3',
                              units=128, activation=tf.nn.relu)
        net = tf.layers.dense(inputs=net, name='layer_fc_out',
                              units=5, activation=None)

        logits = net

        # Prediction
        probability = tf.nn.softmax(logits)
        prediction = tf.argmax(probability, axis=1)

        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=cls_)

        cost = tf.reduce_mean(cross_entropy)
        optimizer = tf.train.AdamOptimizer().minimize(cost)

        # Accuracy
        accuracy = tf.reduce_mean(tf.cast(tf.equal(prediction, y_true_cls), tf.float32))

        self.acc = {'accuracy': accuracy, 'cls': y_true_cls, 'cost': cost}
        self.output = {'loss': cost, 'accuracy': accuracy, 'logits': logits,
                       'check1': embedding_lookup, 'check2': model}
        self.train = optimizer

In [18]:
def get_batch(batch):
    """Turn a list of (ids, label) pairs into padded inputs and labels.

    Pairs whose id list is empty are dropped, so the result can hold fewer
    rows than ``batch``.

    Args:
        batch: Iterable of items where item[0] is a list of ids and item[1]
            is the label.

    Returns:
        (x, y): ``x`` is the output of Keras ``pad_sequences`` with
        ``maxlen=15`` for the kept id lists; ``y`` is the list of their labels.
    """
    kept = [(item[0], item[1]) for item in batch if len(item[0]) > 0]
    ids = [pair[0] for pair in kept]
    y = [pair[1] for pair in kept]

    from keras.preprocessing import sequence
    x = sequence.pad_sequences(ids, maxlen=15)

    return x, y

In [19]:
# Pair every encoded headline with its one-hot label and cut the data into
# full batches of 32; a trailing partial batch is dropped.
training_data=list(zip(train_encoded_data, train_label))

tot=int(len(training_data)/32)
batched_train=[training_data[32*i : 32*(i+1)] for i in range(tot)]


testing_data=list(zip(test_encoded_data, test_label))

tot=int(len(testing_data)/32)
batched_test=[testing_data[32*i : 32*(i+1)] for i in range(tot)]


accuracy=0
cost=0
f1=0
train_accuracy=0
train_cost=0


tf.reset_default_graph()

with tf.Session() as sess:
    # Embeddings stay frozen at their pretrained values in this run.
    model=LSTM(train=False)
    sess.run(tf.global_variables_initializer())

    for i in range(5):
        # --- training pass ---
        train_batches=0
        for batch in batched_train:
            x, y=get_batch(batch)

            # get_batch drops empty headlines; batches that end up short are skipped.
            if len(x)!=32:
                print(len(x))
                continue

            feed={model.placeholders['sentence']: np.reshape(x,[-1,15]), model.placeholders['cls']:np.reshape(y,[-1,5])}
            sess.run(model.train,feed_dict=feed)
            train_acc=sess.run(model.acc,feed_dict=feed)
            train_accuracy=train_accuracy+train_acc['accuracy']
            train_cost=train_cost+train_acc['cost']
            train_batches+=1

        # Average over the batches that were actually run. Dividing by
        # len(batched_train) under-reports the metrics whenever a batch is
        # skipped, and fails when there are no batches at all.
        train_batches=max(train_batches, 1)
        print(f"Training: Epoch: {i+1}, Cost: {train_cost/train_batches}, Acc: {train_accuracy/train_batches}")
        train_accuracy=0
        train_cost=0

        # --- evaluation pass ---
        test_batches=0
        for batch in batched_test:
            x, y=get_batch(batch)
            if len(x)!=32:
                print(len(x))
                continue
            feed={model.placeholders['sentence']: np.reshape(x,[-1,15]), model.placeholders['cls']:np.reshape(y,[-1,5])}
            acc=sess.run(model.acc,feed_dict=feed)
            accuracy=accuracy+acc['accuracy']
            cost=cost+acc['cost']
            test_batches+=1

        test_batches=max(test_batches, 1)
        print(f"Testing:  Epoch: {i+1}, Cost: {cost/test_batches}, Acc: {accuracy/test_batches}")
        accuracy=0
        cost=0
        f1=0

Training: Epoch: 1, Cost: 1.0021956870770892, Acc: 0.5933010774142059
Testing:  Epoch: 1, Cost: 0.922282394700157, Acc: 0.6320886581469649
Training: Epoch: 2, Cost: 0.8590483333430857, Acc: 0.6587190742218675
Testing:  Epoch: 2, Cost: 0.8717126891064567, Acc: 0.6588458466453674
Training: Epoch: 3, Cost: 0.7743864111774746, Acc: 0.6957551875498803
Testing:  Epoch: 3, Cost: 0.8767227778038659, Acc: 0.6613418530351438
Training: Epoch: 4, Cost: 0.694750135147562, Acc: 0.7277783320031923
Testing:  Epoch: 4, Cost: 0.8941127435087015, Acc: 0.6612420127795527
Training: Epoch: 5, Cost: 0.6098273073732615, Acc: 0.7636671987230647
Testing:  Epoch: 5, Cost: 0.9727417603849222, Acc: 0.652555910543131


Let's see if the accuracy increases when we fine-tune the embeddings.

In [20]:
accuracy=0
cost=0
f1=0
train_accuracy=0
train_cost=0


tf.reset_default_graph()

with tf.Session() as sess:
    # Same model as the previous run, but the embedding matrix is trainable.
    model=LSTM(train=True)
    sess.run(tf.global_variables_initializer())

    for i in range(5):
        # --- training pass ---
        train_batches=0
        for batch in batched_train:
            x, y=get_batch(batch)

            # get_batch drops empty headlines; batches that end up short are skipped.
            if len(x)!=32:
                print(len(x))
                continue

            feed={model.placeholders['sentence']: np.reshape(x,[-1,15]), model.placeholders['cls']:np.reshape(y,[-1,5])}
            sess.run(model.train,feed_dict=feed)
            train_acc=sess.run(model.acc,feed_dict=feed)
            train_accuracy=train_accuracy+train_acc['accuracy']
            train_cost=train_cost+train_acc['cost']
            train_batches+=1

        # Average over the batches that were actually run. Dividing by
        # len(batched_train) under-reports the metrics whenever a batch is
        # skipped, and fails when there are no batches at all.
        train_batches=max(train_batches, 1)
        print(f"Training: Epoch: {i+1}, Cost: {train_cost/train_batches}, Acc: {train_accuracy/train_batches}")
        train_accuracy=0
        train_cost=0

        # --- evaluation pass ---
        test_batches=0
        for batch in batched_test:
            x, y=get_batch(batch)
            if len(x)!=32:
                print(len(x))
                continue
            feed={model.placeholders['sentence']: np.reshape(x,[-1,15]), model.placeholders['cls']:np.reshape(y,[-1,5])}
            acc=sess.run(model.acc,feed_dict=feed)
            accuracy=accuracy+acc['accuracy']
            cost=cost+acc['cost']
            test_batches+=1

        test_batches=max(test_batches, 1)
        print(f"Testing:  Epoch: {i+1}, Cost: {cost/test_batches}, Acc: {accuracy/test_batches}")
        accuracy=0
        cost=0
        f1=0

Training: Epoch: 1, Cost: 0.8755560705448662, Acc: 0.6506883479648843
Testing:  Epoch: 1, Cost: 0.7434038513194258, Acc: 0.7091653354632588
Training: Epoch: 2, Cost: 0.4329615601079043, Acc: 0.8397845171588189
Testing:  Epoch: 2, Cost: 0.7860060959768752, Acc: 0.7170527156549521
Training: Epoch: 3, Cost: 0.2055264384536324, Acc: 0.9283719074221868
Testing:  Epoch: 3, Cost: 1.0693155011049094, Acc: 0.6981829073482428
Training: Epoch: 4, Cost: 0.11644200213743687, Acc: 0.9612679569034318
Testing:  Epoch: 4, Cost: 1.3170173526191102, Acc: 0.6962859424920128
Training: Epoch: 5, Cost: 0.07428865747861504, Acc: 0.9754090183559457
Testing:  Epoch: 5, Cost: 1.734269525201176, Acc: 0.6963857827476039
