In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.3-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 27.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 66.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.3


In [2]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import TFBertModel

In [3]:
# Lets tf.data choose prefetch buffer sizes at runtime.
AUTO = tf.data.experimental.AUTOTUNE
# Configuration
# EPOCHS now mirrors the epoch count the training cell actually runs (it passes
# epochs=10); the earlier value of 3 was not referenced by any visible cell.
EPOCHS = 10
BATCH_SIZE = 16  # batch size used by both tf.data pipelines
MAX_LEN = 256    # token length every text is padded/truncated to

In [4]:
def bert_encode(texts, tokenizer, max_len=MAX_LEN):
    """Tokenize ``texts`` into fixed-length BERT input arrays.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to encode (a list or a pandas Series both work).
    tokenizer : callable
        A Hugging Face tokenizer. It is called once on the whole batch and
        must return ``input_ids``, ``token_type_ids`` and ``attention_mask``.
    max_len : int
        Every sequence is truncated or padded to exactly this many tokens.

    Returns
    -------
    tuple of np.ndarray
        ``(input_ids, token_type_ids, attention_mask)``, each of shape
        ``(number_of_texts, max_len)``.
    """
    texts = list(texts)
    if not texts:
        # np.array([]) would yield 1-D float arrays; keep the 2-D integer layout
        # so downstream code sees a consistent shape even for empty input.
        return tuple(np.empty((0, max_len), dtype=int) for _ in range(3))

    # One batched call replaces the per-text loop; with padding='max_length'
    # every row comes back with exactly max_len entries.
    encoded = tokenizer(texts, max_length=max_len, truncation=True, padding='max_length',
                        add_special_tokens=True)

    return (np.array(encoded['input_ids']),
            np.array(encoded['token_type_ids']),
            np.array(encoded['attention_mask']))

In [5]:
# First load the real tokenizer
# (from_pretrained fetches the 'bert-base-cased' tokenizer files, as the download
# progress output of this cell shows).
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
# Save the loaded tokenizer locally
# ('.' writes the files into the current working directory; the returned tuple of
# written paths is what the cell displays).
tokenizer.save_pretrained('.')

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json')

In [6]:
# Load the news dataset. The path is absolute (presumably a Colab runtime with the
# CSV uploaded to /content) — adjust it when running elsewhere.
df=pd.read_csv('/content/data.csv')
# Preview the first ten rows: title, short description and category.
df.head(10)

Unnamed: 0,news_title,news_short_description,news_category
0,Portfolios allocated in Maha; CM keeps Urban D...,Portfolios were allocated to Maharashtra minis...,politics
1,Corruption & nepotism are 2 big challenges we ...,PM Narendra Modi in his Independence Day speec...,politics
2,"We have to keep India first, this will pave wa...",Prime Minister Narendra Modi in his Independen...,politics
3,'Totally unacceptable': Kerala Guv on Jaleel's...,Kerala Governor Arif Mohammed Khan on Sunday c...,politics
4,K'taka CM is RSS' slave: Siddaramaiah on exclu...,Congress leader Siddaramaiah called Karnataka ...,politics
5,What is this 'Swachh Bharat' if it cannot rest...,CPI(M) leader Sitaram Yechury has criticised t...,politics
6,Will oppose misrepresentation of historical fa...,While addressing the nation on the 76th Indepe...,politics
7,BJP will say we are distributing freebies: Tej...,Bihar Deputy CM Tejashwi Yadav reacted to CM N...,politics
8,PM misusing partition trauma as fodder for pol...,Congress has alleged that PM Narendra Modi's r...,politics
9,Beti Bachao was a pretence: Cong to BJP over '...,Congress leader Rahul Gandhi on Sunday critici...,politics


In [7]:
# Keep only the first row for each distinct news_short_description value.
df = df.drop_duplicates(subset='news_short_description', keep='first')

In [8]:
# Keep only the first row for each distinct news_title value.
df = df.drop_duplicates(subset='news_title', keep='first')

In [9]:
# Sanity check after de-duplication: row count, number of distinct values and the
# most frequent value per column.
df.describe()

Unnamed: 0,news_title,news_short_description,news_category
count,12569,12569,12569
unique,12569,12569,6
top,Portfolios allocated in Maha; CM keeps Urban D...,Portfolios were allocated to Maharashtra minis...,politics
freq,1,1,2645


In [10]:
# List the distinct category labels to spot unexpected values.
df['news_category'].unique()

array(['politics', 'news_category', 'business', 'entertainment',
       'technology', 'sports'], dtype=object)

In [11]:
# Drop rows whose category is the literal string 'news_category'
# (presumably header lines repeated inside the CSV — not a real class).
df = df.loc[df['news_category'].ne('news_category')]

In [12]:
# Class balance: number of rows per remaining category.
df['news_category'].value_counts()

politics         2645
entertainment    2645
sports           2633
technology       2352
business         2293
Name: news_category, dtype: int64

In [13]:
# Replace the category names with integer ids. `le` is kept at notebook level so
# the fitted mapping (le.classes_ / le.inverse_transform) can turn predicted ids
# back into names later on.
le=LabelEncoder()
df['news_category']=le.fit_transform(df['news_category'])

In [14]:
# Hold out 12% of the rows for validation; random_state fixes the shuffle so the
# split is reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    df['news_short_description'],
    df['news_category'],
    test_size=0.12,
    random_state=42,
)

In [15]:
# Tokenize both splits; each result is the
# (input_ids, token_type_ids, attention_mask) tuple returned by bert_encode.
X_train, X_valid = (
    bert_encode(X_train.astype(str), tokenizer),
    bert_encode(X_valid.astype(str), tokenizer),
)

# Labels as plain NumPy arrays for tf.data.
y_train, y_valid = y_train.values, y_valid.values

In [16]:
# Training pipeline.
# shuffle() comes before repeat() and its buffer covers the whole training set, so
# every pass is a full uniform shuffle and each example is emitted once before any
# example is seen again (repeat-then-shuffle with a small buffer mixes passes).
# repeat() makes the stream infinite, so model.fit must be given steps_per_epoch.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .shuffle(len(y_train))
    .repeat()
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

# Validation pipeline: finite and in fixed order; batches are cached after the
# first pass so later epochs reuse them.
valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

In [17]:
def build_model(bert_model, max_len=MAX_LEN, num_classes=5, learning_rate=1e-5):
    """Attach a softmax classification head to a BERT encoder and compile it.

    Parameters
    ----------
    bert_model : callable Keras model
        Encoder called with ``input_ids``, ``token_type_ids`` and
        ``attention_mask``; element ``[0]`` of its output is used as the
        per-token sequence output.
    max_len : int
        Length of each of the three integer input sequences.
    num_classes : int
        Width of the softmax output layer (defaults to the 5 news categories).
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    tensorflow.keras.Model
        Compiled model taking ``[input_ids, token_type_ids, attention_mask]`` and
        trained with sparse categorical cross-entropy on integer labels.
    """
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
    token_type_ids = Input(shape=(max_len,), dtype=tf.int32, name="token_type_ids")
    attention_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

    sequence_output = bert_model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)[0]
    # Use the vector at position 0 (the leading special token) as the summary of
    # the whole text.
    clf_output = sequence_output[:, 0, :]
    clf_output = Dropout(.1)(clf_output)
    out = Dense(num_classes, activation='softmax')(clf_output)

    model = Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword, which triggers a
    # deprecation warning in current Keras releases.
    model.compile(Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [18]:
%%time
transformer_layer = (TFBertModel.from_pretrained('bert-base-cased'))
model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Downloading tf_model.h5:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]',     

  super(Adam, self).__init__(name, **kwargs)


In [19]:
# Fine-tune the model. train_dataset repeats indefinitely, so steps_per_epoch
# defines an "epoch" as 200 batches rather than one full pass over the data.
# NOTE(review): epochs is hard-coded here; the EPOCHS constant from the
# configuration cell is not used — confirm which value is intended.
train_history = model.fit(
    train_dataset,
    steps_per_epoch=200,
    validation_data=valid_dataset,
    epochs=10
)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Export the trained model to disk; include_optimizer=False leaves the optimizer
# state out of the saved artifact. The absolute path presumably targets a Colab
# runtime — adjust it when running elsewhere.
model.save('/content/sample_data/news', include_optimizer=False)



In [25]:
# Hand-written sample news snippets used as an ad-hoc inference set; each record
# has a single 'text' field, which becomes the only column of new_df.
new_df=pd.DataFrame([{'text':'The CBI denied Delhi Deputy Chief Minister Manish Sisodia claims that a CBI officer died by suicide due to pressure to frame the AAP leader in the Delhi Excise Policy scam case. We strongly refute mischievous and misleading statement by Sisodia. Its clarified that gentleman officer late Jitendra Kumar was in no way connected with the investigation of the case.'},
                     {'text':'The Competition Commission of India (CCI) has approved the $4.7-billion acquisition of payment gateway BillDesk by Prosus PayU. It is said to be the largest acquisition in Indias digital payments space. The acquisition was announced in August last year. With the deal, PayU will now operate in more than 20 markets.'},
                     {'text':'Ex-Australia captain Ricky Ponting has named the first five players he would choose in his world T20I XI. He included Afghanistans Rashid Khan, Pakistans Babar Azam, Indias Hardik Pandya and Jasprit Bumrah, and Englands Jos Buttler. Right now hes probably the best all-rounder in the world in T20 cricket, and could potentially be in ODI cricket, Ponting said about Pandya.'},
                     {'text':'Team India fast bowler Jasprit Bumrahs wife, TV presenter Sanjana Ganesan, responded to a troll who commented on her Instagram picture with Bumrah. "Waha India ki *** ***** pade hue hai or yeh gum rahe," the troll wrote, mocking Bumrah for not being in India squad for Asia Cup. Sanjana responded, Its a throwback photo, cant you see, chomu aadmi?'},
                     {'text':'Filmmaker Rakesh Roshan said in an interview that SS Rajamoulis Baahubali was "very similar" to his 1995 film Karan Arjun."But it was presented on a bigger scale. Even...songs were larger-than-life and hence people were enticed, he added. In...South, theyre still sticking to rooted stories, he said, adding that Bollywood filmmakers have drifted away from the roots of Indianness."'},
                     {'text':'Worlds richest man Elon Musk tweeted that comedian Chris Rock has invited him to open one of his shows. Several Twitter users encouraged him with tweets like, "Youll do well, Elon...Youre pretty hilarious, and I always thought you were a stand-up guy, Elon! A user also tweeted no slapping, referring to Chris being slapped by Will Smith at the Oscars.'},
                     {'text':'Shares of Paytm’s parent company One 97 Communications declined over 6% in Mondays early trade. The Enforcement Directorate had conducted a raid at the companys Bengaluru premises over the weekend in connection with a Chinese loan apps case. At the days low of ₹681.20, the share price was trading at a discount of 68% from its issue price of ₹2,150.'}
                     ])
# NOTE(review): more statements follow in this cell, so this preview is presumably
# not displayed (only a cell's final expression is rendered) — confirm.
new_df.head()


# Encode the sample texts the same way as the training data, then score them.
new_df_text = bert_encode(new_df['text'].astype(str), tokenizer)
# x holds one row of softmax class probabilities per sample text.
x = model.predict(new_df_text)

In [26]:
# Print the winning probability and category name for each sample.
# The softmax columns follow the integer ids assigned by the fitted LabelEncoder,
# so names are read from le.classes_ rather than a hand-maintained if/elif table
# that would silently go stale if the label set changed. argmax also reports
# exactly one category per sample, even if two probabilities tie.
for probs in x:
    best = int(np.argmax(probs))
    print(probs[best], le.classes_[best])

0.9997086 politics
0.98486173 business
0.99999595 sports
0.9999796 sports
0.99994135 entertainment
0.80464536 technology
0.9977938 business
