In [None]:
import pandas as pd
from google.colab import drive
drive.mount ('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
data = pd.read_excel ('/content/drive/My Drive/Colab Notebooks/news category/Data_Train.xlsx')
data

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3.0
1,How formidable is the opposition alliance amon...,0.0
2,Most Asian currencies were trading lower today...,3.0
3,"If you want to answer any question, click on ‘...",1.0
4,"In global markets, gold prices edged up today ...",3.0
...,...,...
7623,"Karnataka has been a Congress bastion, but it ...",0.0
7624,"The film, which also features Janhvi Kapoor, w...",2.0
7625,The database has been created after bringing t...,1.0
7626,"The state, which has had an uneasy relationshi...",0.0


In [None]:
import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
import unicodedata

In [None]:
nltk.download('stopwords')
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')
stopword_list.remove('on')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text


In [None]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text

In [None]:
import spacy
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [None]:
nlp = spacy.load('en_core_web_sm')
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [None]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [None]:
def normalize_corpus(corpus, contraction_expansion=True,html_stripping=False,
                     accented_char_removal=True, text_lower_case=True, 
                     text_stemming=False, text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        #stemming text
        if text_stemming:
            doc=simple_stemmer(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [None]:
data.columns

Index(['STORY', 'SECTION', 'cleaned_STORY'], dtype='object')

In [None]:
data['STORY'] = data['STORY'].map(str)
#news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]

In [None]:
data['cleaned_STORY'] = normalize_corpus(data['STORY'])


In [None]:
data.head()

Unnamed: 0,STORY,SECTION,cleaned_STORY
0,But the most painful was the huge reversal in ...,3.0,painful huge reversal fee income unheard among...
1,How formidable is the opposition alliance amon...,0.0,formidable opposition alliance among congress ...
2,Most Asian currencies were trading lower today...,3.0,asian currency trade low today south korean wi...
3,"If you want to answer any question, click on ‘...",1.0,want answer question click on answer click on ...
4,"In global markets, gold prices edged up today ...",3.0,global market gold price edge today disappoint...


In [None]:
data.to_excel('data.xlsx')
from google.colab import files
files.download('data.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#data['Disposition']=data['Disposition'].replace(2,0)

In [None]:
data['Disposition'].value_counts()

1    614
0    443
Name: Disposition, dtype: int64

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.0-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.5 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 17.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 45.7 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 48.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
#data['Sentiment_le']=le.fit_transform(data['Sentiment'])
data_new=data[['cleaned_STORY','SECTION']]
data_new['SECTION']=le.fit_transform(data_new['SECTION'])
X_train, X_test, y_train, y_test = train_test_split(data_new['cleaned_STORY'],data_new['SECTION'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
ytrain=pd.get_dummies(y_train)
ytest=pd.get_dummies(y_test)
ytrain.head()

Unnamed: 0,0,1,2,3
4600,0,1,0,0
7200,0,0,1,0
7580,1,0,0,0
4169,1,0,0,0
4297,0,0,0,1


In [None]:
data_new['SECTION'].value_counts()

1    2772
2    1924
0    1686
3    1246
Name: SECTION, dtype: int64

In [None]:
!pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.8.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.3 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 49.8 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-text
Successfully installed tensorflow-text-2.8.1 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [None]:
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4")

In [None]:
def get_sentence_embeding(sentences):
    preprocessed_text = bert_preprocess(sentences)
    return bert_encoder(preprocessed_text)['pooled_output']

In [None]:
# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(150, activation='relu', name="hidden")(l)
l = tf.keras.layers.Dropout(0.1, name="dropout2")(l)
l = tf.keras.layers.Dense(50, activation='relu', name="hidden1")(l)
l = tf.keras.layers.Dropout(0.1, name="dropout1")(l)
l = tf.keras.layers.Dense(15, activation='relu', name="hidden2")(l)
l = tf.keras.layers.Dense(7, activation='relu', name="hidden3")(l)
l = tf.keras.layers.Dense(4, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])

In [None]:
METRICS = [
      tf.keras.metrics.BinaryAccuracy(name='accuracy'),
      tf.keras.metrics.Precision(name='precision'),
      tf.keras.metrics.Recall(name='recall')
]

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=METRICS)

In [None]:
ytrain_mod=pd.get_dummies(data_new['SECTION'])

In [None]:
model.fit(data_new['cleaned_STORY'], ytrain_mod, epochs=7,batch_size=50)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f6d1f037c50>

In [None]:
model.evaluate(X_test,ytest)



[0.09035202860832214,
 0.9716832637786865,
 0.946174144744873,
 0.9402202367782593]

In [None]:
from sklearn.metrics import confusion_matrix
import numpy as np
y_pred=model.predict(X_test)



In [None]:
y_pred[0]

array([3.0868495e-04, 2.1509561e-04, 6.0153368e-05, 9.9941611e-01],
      dtype=float32)

In [None]:
ypred=[np.argmax(i) for i in y_pred]
confusion_matrix(y_test,ypred)

array([[380,   9,   6,   5],
       [  9, 664,  10,  24],
       [ 22,   7, 458,   2],
       [  8,  10,   1, 292]])

Test Data

In [None]:
test=pd.read_excel('/content/drive/My Drive/Colab Notebooks/news category/Data_Test.xlsx')
test.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [None]:
test['STORY'] = test['STORY'].map(str)
#news_df['full_text'] = news_df["news_headline"].map(str)+ '. ' + news_df["news_article"]
test['cleaned_STORY'] = normalize_corpus(test['STORY'])


In [None]:
y_pred=model.predict(test['cleaned_STORY'])
ypred=[np.argmax(i)  for i in y_pred]

In [None]:
submit=pd.DataFrame(ypred,columns=['SECTION'])
submit.head()

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1


In [None]:
submit.to_excel('data.xlsx')
from google.colab import files
files.download('data.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>