In [1]:
import pandas as pd
import numpy as np
import nltk
import tqdm

In [2]:
train = pd.read_excel('E:/MH_text-catagorization using NLP/Data_Train.xlsx')
test = pd.read_excel('E:/MH_text-catagorization using NLP/Data_Test.xlsx')
submission = pd.read_excel('E:/MH_text-catagorization using NLP/Sample_submission.xlsx')

In [3]:
train.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [4]:
test.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [5]:
train.columns

Index(['STORY', 'SECTION'], dtype='object')

In [6]:
train['SECTION'].value_counts()

1    2772
2    1924
0    1686
3    1246
Name: SECTION, dtype: int64

#Build Train and Test Datasets

In [7]:
# build train and test datasets

train_STORY = train['STORY'].values
train_SECTION = train['SECTION'].values

test_STORY = test['STORY'].values

In [8]:
train_STORY

array(['But the most painful was the huge reversal in fee income, unheard of among private sector lenders. Essentially, it means that Yes Bank took it for granted that fees on structured loan deals will be paid and accounted for upfront on its books. As borrowers turned defaulters, the fees tied to these loan deals fell off the cracks. Gill has now vowed to shift to a safer accounting practice of amortizing fee income rather than booking these upfront.\n\n\nGill’s move to mend past ways means that there will be no nasty surprises in the future. This is good news considering that investors love a clean image and loathe uncertainties.\n\n\nBut there is no gain without pain and the promise of a strong and stable balance sheet comes with some sacrifices as well. Investors will have to give up the hopes of phenomenal growth, a promise made by Kapoor.',
       'How formidable is the opposition alliance among Congress, Jharkhand Mukti Morcha (JMM) and Jharkhand Vikas Morcha (Prajatantrik)?',


In [9]:
train_SECTION

array([3, 0, 3, ..., 1, 0, 2], dtype=int64)

In [10]:
test_STORY

array(['2019 will see gadgets like gaming smartphones and wearable medical devices lifting the user experience to a whole new level\n\n\nmint-india-wire consumer technologyconsumer technology trends in New Yeartech gadgetsFoldable phonesgaming smartphoneswearable medical devicestechnology\n\n\nNew Delhi: Gadgets have become an integral part of our lives with most of us relying on some form of factor to communicate, commute, work, be informed or entertained. Year 2019 will see some gadgets lifting the user experience to a whole new level. Here’s what we can expect to see:\n\n\nSmartphones with foldable screens: Foldable phones are finally moving from the concept stage to commercial launches. They are made up of organic light-emitting diode (OLED) panels with higher plastic substrates, allowing them to be bent without damage.\n\n\nUS-based display maker Royole Corp’s foldable phone, FlexPai, has already arrived in select markets, while Samsung’s unnamed foldable phone is expected sometim

In [11]:
sub_section = submission['SECTION']

## Text Wrangling & Normalization

In [12]:
##conda install pyahocorasick

In [13]:
##!pip install contractions

In [14]:
import contractions
from bs4 import BeautifulSoup
import numpy as np
import re
import tqdm
import unicodedata


def strip_html_tags(text):
  soup = BeautifulSoup(text, "html.parser")
  [s.extract() for s in soup(['iframe', 'script'])]
  stripped_text = soup.get_text()
  stripped_text = re.sub(r'[\r|\n|\r\n]+', '\n', stripped_text)
  return stripped_text

def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

def pre_process_corpus(docs):
  norm_docs = []
  for doc in tqdm.tqdm(docs):
    doc = strip_html_tags(doc)
    doc = doc.translate(doc.maketrans("\n\t\r", "   "))
    doc = doc.lower()
    doc = remove_accented_chars(doc)
    doc = contractions.fix(doc)
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I|re.A)
    doc = re.sub(' +', ' ', doc)
    doc = doc.strip()  
    norm_docs.append(doc)
  
  return norm_docs

In [15]:
%%time

norm_train_story = pre_process_corpus(train_STORY)
norm_test_story = pre_process_corpus(test_STORY)

100%|████████████████████████████████████████████████████████████████████████████| 7628/7628 [00:03<00:00, 2269.00it/s]
100%|████████████████████████████████████████████████████████████████████████████| 2748/2748 [00:01<00:00, 1645.43it/s]

Wall time: 5.04 s






# Traditional Supervised Machine Learning Models
## feature Engineering

In [16]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=5, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_story)


# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=5, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_story)


Wall time: 7.33 s


In [17]:
%%time

# transform test reviews into features
cv_test_features = cv.transform(norm_test_story)
tv_test_features = tv.transform(norm_test_story)

Wall time: 2.05 s


In [18]:

print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (7628, 31941)  Test features shape: (2748, 31941)
TFIDF model:> Train features shape: (7628, 31941)  Test features shape: (2748, 31941)



# Model Training, Prediction and Performance Evaluation
## Try out Logistic Regression
The logistic regression model is actually a statistical model developed by statistician David Cox in 1958. It is also known as the logit or logistic model since it uses the logistic (popularly also known as sigmoid) mathematical function to estimate the parameter values. These are the coefficients of all our features such that the overall loss is minimized when predicting the outcome—

In [19]:
%%time

# Logistic Regression model on BOW features
from sklearn.linear_model import LogisticRegression

# instantiate model
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, solver='lbfgs', random_state=42)

# train model
lr.fit(cv_train_features, train_SECTION)

# predict on test data
lr_bow_predictions = lr.predict(cv_test_features)

Wall time: 13.2 s


In [20]:
lr_bow_predictions

array([1, 2, 1, ..., 1, 3, 1], dtype=int64)

In [22]:
# download from colab

In [86]:
df_lr = pd.DataFrame (lr_bow_predictions)
submission['SECTION'] = df_lr.values
filepath = 'MH_news-text_catagorzation_LR.xlsx'
submission.to_excel(filepath, index= False)

In [24]:
#from google.colab import files
#files.download('MH_news-text_catagorzation_LR.xlsx')

# Newer Supervised Deep Learning Models

In [25]:
import gensim
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Activation, Dense
from sklearn.preprocessing import LabelEncoder

In [26]:
labels = ['Politics', 'Technology', 'Entertainment', 'Business']

# Prediction class label encoding

In [27]:
le = LabelEncoder()
# tokenize train reviews & encode train labels
tokenized_train = [nltk.word_tokenize(text)
                       for text in norm_train_story]
y_train = le.fit_transform(train_SECTION)
# tokenize test reviews & encode test labels
tokenized_test = [nltk.word_tokenize(text)
                       for text in norm_test_story]
y_test = le.fit_transform(sub_section)


In [28]:

# print class label encoding map and encoded labels
print('section class label map:', dict(zip(le.classes_, le.transform(le.classes_))))
print('Sample test label transformation:\n'+'-'*35,
      '\nActual Labels:', sub_section, '\nEncoded Labels:', y_test[:3])

section class label map: {0: 0, 1: 1, 2: 2, 3: 3}
Sample test label transformation:
----------------------------------- 
Actual Labels: 0       1
1       2
2       1
3       0
4       1
       ..
2743    1
2744    1
2745    1
2746    3
2747    1
Name: SECTION, Length: 2748, dtype: int64 
Encoded Labels: [1 2 1]


In [29]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.test.utils import common_texts
from gensim.models import Word2Vec

2021-04-16 23:27:22,284 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2021-04-16 23:27:22,286 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2021-04-16 23:27:22,290 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)", 'datetime': '2021-04-16T23:27:22.290408', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}


In [38]:

%%time
# build word2vec model
w2v_model = gensim.models.Word2Vec(tokenized_train, window=150,
                                   min_count=10, workers=4)

2021-04-16 23:38:28,311 : INFO : collecting all words and their counts
2021-04-16 23:38:28,313 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-04-16 23:38:28,678 : INFO : collected 40121 word types from a corpus of 821529 raw words and 7628 sentences
2021-04-16 23:38:28,680 : INFO : Creating a fresh vocabulary
2021-04-16 23:38:28,757 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 retains 6846 unique words (17.063383265621496%% of original 40121, drops 33275)', 'datetime': '2021-04-16T23:38:28.757422', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'prepare_vocab'}
2021-04-16 23:38:28,759 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=10 leaves 746491 word corpus (90.86605585438859%% of original 821529, drops 75038)', 'datetime': '2021-04-16T23:38:28.759417', 'gensim': '4.0.1', 'python': '3.8.5 (default, Sep  3 2020, 21:

Wall time: 14.8 s


In [41]:
def averaged_word2vec_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    
    def average_word_vectors(words, model, vocabulary, num_features):
        feature_vector = np.zeros((num_features,), dtype="float64")
        nwords = 0.
        
        for word in words:
            if word in vocabulary: 
                nwords = nwords + 1.
                feature_vector = np.add(feature_vector, model.wv[word])
        if nwords:
            feature_vector = np.divide(feature_vector, nwords)

        return feature_vector

    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

In [44]:
w2v_num_features = 100
# generate averaged word vector features from word2vec model
avg_wv_train_features = averaged_word2vec_vectorizer(corpus=tokenized_train, model=w2v_model,
                                                     num_features=w2v_num_features)
avg_wv_test_features = averaged_word2vec_vectorizer(corpus=tokenized_test, model=w2v_model,
                                                    num_features=w2v_num_features)

In [45]:
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape, ' Test features shape:', avg_wv_test_features.shape)


Word2Vec model:> Train features shape: (7628, 100)  Test features shape: (2748, 100)


# Modeling with deep neural networks
## Building Deep neural network architecture

In [65]:
from keras.layers import BatchNormalization

In [66]:
def construct_deepnn_architecture(num_input_features):
    dnn_model = Sequential()
    dnn_model.add(Dense(512, input_shape=(num_input_features,), kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('elu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256, kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('elu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(256, kernel_initializer='he_normal'))
    dnn_model.add(BatchNormalization())
    dnn_model.add(Activation('elu'))
    dnn_model.add(Dropout(0.2))
    
    dnn_model.add(Dense(4))
    dnn_model.add(Activation('softmax'))

    dnn_model.compile(loss='categorical_crossentropy', optimizer='adam',                 
                      metrics=['accuracy'])
    return dnn_model

In [67]:
w2v_dnn = construct_deepnn_architecture(num_input_features=w2v_num_features)

In [68]:
w2v_dnn.summary()


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 512)               51712     
_________________________________________________________________
batch_normalization_3 (Batch (None, 512)               2048      
_________________________________________________________________
activation_8 (Activation)    (None, 512)               0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 256)               131328    
_________________________________________________________________
batch_normalization_4 (Batch (None, 256)               1024      
_________________________________________________________________
activation_9 (Activation)    (None, 256)              

# Model Training, Prediction and Performance Evaluation

In [72]:
import keras
import keras.utils
from keras import utils as np_utils
from keras.utils import to_categorical

In [73]:
to_categorical(train['SECTION'])

array([[0., 0., 0., 1.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       ...,
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.]], dtype=float32)

In [74]:
batch_size = 100
w2v_dnn.fit(avg_wv_train_features, to_categorical(train['SECTION']), epochs=50, batch_size=batch_size, 
            shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x159efe57d90>

In [75]:
y_pred = w2v_dnn.predict_classes(avg_wv_test_features)
predictions = le.inverse_transform(y_pred)

In [85]:
df_dnn = pd.DataFrame (predictions)
submission['SECTION'] = df_dnn.values
filepath = 'MH_news-text_catagorzation_DNN.xlsx'
submission.to_excel(filepath, index= False)

# Implement LSTM


In [77]:
import tensorflow as tf

t = tf.keras.preprocessing.text.Tokenizer(oov_token='<UNK>')
# fit the tokenizer on the documents
t.fit_on_texts(norm_train_story)
t.word_index['<PAD>'] = 0

In [78]:
VOCAB_SIZE = len(t.word_index)

In [79]:
train_sequences = t.texts_to_sequences(norm_train_story)
test_sequences = t.texts_to_sequences(norm_test_story)
X_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=1000)
X_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=1000)

In [80]:
EMBEDDING_DIM = 300 # dimension for dense embeddings for each token
LSTM_DIM = 128 # total LSTM units

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=1000))
model.add(tf.keras.layers.SpatialDropout1D(0.1))
model.add(tf.keras.layers.LSTM(LSTM_DIM, return_sequences=False))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(4, activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=["accuracy"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1000, 300)         12036900  
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 1000, 300)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               219648    
_________________________________________________________________
dense_12 (Dense)             (None, 256)               33024     
_________________________________________________________________
dense_13 (Dense)             (None, 4)                 1028      
Total params: 12,290,600
Trainable params: 12,290,600
Non-trainable params: 0
_________________________________________________________________


In [81]:
batch_size = 100
model.fit(X_train, to_categorical(train['SECTION']), epochs=10, batch_size=batch_size, 
          shuffle=True, validation_split=0.1, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x159f5c24970>

In [82]:
pred_lstm = model.predict_classes(X_test)
pred_lstm[:20]

array([1, 2, 1, 0, 1, 1, 1, 2, 1, 2, 0, 3, 2, 1, 2, 1, 3, 2, 3, 2],
      dtype=int64)

In [84]:
df_lstm = pd.DataFrame (pred_lstm)
submission['SECTION'] = df_lstm.values
filepath = 'MH_news-text_catagorzation_lstm.xlsx'
submission.to_excel(filepath, index= False)