<a href="https://colab.research.google.com/github/solong96/solong96/blob/main/0818_cc_text_classfication.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers.rnn import LSTM, GRU,SimpleRNN
from keras.layers.core import Dense, Activation, Dropout, Embedding
from keras.layers.normalization import batch_normalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import os

In [3]:
path = '/content/drive/MyDrive/0809/'
os.chdir(path)

In [4]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.105.18.242:8470
REPLICAS:  8


In [7]:
train_dataset = pd.read_excel('0818_cc_train_token_title.xlsx')
train_dataset

Unnamed: 0.1,Unnamed: 0,Abstract,Title,category,toxic,token
0,0,By 2050s the UK is expected to experience: inc...,advent climate-change resultant energy relate ...,building,15,UK experience increase average summer mean tem...
1,1,The urban heat island effect (UHI) is a produc...,problem lack green-space rise surface temperat...,green,21,urban heat-island UHU product urbanization cer...
2,2,Water distribution systems (WDSs) contribute t...,life-cycle carbon footprint redesign water dis...,traffic,14,water distribution system wdss contribute unde...
3,3,Renewable energy is considered crucial for cli...,low carbon electricity system indium multi-obj...,energy,11,renewable energy crucial climate-change mitiga...
4,4,Intensification of the water cycle mediated by...,climate-change socio-hydrological space brahma...,water,23,intensification water cycle global warming inc...
...,...,...,...,...,...,...
19063,19063,This paper presents a multi-criteria optimizat...,multi-criterion optimisation power system low-...,energy,11,present multi-criterion optimization power sys...
19064,19064,The employment of solar space heating is a sig...,optimization solar space heating system therma...,energy,11,employment solar space heating significant mea...
19065,19065,Coal is the important support to guarantee the...,influence synergistic airflow vibration compou...,water,23,coal important support guarantee national ener...
19066,19066,Energy-efficient buildings and renewable power...,demand response flexibility passive PCM wall b...,building,15,energy-efficient building renewable power supp...


In [9]:
train = train_dataset[['Title','toxic']]
train

Unnamed: 0,Title,toxic
0,advent climate-change resultant energy relate ...,15
1,problem lack green-space rise surface temperat...,21
2,life-cycle carbon footprint redesign water dis...,14
3,low carbon electricity system indium multi-obj...,11
4,climate-change socio-hydrological space brahma...,23
...,...,...
19063,multi-criterion optimisation power system low-...,11
19064,optimization solar space heating system therma...,11
19065,influence synergistic airflow vibration compou...,23
19066,demand response flexibility passive PCM wall b...,15


In [10]:
print(train['Title'].apply(lambda x:len(str(x).split())).max())
max_len = (round(train['Title'].apply(lambda x:len(str(x).split())).max()/100)*100)+50
print(max_len)

33
50


In [None]:
def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''

    fpr, tpr, thresholds = metrics.roc_curve(target, predictions)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc

In [16]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.Title.values, train.toxic.values,
                                                  stratify=train.toxic.values,
                                                  random_state=42,
                                                  test_size=0.2, shuffle=True)

In [17]:
# using keras tokenizer here
token = text.Tokenizer(num_words=None)
# max_len = 1000

token.fit_on_texts(list(xtrain) + list(xvalid))
xtrain_seq = token.texts_to_sequences(xtrain)
xvalid_seq = token.texts_to_sequences(xvalid)

#zero pad the sequences
xtrain_pad = pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = pad_sequences(xvalid_seq, maxlen=max_len)

word_index = token.word_index

In [18]:
# Calculating the Vocabulary Size and the number of Answers

print("Vocabulary size={}".format(len(token.word_index)))
print("Number of Answers={}".format(token.document_count))

Vocabulary size=11494
Number of Answers=19068


In [19]:
le = LabelEncoder()
num_classes=10 # 1 = Society & Culture 2 = Science & Mathematics 3 = Health 4 = Education & Reference 5 = Computers & Internet 6 = Sports 7 = Business & Finance 8 = Entertainment & Music 9 = Family & Relationships 10 = Politics & Government

ytrain = le.fit_transform(ytrain)
yvalid = le.transform(yvalid)

ytrain = to_categorical(ytrain)
yvalid = to_categorical(yvalid)

In [20]:
ytrain

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [21]:
ytrain.shape

(15254, 10)

# Word Embeddings

While building our simple RNN models we talked about using word-embeddings , So what is word-embeddings and how do we get word-embeddings?
Here is the answer :
* https://www.coursera.org/learn/nlp-sequence-models/lecture/6Oq70/word-representation
* https://machinelearningmastery.com/what-are-word-embeddings/
<br> <br>
The latest approach to getting word Embeddings is using pretained GLoVe or using Fasttext. Without going into too much details, I would explain how to create sentence vectors and how can we use them to create a machine learning model on top of it and since I am a fan of GloVe vectors, word2vec and fasttext. In this Notebook, I'll be using the GloVe vectors. You can download the GloVe vectors from here http://www-nlp.stanford.edu/data/glove.840B.300d.zip or you can search for GloVe in datasets on Kaggle and add the file

In [None]:
# load the GloVe vectors in a dictionary:

embeddings_index = {}
f = open('glove.6B.300d.txt','r',encoding='utf-8')
for line in tqdm(f):
    values = line.split(' ')
    word = values[0]
    coefs = np.asarray([float(val) for val in values[1:]])
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

278935it [00:41, 8757.49it/s]

# LSTM's

## Basic Overview

Simple RNN's were certainly better than classical ML algorithms and gave state of the art results, but it failed to capture long term dependencies that is present in sentences . So in 1998-99 LSTM's were introduced to counter to these drawbacks.

## In Depth Understanding

Why LSTM's?
* https://www.coursera.org/learn/nlp-sequence-models/lecture/PKMRR/vanishing-gradients-with-rnns
* https://www.analyticsvidhya.com/blog/2017/12/fundamentals-of-deep-learning-introduction-to-lstm/

What are LSTM's?
* https://www.coursera.org/learn/nlp-sequence-models/lecture/KXoay/long-short-term-memory-lstm
* https://distill.pub/2019/memorization-in-rnns/
* https://towardsdatascience.com/illustrated-guide-to-lstms-and-gru-s-a-step-by-step-explanation-44e9eb85bf21

# Code Implementation

We have already tokenized and paded our text for input to LSTM's

In [None]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300)) # 300은 차원 수
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

100%|██████████| 48074/48074 [00:00<00:00, 485322.58it/s]


LSTM(n, input_dim = k, input_length = k)

n : 메모리 셀의 개수(기억용량, 출력형태를 결정함)

In [None]:
# %%time
with strategy.scope():

    # A simple LSTM with glove embeddings and one dense layer
    model = Sequential()
    model.add(Embedding(len(word_index) + 1,
                     300,
                     weights=[embedding_matrix],
                     input_length=max_len,
                     trainable=False))

    model.add(LSTM(100, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(15, activation='softmax'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1200, 300)         14422500  
                                                                 
 lstm (LSTM)                 (None, 100)               160400    
                                                                 
 dense (Dense)               (None, 24)                2424      
                                                                 
 dense_1 (Dense)             (None, 15)                375       
                                                                 
Total params: 14,585,699
Trainable params: 163,199
Non-trainable params: 14,422,500
_________________________________________________________________


In [None]:
model.fit(xtrain_pad, ytrain, validation_data = (xvalid_pad, yvalid), epochs=5, batch_size=64*strategy.num_replicas_in_sync)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x231e5854950>

In [None]:
scores = model.evaluate(xvalid_pad, yvalid)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 63.06%


In [None]:
model.save('15_LSTM_02.h5', save_format='h5')
model.save('15_LSTM_02.keras')

In [None]:
import pickle
pickle.dump(model, open('15_LSTM_02.pkl', 'wb'))

In [None]:
yvalid_model = model.predict(xvalid_pad)
yvalid_model

In [None]:
print(xvalid[12])
print(np.where(scores[11] == np.max(scores[11])))
print(np.where(yvalid[11] == np.max(yvalid[11])))

multiple anthropogenic stressor co-occur natural ecosystem multiple stressor study nature direction stressor interaction strength stressor examine coral alpha beta-diversity vary site gradient chronic local anthropogenic stress marine heatwave multiple stressor framework encompass non-discrete stressor examine interaction continuou discrete stressor additive antagonistic interaction heatwave-driven turnover coral community composition continuou stressor point response coral hill-richness stressor additive synergistic community-level response multiple stressor vary stressor intensity importance complex realistic continuou stressor understand stressor interaction ecological
(array([7], dtype=int64),)
(array([5], dtype=int64),)
