In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.layers import Dropout
import tensorflow as tf
import re
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import KFold, cross_val_score, RepeatedKFold
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import unicodedata
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling  import RandomOverSampler
from sklearn.metrics import accuracy_score, f1_score
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
stopwords = set(stopwords.words('english'))
import glob
# spacy for lemmatization
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()


ModuleNotFoundError: No module named 'spacy'

In [None]:
#hyperparameters
vocab_size = 10000
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
training_portion = .8

In [None]:
# stopwords
stop_words = stopwords
extra_stopwords = ['the','said','say','shares','person','useful','govtech','cio','yonhap','size','tackle','right','day','tried','tested','make','sure','used','help','yesterday','today','tomorrow','percent','per', 'cent','could','many','add','use','need','goods','million','thousand','company','retailers','saw','see','new','like','today','tomorrow','guide',
 'people','want','yet','way','time','back','whether','if','yes','older','noted','went','told','tell','younger','another','worth','noting','well','called','named','never','lee','quah','ong','ng','lim','tan','shared','says','say','said','cio', 'cios','month','top','world','zero','one','two','three','four','five','six','seven','eight','nine','ten','january', 'february', 'march', 'april', 'may', 'june', 'july',
              'august', 'september', 'october', 'november', 'december','month','months','years','year','near','also','would','able']
for word in extra_stopwords:
    stop_words.add(word)

In [None]:
path2 ='../data'
filenames = glob.glob(path2 + "/*.csv")

dfs = []
for filename in filenames:
    print(filename)
    df = pd.read_csv(filename, index_col=0)
#     df['source'] =re.search('\/([A-Za-z]+)[0-9]?\.csv',filename)[1]
    dfs.append(df)

# Concatenate all data into one DataFrame
big_frame = pd.concat(dfs, ignore_index=True)
big_frame.head()

In [None]:
def thorough_cleaning(text):
    text = ''.join(text)
    text = text.strip()
    text = " ".join(text.split())
    text = re.sub("[^0-9a-zA-Z]"," ", text)
    text = re.sub('[0-9]{2,4}', ' ', text)
    article_text = re.sub(r'\s+', ' ', text)
    article_text = ' '.join([w.lower() for w in article_text.split() if len(w) > 1 and w.lower() not in stop_words])
    article_text = unicodedata.normalize("NFKD", article_text)
    return article_text

def count_words(text):
    return len(text)

def process_data(df):
    #most articles append the header to the csv so cleaning is required
#     df = df[(df['date'] != 'date') & (df['category'] != 'category')& (df['category'] != 'nation')&(df['date'] != None)]
    #format dates
    df['date'] = pd.to_datetime(df['date'], format = '%d/%m/%Y')
    #remove duplicates and nan
    df.drop_duplicates(subset =['title', 'category'], inplace=True)
    df.dropna(axis =0, subset =['text', 'title'], inplace=True)
    #cleaning the text to convert to lists
    df['clean_text'] = df['text'].apply(lambda x:thorough_cleaning(x))
    #word count of text
    df['word_count'] = df['text'].apply(lambda x:count_words(x))
    #remove those which are too short
    df2 = df[df.word_count >=50]
    return df2.reset_index()

In [None]:
# remove all those without category
# data =df[~df.category.isna()]
# data.head()
df = process_data(big_frame)

In [None]:
mapping ={v:i for i, v in enumerate(df.category.unique())}
reverse_map = {str(i):v for i,v in  enumerate(df.category.unique())}
df['labels'] = df.category.map(mapping)
# catmap ={'digital-transformation':'digitalisation', 'analytics/ai/ml':'analytics', 'business':'business',
#        'cybersecurity':'cybersecurity', 'apps/development/platforms':'apps'}
# data.labels = data.new_cat.map(catmap)

In [None]:
 df[['title','text','category','url','blurp']].tail()

In [None]:
df

In [None]:
cat_count = {str(c): 0 for c in mapping.keys()}

In [None]:
cat_count

In [None]:
cat_count = {str(c): 0 for c in mapping.keys()}
for l in df.labels:
#     print(type(l))
    cat_count[reverse_map[str(l)]] += 1
    

In [None]:
cat_count

In [None]:
rus = RandomUnderSampler(random_state=0)
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(df[['title','clean_text']], df.labels)

In [None]:
X_resampled['labels']=y_resampled
X_resampled

In [None]:
data = pd.pivot_table(df[['title','clean_text','labels']], index=['title','clean_text'], columns=['labels'], aggfunc=len, fill_value=0).reset_index()

In [None]:
data

In [None]:
# get a list of models to evaluate
def get_models():
    models = dict()
    algo=dict()
    
    for model in [('logreg',MultiOutputClassifier(estimator=LogisticRegression(multi_class='multinomial'))),('dt',DecisionTreeClassifier()),('rf',RandomForestClassifier()),('rf2',RandomForestClassifier(n_estimators=1000)),
                  ('rf3',RandomForestClassifier(n_estimators=2000)),('gb',MultiOutputClassifier(estimator=GradientBoostingClassifier())),('knn',KNeighborsClassifier(n_neighbors=2)),('knn2',KNeighborsClassifier(n_neighbors=4))]:
        steps = [ ('tfidf', TfidfVectorizer(stop_words=stop_words)), ('m', model[1])]
        models[model[0]] = Pipeline(steps=steps)
#         print(model[0], algo)
    
    return models
 
# evaluate a given model using cross-validation
def evaluate_model(model, X, y,scoring_metric = 'accuracy'):
    cv = KFold(n_splits=10, random_state=1, shuffle = True)
    scores = cross_val_score(model, X, y, scoring=scoring_metric, cv=cv, n_jobs=-1, error_score='raise')
    return scores
 

# get the models to evaluate
models = get_models()
models

In [None]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()

scores = evaluate_model(models['logreg'], data.clean_text, data[[0,1,2,3,4]],'accuracy')
roc_auc = evaluate_model(models['logreg'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('logreg  accuracy: %.4f roc_auc:%.4f (%.4f)' % ( np.mean(scores),np.mean(roc_auc), np.std(scores)))

In [None]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()

scores = evaluate_model(models['rf'], data.clean_text, data[[0,1,2,3,4]],'accuracy')
roc_auc = evaluate_model(models['rf'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('random forest  accuracy: %.4f roc_auc:%.4f (%.4f)' % ( np.mean(scores),np.mean(roc_auc), np.std(scores)))

In [None]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()

scores = evaluate_model(models['rf2'], data.clean_text, data[[0,1,2,3,4]],'accuracy')
roc_auc = evaluate_model(models['rf2'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('random forest  accuracy: %.4f roc_auc:%.4f (%.4f)' % ( np.mean(scores),np.mean(roc_auc), np.std(scores)))

In [None]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()

scores = evaluate_model(models['rf3'], data.clean_text, data[[0,1,2,3,4]],'accuracy')
roc_auc = evaluate_model(models['rf3'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('random forest  accuracy: %.4f roc_auc:%.4f (%.4f)' % ( np.mean(scores),np.mean(roc_auc), np.std(scores)))

In [None]:
models = get_models()
# evaluate the models and store results
results, names = list(), list()

scores = evaluate_model(models['dt'], data.clean_text, data[[0,1,2,3,4]],'accuracy')
roc_auc = evaluate_model(models['dt'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('decision tree  accuracy: %.4f roc_auc:%.4f (%.4f)' % ( np.mean(scores),np.mean(roc_auc), np.std(scores)))
# plot model performance for comparison
# plt.boxplot(results, labels=names, showmeans=True)
# plt.xticks(rotation=45)
# plt.show()

In [None]:
scores = evaluate_model(models['knn'], data.clean_text, data[[0,1,2,3,4]])
roc_auc = evaluate_model(models['knn'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('KNN accuracy: %.4f roc_auc:%.4f (%.4f)' % (np.mean(scores),np.mean(roc_auc), np.std(scores)))


In [None]:
scores = evaluate_model(models['gb'], data.clean_text, data[[0,1,2,3,4]])
roc_auc = evaluate_model(models['gb'], data.clean_text, data[[0,1,2,3,4]], 'roc_auc')
results.append(scores)

print('GB accuracy: %.4f roc_auc:%.4f (%.4f)' % (np.mean(scores),np.mean(roc_auc), np.std(scores)))


#### Tokenize and form sequence
remove punctuation, transform to lower and split sentence into words
Form sequence of tokens

In [None]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

In [None]:
dict(list(word_index.items())[0:10])

In [None]:
#### Convert token into list of sequence
train_sequences = tokenizer.texts_to_sequences(texts=X_train)

#### Standardize length
For NLP texts would have to be the same length, hence padding is required to concat those text which is too long and add more text if it is too short

In [None]:
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [None]:
print(len(train_sequences[0]))
print(len(train_padded[0]))

print(len(train_sequences[1]))
print(len(train_padded[1]))

print(len(train_sequences[10]))
print(len(train_padded[10]))

In [None]:
y_train.groupby(y_train).size()

In [None]:
y_test.groupby(y_test).size()

#### Do the same transformation to validation/ test
Tokenize, make sequence and pad

In [None]:
validation_sequences = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(len(validation_sequences))
print(validation_padded.shape)

In [None]:
#as tokenize labels
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(data.labels)

training_label_seq = np.array(label_tokenizer.texts_to_sequences(y_train))
validation_label_seq = np.array(label_tokenizer.texts_to_sequences(y_test))

In [None]:
y_train

In [None]:

weight_dict = max(y_train.groupby(y_train).size())/y_train.groupby(y_train).size()

In [None]:
weight_dict= weight_dict.to_dict()
weight_dict

In [None]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('---')
print(X_train.reset_index().loc[10])

In [None]:
### Baseline model without adding any weights

### Ways to reduce overfitting

Neural networks might overfit on the training set resulting in poor performance on the test set we can reduce overfitting by
Regularization
1)L2 & L1 regularization
L1 and L2 are the most common types of regularization. These update the general cost function by adding another term known as the regularization term.
2)Dropout
At every iteration, it randomly selects some nodes and removes them along with all of their incoming and outgoing connections as shown below

In [None]:
#testing model with dropout
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    Dropout(0.25),
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
num_epochs = 10
# history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)
history = model.fit(train_padded, y_train, epochs=num_epochs, validation_data=(validation_padded, y_test), verbose=2)

In [None]:
import matplotlib.pyplot as plt
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.show()
  
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

In [None]:
#testing model with L1&L2 regularization
model = tf.keras.Sequential([
    # Add an Embedding layer expecting input vocab of size 5000, and output embedding dimension of size 64 we set at the top
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)),
#    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    # use ReLU in place of tanh function since they are very good alternatives of each other.
    tf.keras.layers.Dense(embedding_dim, activation='relu'),
    Dropout(0.25),
    tf.keras.layers.Dense(units=64,kernel_regularizer=tf.keras.regularizers.l1_l2(l1=1e-5, l2=1e-4)),
    #6, kernel_regularizer=tf.keras.regularizers.l2(0.01))
    # Add a Dense layer with 6 units and softmax activation.
    # When we have multiple outputs, softmax convert outputs layers into a probability distribution.
    tf.keras.layers.Dense(6, activation='softmax')
])
model.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
num_epochs = 10
history = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq), verbose=2)

In [None]:
import os
import tempfile
initial_weights = os.path.join(tempfile.mkdtemp(), 'initial_weights')
model.save_weights(initial_weights)

In [None]:
 label_dict= label_tokenizer.word_index
    

In [None]:
class_weight = {label_dict[k] : v for k, v in weight_dict.items()}
class_weight


In [None]:
## adding weights to the model for imbalanced classes
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_dict

In [None]:
weighted = model.fit(train_padded, training_label_seq, epochs=num_epochs, validation_data=(validation_padded, validation_label_seq),class_weight=class_weight, verbose=2)

In [None]:


# get the model
def get_model(n_inputs, n_outputs):
	model = Sequential()
	model.add(Dense(20, input_dim=n_inputs, kernel_initializer='he_uniform', activation='relu'))
	model.add(Dense(n_outputs, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam')
	return model


# evaluate a model using repeated k-fold cross-validation
def evaluate_model(X, y):
	results = list()
	n_inputs, n_outputs = X.shape[1], y.shape[1]
	# define evaluation procedure
	cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
	# enumerate folds
	for train_ix, test_ix in cv.split(X):
		# prepare data
		X_train, X_test = X[train_ix], X[test_ix]
		y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]
		# define model
		model = get_model(n_inputs, n_outputs)
		# fit model
		model.fit(X_train, y_train, verbose=0, epochs=100)
		# make a prediction on the test set
		yhat = model.predict(X_test)
		# round probabilities to class labels
		yhat = yhat.round()
		# calculate accuracy
		acc = accuracy_score(y_test, yhat)
		# store result
		print('>%.3f' % acc)
		results.append(acc)
	return results


In [None]:
vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(data.clean_text)

# evaluate model
results = evaluate_model(X.toarray(),  data[[0,1,2,3,4]])
# summarize performance
print('Accuracy: %.3f (%.3f)' % (np.mean(results), np.std(results)))

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
for train_ix, test_ix  in cv.split(X):
    print(test_ix)