In [20]:
import pandas as pd

# Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function

# NLP packages
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemm = WordNetLemmatizer()
import string
import nltk
import nltk.tokenize

# punc = string.punctuation
# nltk.download('stopwords')
# stop_words = set(stopwords.words('english'))

# Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [7]:
# store the dataset as a Pandas Dataframe
df = pd.read_csv('datasets/abcnews-date-text.csv', error_bad_lines=False)
#conduct some data cleaning
df = df.drop(['publish_date'], axis=1)
df = df.rename(columns={'headline_text': 'text'})
# df['text'] = df['text'].astype(str)
#check the data info
df.info()



  df = pd.read_csv('datasets/abcnews-date-text.csv', error_bad_lines=False)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1244184 entries, 0 to 1244183
Data columns (total 1 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   text    1244184 non-null  object
dtypes: object(1)
memory usage: 9.5+ MB


In [8]:
df

Unnamed: 0,text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers
...,...
1244179,two aged care residents die as state records 2...
1244180,victoria records 5;919 new cases and seven deaths
1244181,wa delays adopting new close contact definition
1244182,western ringtail possums found badly dehydrate...


In [9]:
# define constants to represent the class labels :positive, negative, and abstain
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1


# define function which looks into the input words to represent a proper label
def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN


# define function which assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))


#resource: https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions
#these two lists can be further extended
"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(
    keywords=['boosts', 'great', 'develops', 'promising', 'ambitious', 'delighted', 'record', 'win', 'breakthrough',
              'recover', 'achievement', 'peace', 'party', 'hope', 'flourish', 'respect', 'partnership', 'champion',
              'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured'])
"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(
    keywords=['war', 'solidiers', 'turmoil', 'injur', 'trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike',
              'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails',
              'hostile', 'cuts', 'accusations', 'victims', 'death', 'unrest', 'fraud', 'dispute', 'destruction',
              'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair',
              'unhealthy'
              ], label=NEGATIVE)

In [10]:
# set up a preprocessor function to determine polarity & subjectivity using textlob pretrained classifier
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x


# find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN


# find subjectivity
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [11]:
# combine all the labeling functions
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity]

# apply the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)

# apply the label model
label_model = LabelModel(cardinality=2, verbose=True)

# fit on the data
label_model.fit(L_snorkel)

# predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

 84%|████████▍ | 1046879/1244184 [23:34<04:17, 765.30it/s] 

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 348005 entries, 1 to 1244182
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    348005 non-null  object
 1   label   348005 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 6.6+ MB


In [14]:
# Filtering out unlabeled data points
df = df.loc[df.label.isin([0, 1]), :]

# find the label counts
df['label'].value_counts()

1    238018
0    109987
Name: label, dtype: int64

In [18]:
df.to_csv("data_nlp.csv", encoding='utf-8', index=False)

In [23]:
def toLower(data):
    if isinstance(data, float):
        return '<UNK>'
    else:
        return data.lower()


stop_words = stopwords.words("english")


def remove_stopwords(text):
    no_stop = []
    for word in text.split(' '):
        if word not in stop_words:
            no_stop.append(word)
    return " ".join(no_stop)


def remove_punctuation_func(text):
    return re.sub(r'[^a-zA-Z0-9]', ' ', text)

In [24]:
X = df['text']
y = df['label']
X = X.apply(toLower)
X = X.apply(remove_stopwords)
X = X.apply(lambda x: lemm.lemmatize(x))
X = X.apply(remove_punctuation_func)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [25]:
# preprocess
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(X_train)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow
training_padded = np.array(training_padded)
training_labels = np.array(y_train)
testing_padded = np.array(testing_padded)
testing_labels = np.array(y_test)

In [26]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(10000, 16, input_length=120),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_1   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_2 (Dense)             (None, 24)                408       
                                                                 
 dense_3 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [27]:
num_epochs = 5
history = model.fit(training_padded,
                    training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=2)

Epoch 1/5
8157/8157 - 34s - loss: 0.2106 - accuracy: 0.9100 - val_loss: 0.0788 - val_accuracy: 0.9750 - 34s/epoch - 4ms/step
Epoch 2/5
8157/8157 - 28s - loss: 0.0672 - accuracy: 0.9791 - val_loss: 0.0621 - val_accuracy: 0.9811 - 28s/epoch - 3ms/step
Epoch 3/5
8157/8157 - 32s - loss: 0.0572 - accuracy: 0.9827 - val_loss: 0.0627 - val_accuracy: 0.9827 - 32s/epoch - 4ms/step
Epoch 4/5
8157/8157 - 27s - loss: 0.0533 - accuracy: 0.9840 - val_loss: 0.0589 - val_accuracy: 0.9836 - 27s/epoch - 3ms/step
Epoch 5/5
8157/8157 - 27s - loss: 0.0509 - accuracy: 0.9848 - val_loss: 0.0625 - val_accuracy: 0.9801 - 27s/epoch - 3ms/step


In [28]:
new_headline = ["The US imposes sanctions on Rassia because of the Ukranian war"]
# prepare the sequences of the sentences in question
sequences = tokenizer.texts_to_sequences(new_headline)
padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
print(model.predict(padded_seqs))

[[0.00845004]]
