In [105]:
#!pip install snorkel
#!pip install textblob
import io
import pandas as pd
#Snorkel
from snorkel.labeling import LabelingFunction
import re
from snorkel.preprocess import preprocessor
from textblob import TextBlob
from snorkel.labeling import PandasLFApplier
from snorkel.labeling.model import LabelModel
from snorkel.labeling import LFAnalysis
from snorkel.labeling import filter_unlabeled_dataframe
from snorkel.labeling import labeling_function
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dense

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\serda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [106]:
import warnings
warnings.filterwarnings('ignore') 

In [107]:
import requests
import pandas as pd

url = "https://api.nytimes.com/svc/archive/v1/2020/1.json"
api_key = "API_KEY"

query_params = {'api-key': api_key}
months = range(1, 13)  # retrieve headlines for all months of the year 2020
headlines = []
dates = []
categories = []

for month in months:
    url = f"https://api.nytimes.com/svc/archive/v1/2020/{month}.json"
    response = requests.get(url, params=query_params)
    if response.status_code == 200:
        data = response.json()
        for article in data['response']['docs']:
            headlines.append(article['headline']['main'])
            dates.append(article['pub_date'][:10])
            categories.append(article['news_desk'])
    else:
        print("Error: ", response.status_code)

df = pd.DataFrame({'Headline': headlines, 'Category': categories, 'Date': dates})
print(df)


                                                Headline     Category  \
0      ‘Battling a Demon’: Drifter Sought Help Before...     National   
1                            Protect Veterans From Fraud    Editorial   
2      F.D.A. Plans to Ban Most E-Cigarette Flavors b...      Science   
3                                 ‘It’s Green and Slimy’        Games   
4                              Corrections: Jan. 1, 2020  Corrections   
...                                                  ...          ...   
55488  ‘Ratatouille,’ the Musical: How This TikTok Cr...      Weekend   
55489    Some recipe ideas for a New Year’s Eve at home.     National   
55490  Microsoft Says Russian Hackers Viewed Some of ...     Business   
55491  New in Paperback: ‘The Red Lotus’ and ‘This Is...   BookReview   
55492            The World Begins Saying Goodbye to 2020                

             Date  
0      2020-01-01  
1      2020-01-01  
2      2020-01-01  
3      2020-01-01  
4      2020-01-01  
...

In [108]:
df = df.drop(['Date'], axis=1)
df = df.drop(['Category'], axis=1)
df = df.rename(columns = {'Headline': 'text'})
df['text'] = df['text'].astype(str)
#check the data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55493 entries, 0 to 55492
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    55493 non-null  object
dtypes: object(1)
memory usage: 433.7+ KB


In [109]:
#define constants to represent the class labels :positive, negative, and abstain
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1
#define function which looks into the input words to represent a proper label
def keyword_lookup(x, keywords, label):  
    if any(word in x.text.lower() for word in keywords):
        return label
    return ABSTAIN
#define function which assigns a correct label
def make_keyword_lf(keywords, label=POSITIVE):
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label))
#resource: https://www.snorkel.org/use-cases/01-spam-tutorial#3-writing-more-labeling-functions
#these two lists can be further extended 
"""positive news might contain the following words' """
keyword_positive = make_keyword_lf(keywords=['boosts', 'great', 'develops', 'promising', 'ambitious', 'delighted', 'record', 'win', 'breakthrough', 'recover', 'achievement', 'peace', 'party', 'hope', 'flourish', 'respect', 'partnership', 'champion', 'positive', 'happy', 'bright', 'confident', 'encouraged', 'perfect', 'complete', 'assured' ])
"""negative news might contain the following words"""
keyword_negative = make_keyword_lf(keywords=['war','solidiers', 'turmoil', 'injur','trouble', 'aggressive', 'killed', 'coup', 'evasion', 'strike', 'troops', 'dismisses', 'attacks', 'defeat', 'damage', 'dishonest', 'dead', 'fear', 'foul', 'fails', 'hostile', 'cuts', 'accusations', 'victims',  'death', 'unrest', 'fraud', 'dispute', 'destruction', 'battle', 'unhappy', 'bad', 'alarming', 'angry', 'anxious', 'dirty', 'pain', 'poison', 'unfair', 'unhealthy'
                                              ], label=NEGATIVE)

In [110]:
#set up a preprocessor function to determine polarity & subjectivity using textlob pretrained classifier 
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x
#find polarity
@labeling_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.6 else ABSTAIN
#find subjectivity 
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else ABSTAIN

In [111]:
#combine all the labeling functions 
lfs = [keyword_positive, keyword_negative, textblob_polarity, textblob_subjectivity ]
#apply the lfs on the dataframe
applier = PandasLFApplier(lfs=lfs)
L_snorkel = applier.apply(df=df)
#apply the label model
label_model = LabelModel(cardinality=2, verbose=True)
#fit on the data
label_model.fit(L_snorkel)
#predict and create the labels
df["label"] = label_model.predict(L=L_snorkel)

100%|██████████████████████████████████████████████████████████████████████████| 55493/55493 [00:51<00:00, 1074.70it/s]
INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                                       | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.025]
INFO:root:[10 epochs]: TRAIN:[loss=0.010]
INFO:root:[20 epochs]: TRAIN:[loss=0.001]
INFO:root:[30 epochs]: TRAIN:[loss=0.001]
INFO:root:[40 epochs]: TRAIN:[loss=0.001]
INFO:root:[50 epochs]: TRAIN:[loss=0.000]
INFO:root:[60 epochs]: TRAIN:[loss=0.000]
INFO:root:[70 epochs]: TRAIN:[loss=0.000]
 72%|███████████████████████████████████████████████████████▍                     | 72/100 [00:00<00:00, 711.60epoch/s]INFO:root:[80 epochs]: TRAIN:[loss=0.000]
INFO:root:[90 epochs]: TRAIN:[loss=0.000]
100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 696.96epoch/s]
INFO:root:Finished Training


In [112]:
#Filtering out unlabeled data points
df= df.loc[df.label.isin([0,1]), :]
#find the label counts 
df['label'].value_counts()

1    13369
0     3679
Name: label, dtype: int64

In [113]:
#make a copy of the dataframe
data = df.copy()
#define a function which handles the text preprocessing 
def preparation_text_data(data):
    """
    This pipeline prepares the text data, conducting the following steps:
    1) Tokenization
    2) Lemmatization
    4) Removal of stopwords
    5) Removal of punctuation
    """
    # initialize spacy object
    nlp = spacy.load('en_core_web_sm')
    # select raw text
    raw_text = data.text.values.tolist()
    # tokenize
    tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]
    #define the punctuations and stop words
    punc = string.punctuation 
    stop_words = set(stopwords.words('english'))
    #lemmatize, remove stopwords and punctuationd
    corpus = []
    for doc in tqdm(tokenized_text):
        corpus.append([word.lemma_ for word in doc[0] if (word.lemma_ not in stop_words and word.lemma_ not in punc)])
    # add prepared data to df
    data["text"] = corpus
    return data
#apply the data preprocessing function
data =  preparation_text_data(data)

  0%|          | 0/17048 [00:00<?, ?it/s]

  0%|          | 0/17048 [00:00<?, ?it/s]

In [114]:
def text_representation(data):
  tfidf_vect = TfidfVectorizer()
  data['text'] = data['text'].apply(lambda text: " ".join(set(text)))
  X_tfidf = tfidf_vect.fit_transform(data['text'])
  print(X_tfidf.shape)
  print(tfidf_vect.get_feature_names())
  X_tfidf = pd.DataFrame(X_tfidf.toarray())
  return X_tfidf
#apply the TFIDV function
X_tfidf = text_representation(data)

(17048, 12721)


In [115]:
X= X_tfidf
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#fit Log Regression Model
clf= LogisticRegression()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.62      0.73      1184
           1       0.91      0.97      0.94      4442

    accuracy                           0.90      5626
   macro avg       0.89      0.80      0.83      5626
weighted avg       0.90      0.90      0.89      5626



In [116]:
new_data = ["The US imposes sanctions on Rassia because of the Ukranian war"]
tf = TfidfVectorizer()
tfdf = tf.fit_transform(data['text'])
vect = pd.DataFrame(tf.transform(new_data).toarray())
new_data = pd.DataFrame(vect)
logistic_prediction = clf.predict(new_data)
print(logistic_prediction)

[0]


In [117]:
text = list(data['text'])
labels = list(data['label'])
##sentences
training_text = text[0:13200]
testing_text = text[13200:]
##labels
training_labels = labels[0:13200]
testing_labels = labels[13200:]

In [118]:
#preprocess 
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow 
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [119]:
model = Sequential([
    Embedding(10000, 16, input_length=120),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])
##compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
 
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 120, 16)           160000    
                                                                 
 global_average_pooling1d_2   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_4 (Dense)             (None, 24)                408       
                                                                 
 dense_5 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [120]:
num_epochs = 10
history = model.fit(training_padded, 
                    training_labels, 
                    epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), 
                    verbose=2)

Epoch 1/10
413/413 - 2s - loss: 0.5425 - accuracy: 0.7766 - val_loss: 0.4836 - val_accuracy: 0.8072 - 2s/epoch - 4ms/step
Epoch 2/10
413/413 - 1s - loss: 0.5144 - accuracy: 0.7775 - val_loss: 0.4703 - val_accuracy: 0.8072 - 995ms/epoch - 2ms/step
Epoch 3/10
413/413 - 1s - loss: 0.4588 - accuracy: 0.7790 - val_loss: 0.3907 - val_accuracy: 0.8176 - 1s/epoch - 2ms/step
Epoch 4/10
413/413 - 1s - loss: 0.2986 - accuracy: 0.8736 - val_loss: 0.2593 - val_accuracy: 0.8833 - 1s/epoch - 3ms/step
Epoch 5/10
413/413 - 1s - loss: 0.1903 - accuracy: 0.9364 - val_loss: 0.2073 - val_accuracy: 0.9161 - 992ms/epoch - 2ms/step
Epoch 6/10
413/413 - 1s - loss: 0.1422 - accuracy: 0.9530 - val_loss: 0.1903 - val_accuracy: 0.9189 - 1s/epoch - 2ms/step
Epoch 7/10
413/413 - 1s - loss: 0.1143 - accuracy: 0.9630 - val_loss: 0.1812 - val_accuracy: 0.9244 - 1s/epoch - 3ms/step
Epoch 8/10
413/413 - 1s - loss: 0.0955 - accuracy: 0.9694 - val_loss: 0.1644 - val_accuracy: 0.9376 - 1s/epoch - 2ms/step
Epoch 9/10
413/413

In [121]:
new_headline = ["The US imposes sanctions on Rassia because of the Ukranian war"]
##prepare the sequences of the sentences in question
sequences = tokenizer.texts_to_sequences(new_headline)
padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
print(model.predict(padded_seqs))

[[0.05457079]]
