In [1]:
import io
import pandas as pd
#NLP packages
import spacy
from nltk.corpus import stopwords
import string
import nltk
import nltk.tokenize
punc = string.punctuation
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#Supervised learning
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
##Deep learning libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
#load labeled headline data

df = pd.read_csv('/content/just_labels_and_lines.csv')
df = df.rename(columns = {'headlines': 'text'})
df['text'] = df['text'].astype(str)
# Remove leading and trailing whitespace from the 'headlines' column
df['text'] = df['text'].str.strip()


df.tail(n=5)

Unnamed: 0,labels,text
6623,1,Sources: Broncos to hire Morgan St. HC Wheatley
6624,1,OBJ hopes to come through for Rams - and Drake
6625,1,Dolphins to retain Boyer as defensive coordinator
6626,1,'Mattress Mack' bets big again on Bengals to win
6627,1,Fins' McDaniel: 'Extremely proud' to be biracial


In [24]:
#Filtering out unlabeled data points
df= df.loc[df.labels.isin([0,1]), :]
#find the label counts
df['labels'].value_counts()

1    3167
0    2498
Name: labels, dtype: int64

In [25]:
#make a copy of the dataframe
data = df.copy()


In [26]:
#define a function which handles the text preprocessing
def preparation_text_data(data):
    """
    This pipeline prepares the text data, conducting the following steps:
    1) Tokenization
    2) Lemmatization
    4) Removal of stopwords
    5) Removal of punctuation
    """
    # initialize spacy object
    nlp = spacy.load('en_core_web_sm')
    # select raw text
    raw_text = data.text.values.tolist()
    # tokenize
    tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]
    #define the punctuations and stop words
    punc = string.punctuation
    stop_words = set(stopwords.words('english'))
    #lemmatize, remove stopwords and punctuationd
    corpus = []
    for doc in tqdm(tokenized_text):
        corpus.append([word.lemma_ for word in doc[0] if (word.lemma_ not in stop_words and word.lemma_ not in punc)])
    # add prepared data to df
    data["text"] = corpus
    return data

In [27]:
processed_data = preparation_text_data(data)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  tokenized_text = [[nlp(i.lower().strip())] for i in tqdm(raw_text)]


  0%|          | 0/5665 [00:00<?, ?it/s]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for doc in tqdm(tokenized_text):


  0%|          | 0/5665 [00:00<?, ?it/s]

In [28]:
#copies
data1 = processed_data.copy()
data2 = processed_data.copy()

Now train the deep learning model now that the text is in an appropriate format

In [29]:
##store headlines and labels in respective lists
text = list(data2['text'])
labels = list(data2['labels'])





total_samples = len(text)
train_samples = int(total_samples * 0.75) #75% for training
test_samples = total_samples - train_samples #25% testing

#training and testing sets
training_text = text[:train_samples]
testing_text = text[train_samples:]
training_labels = labels[:train_samples]
testing_labels = labels[train_samples:]

In [43]:
#preprocess
tokenizer = Tokenizer(num_words=10000, oov_token= "<OOV>")
tokenizer.fit_on_texts(training_text)
word_index = tokenizer.word_index
training_sequences = tokenizer.texts_to_sequences(training_text)
training_padded = pad_sequences(training_sequences, maxlen=120, padding='post', truncating='post')
testing_sequences = tokenizer.texts_to_sequences(testing_text)
testing_padded = pad_sequences(testing_sequences, maxlen=120, padding='post', truncating='post')
# convert lists into numpy arrays to make it work with TensorFlow
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [31]:
import keras
from keras import layers

In [44]:
model = keras.Sequential([
    keras.layers.Embedding(10000, 16, input_length=120),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
##compile the model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 200, 16)           160000    
                                                                 
 global_average_pooling1d_3  (None, 16)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_6 (Dense)             (None, 24)                408       
                                                                 
 dense_7 (Dense)             (None, 1)                 25        
                                                                 
Total params: 160433 (626.69 KB)
Trainable params: 160433 (626.69 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [45]:
num_epochs = 10
history = model.fit(training_padded,
                    training_labels,
                    epochs=num_epochs,
                    validation_data=(testing_padded, testing_labels),
                    verbose=2)

Epoch 1/10
133/133 - 1s - loss: 0.6874 - accuracy: 0.5589 - val_loss: 0.6885 - val_accuracy: 0.5469 - 1s/epoch - 8ms/step
Epoch 2/10
133/133 - 0s - loss: 0.6844 - accuracy: 0.5631 - val_loss: 0.6883 - val_accuracy: 0.5469 - 329ms/epoch - 2ms/step
Epoch 3/10
133/133 - 0s - loss: 0.6832 - accuracy: 0.5631 - val_loss: 0.6869 - val_accuracy: 0.5469 - 363ms/epoch - 3ms/step
Epoch 4/10
133/133 - 0s - loss: 0.6811 - accuracy: 0.5631 - val_loss: 0.6846 - val_accuracy: 0.5469 - 327ms/epoch - 2ms/step
Epoch 5/10
133/133 - 0s - loss: 0.6773 - accuracy: 0.5631 - val_loss: 0.6820 - val_accuracy: 0.5469 - 366ms/epoch - 3ms/step
Epoch 6/10
133/133 - 0s - loss: 0.6675 - accuracy: 0.5631 - val_loss: 0.6729 - val_accuracy: 0.5469 - 370ms/epoch - 3ms/step
Epoch 7/10
133/133 - 0s - loss: 0.6490 - accuracy: 0.5631 - val_loss: 0.6633 - val_accuracy: 0.5469 - 366ms/epoch - 3ms/step
Epoch 8/10
133/133 - 0s - loss: 0.6196 - accuracy: 0.6297 - val_loss: 0.6303 - val_accuracy: 0.6246 - 347ms/epoch - 3ms/step
Epo

In [40]:

#test on new headline
new_headline = ["Chargers' Jim Harbaugh calls J.J. McCarthy best QB in NFL draft"]
sequences = tokenizer.texts_to_sequences(new_headline)
padded_seqs = pad_sequences(sequences, maxlen=120, padding='post', truncating='post')
print(model.predict(padded_seqs))

[[0.7868246]]
