In [29]:
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
# Location of the dataset CSV; later cells pass this to parse_data().
path = "/content/bbc-text.csv"

# Peek at the first two physical lines of the file (the header row and the
# start of the first record) to see the column layout before parsing.
# readline() keeps the trailing newline, so each print is followed by a blank line.
with open(path, "r") as z:
  print("Header looks like: ", z.readline())
  print("Data looks like: ", z.readline())

Header looks like:  category,text

Data looks like:  tech,tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much m

In [31]:
# Built once when the cell runs rather than on every call; a frozenset gives
# constant-time membership tests instead of a linear scan of a list per word.
_STOPWORDS = frozenset([
  "a", "about", "above", "after", "again", "against", "all", "am", "an",
  "and", "any", "are", "as", "at", "be", "because", "been", "before",
  "being", "below", "between", "both", "but", "by", "could", "did", "do",
  "does", "doing", "down", "during", "each", "few", "for", "from",
  "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's",
  "her", "here", "here's", "hers", "herself", "him", "himself", "his",
  "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
  "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my",
  "myself", "nor", "of", "on", "once", "only", "or", "other", "ought",
  "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd",
  "she'll", "she's", "should", "so", "some", "such", "than", "that",
  "that's", "the", "their", "theirs", "them", "themselves", "then",
  "there", "there's", "these", "they", "they'd", "they'll", "they're",
  "they've", "this", "those", "through", "to", "too", "under", "until",
  "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were",
  "what", "what's", "when", "when's", "where", "where's", "which", "while",
  "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd",
  "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves",
])


def remove_stopwords(sentence):
  """Lowercase ``sentence`` and drop every stopword from it.

  The text is split on whitespace, so punctuation stays attached to its
  word (e.g. ``"the,"`` is not recognised as the stopword ``"the"``).

  Args:
    sentence: Input text as a ``str``.

  Returns:
    The remaining lowercase words joined by single spaces; an empty string
    if no words remain.
  """
  words = sentence.lower().split()
  return " ".join(w for w in words if w not in _STOPWORDS)

In [32]:
# Quick check: the input is lowercased and every stopword is dropped.
remove_stopwords("I am about to go to the store and get any snack")

'go store get snack'

In [33]:
def parse_data(filename):
  """Read a two-column CSV into parallel lists of texts and labels.

  The first row is treated as a header and discarded. For every following
  row, column 0 is taken as the label and column 1 as the text, which is
  passed through ``remove_stopwords`` before being stored.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A ``(sentences, labels)`` tuple of equal-length lists, in file order.

  Raises:
    IndexError: If a non-empty row has fewer than two columns.
  """
  labels = []
  sentences = []
  # newline="" is what the csv module documents for reader input: it lets
  # the reader handle line endings itself, including newlines inside quotes.
  with open(filename, "r", newline="") as f:
    reader = csv.reader(f, delimiter=",")
    # Skip the header; the default keeps an empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
      # csv.reader yields [] for a blank line; there is nothing to index.
      if not row:
        continue
      labels.append(row[0])
      sentences.append(remove_stopwords(row[1]))

  return sentences, labels

In [34]:
# Load the dataset: stopword-stripped texts and their category labels.
sentences, labels = parse_data(path)

In [35]:
# Sanity check: parse_data appends one label per sentence, so the counts match.
len(sentences), len(labels)

(2225, 2225)

In [36]:
# Whitespace-delimited word count of the first text after stopword removal.
len(sentences[0].split())

436

In [37]:
def tokenizer(sentences):
  """Create a Keras ``Tokenizer`` and fit its vocabulary on ``sentences``.

  The tokenizer is constructed with ``oov_token="<OOV>"``.

  Args:
    sentences: Iterable of text strings to build the vocabulary from.

  Returns:
    The fitted ``Tokenizer`` instance.
  """
  fitted = Tokenizer(oov_token="<OOV>")
  fitted.fit_on_texts(sentences)
  return fitted

In [38]:
# NOTE(review): this rebinds the global name `tokenizer` from the factory
# function to the fitted Tokenizer instance it returns. Re-running this cell
# on its own would call the instance rather than the function, so re-run the
# cell that defines `tokenizer` first.
tokenizer = tokenizer(sentences)

In [41]:
def get_padded_sequences(tokenizer, sentences):
  """Encode ``sentences`` as integer sequences and pad them.

  Args:
    tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
    sentences: Texts to encode.

  Returns:
    The result of ``pad_sequences`` called with its default settings on the
    encoded sequences.
  """
  return pad_sequences(tokenizer.texts_to_sequences(sentences))

In [42]:
# Encode every text with the fitted tokenizer and pad the resulting sequences.
padded = get_padded_sequences(tokenizer, sentences)

In [43]:
# Inspect the first padded sequence; the output shows leading zeros (padding).
padded[0]

array([  0,   0,   0, ..., 949,  87,  87], dtype=int32)

In [48]:
def tokenize_labels(labels):
  """Encode category labels as integers with a dedicated Keras ``Tokenizer``.

  A fresh ``Tokenizer`` (no OOV token) is fitted on ``labels`` only, so its
  vocabulary contains just the label names.

  Args:
    labels: List of label strings.

  Returns:
    A ``(label_seq, widx)`` tuple: the output of ``texts_to_sequences`` for
    ``labels``, and the tokenizer's ``word_index`` mapping.
  """
  encoder = Tokenizer()
  encoder.fit_on_texts(labels)
  return encoder.texts_to_sequences(labels), encoder.word_index

In [49]:
# Encode the category labels; widx is the label -> integer index mapping.
label_seq, widx = tokenize_labels(labels)

In [50]:
# Show the label -> index mapping built by the label tokenizer.
widx

{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}

In [53]:
# First ten encoded labels; each is a one-element list holding the label's index.
label_seq[:10]

[[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]