# Import Libaries
In this notebook sample data is improted and cleaned. The new cleaned data is then divided into different types of text representation: BOW, Text Vectors and raw text.

In [23]:
#from google.colab import drive, files

import pandas as pd

#word cleaning
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

#data prep and dl modelling
import tensorflow as tf


#save encoded data
from numpy import savetxt

#used to count words
from collections import Counter

In [2]:
nltk.download('stopwords') #to remove common words
nltk.download('wordnet') #for WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
#get data from google drive
#drive.mount('/content/drive')

Mounted at /content/drive


# Utility Functions

In [11]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text): #ref:https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
    """
      outputs a cleaned string of text from an input string of text
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing.
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwords from text
    lemmatizer.lemmatize(text) # reduce to root word
    return text

truncate = 128
def truncate_text(text):
    text = text[:truncate] # reduce string length

# Import Sample

In [4]:
df = pd.read_csv('/content/drive/MyDrive/data/guardian_articles_10_perc.csv')

In [5]:
df['webTitle'] = df['webTitle'].astype(str)

In [6]:
df['bodyContent'] = df['bodyContent'].astype(str)

In [7]:
df['sectionName'] = df['sectionName'].astype(str)

In [8]:
df.webTitle[50]

'How do I ... know if I have a mental illness?'

In [9]:
df.bodyContent[50]

'Mental health problems affect one in four of us at any one time. Though accurate figures can be difficult to obtain, it is estimated that 450 million people worldwide have a mental health problem. What is mental illness? There are more than 200 clinically diagnosable mental health conditions, very roughly organised into five major categories. These are: mood disorders, anxiety disorders, schizophrenia and psychotic disorders, eating disorders and dementia. Depression is the most common mental illness. The World Health Organisation estimates that by 2020 depression will be the second leading cause of disability globally, after heart disease. Other common mental illnesses include: general anxiety disorder, bipolar disorder, schizophrenia and anorexia. Lesser known, but just as debilitating, conditions include trichotillomania (a compulsion to pull out one’s hair) and pica (the eating of non-edible items). Different conditions are more prevalent in different parts of the globe. Obsessive

In [10]:
df.sectionName[50]

'UK news'

# Clean Sample Data

In [12]:
df.iloc[:, 0] = df.iloc[:,0].apply(clean_text)

In [13]:
df.iloc[:, 1] = df.iloc[:,1].apply(clean_text)

In [14]:
df.iloc[:, 2] = df.iloc[:,2].apply(clean_text)

In [15]:
df.webTitle[50]

'know mental illness'

In [16]:
df.bodyContent[50]

'mental health problems affect one four us one time though accurate figures difficult obtain estimated 450 million people worldwide mental health problem mental illness 200 clinically diagnosable mental health conditions roughly organised five major categories mood disorders anxiety disorders schizophrenia psychotic disorders eating disorders dementia depression common mental illness world health organisation estimates 2020 depression second leading cause disability globally heart disease common mental illnesses include general anxiety disorder bipolar disorder schizophrenia anorexia lesser known debilitating conditions include trichotillomania compulsion pull ones hair pica eating nonedible items different conditions prevalent different parts globe obsessive compulsive disorder three times likely occur latin america africa japan high rates schizophrenia know unwell normal feel sad angry upset frustrated confused know experiencing indicative serious problem starts screw life says simon

In [17]:
df.sectionName[50]

'uk news'

In [18]:
df["webTitle_bodyContent"] = df['webTitle'].astype(str) +" "+ df['bodyContent'].astype(str)

In [19]:
#drop the columns we don't need for further analysis/modelling
df = df.drop(columns=['bodyContent'])

In [20]:
#reorder columns so we have our features X and our target y
df = df[['webTitle', 'webTitle_bodyContent', 'sectionName']]

In [21]:
df.head()

Unnamed: 0,webTitle,webTitle_bodyContent,sectionName
0,saido berahino right attitude hes fit says wes...,saido berahino right attitude hes fit says wes...,football
1,angelique kerber aims dislodge serena williams...,angelique kerber aims dislodge serena williams...,sport
2,family building refugee shadow isis,family building refugee shadow isis 9 june 201...,world news
3,exeter keep saracens sights bonus point win wo...,exeter keep saracens sights bonus point win wo...,sport
4,exposed photographys fabulous fakes,exposed photographys fabulous fakes 1840 hippo...,art design


# Explore Data

In [37]:
# To define our vocab we need to count how many unique words are in the dataset.
# since we combine webTitle and bodyContent we can use the combined column
feature_words = Counter()
df['webTitle'].str.lower().str.split().apply(feature_words.update)

max_feature_words = len(feature_words)
print("webTitle vocab size: " + str(max_feature_words))

webTitle vocab size: 23681


In [38]:
feature_words = Counter()
df['webTitle_bodyContent'].str.lower().str.split().apply(feature_words.update)

max_feature_words = len(feature_words)
print("webTitle_bodyContent vocab size: " + str(max_feature_words))

webTitle_bodyContent vocab size: 200451


In [30]:
target_words = Counter()
df['sectionName'].str.lower().str.split().apply(target_words.update)

max_target_classes = len(target_words)
print("class size: " + str(max_target_classes))

class size: 121


In [36]:
# we have 14,983 instances
len(df['sectionName'])

14983

In [31]:
# Find how many words per row a.k.a the sequence length, but exclude counting spaces
# we will use this to pad the output
seq_len_wt = int(df.iloc[:, 0].map(len).max())
seq_len_wt_bc = int(df.iloc[:, 1].map(len).max())
print("WebTitle: " + str(seq_len_wt) + ", webTitlebodyContent: " + str(seq_len_wt_bc)) #max length of each example

WebTitle: 128, webTitlebodyContent: 59755


# Save Cleaned Datasets
Below we save our cleaned datasets to disk so we can reuse them later when building our models. Unfortunately these sets are much larger than the orginal 10% sample we took. This is one of the drawbacks of text representation such as bag-of-words. Code for saving these files were all ran individually and then commented out to not exceed colab free account restrictions.

In [32]:
# Save the pandas DataFrame to a CSV file
df.webTitle.to_csv('./data/X_raw_wt.csv', index=False)#822.4kb #/content/drive/MyDrive

In [33]:
# Save the pandas DataFrame to a CSV file
df.webTitle_bodyContent.to_csv('./data/X_raw_wt_bc.csv', index=False) #/content/drive/MyDrive/

In [34]:
# Save the pandas DataFrame to a CSV file
df.sectionName.to_csv('./data/y_raw.csv', index=False) #/content/drive/MyDrive

## Text to Sequence

In [None]:
wt_sequences = encode_wt.texts_to_sequences(df.webTitle.values)
X_t2s_wt = pad_sequences(wt_sequences, padding='post', truncating='post', maxlen=seq_len_wt)

In [None]:
X_t2s_wt

array([[11115, 11116,   118, ...,     0,     0,     0],
       [11119, 11120,  1430, ...,     0,     0,     0],
       [  105,  1775,   534, ...,     0,     0,     0],
       ...,
       [ 3954,   215,   252, ...,     0,     0,     0],
       [ 1536,  4797,    42, ...,     0,     0,     0],
       [11006,   911, 10361, ...,     0,     0,     0]], dtype=int32)

In [None]:
X_t2s_wt.shape

(14983, 128)

In [None]:
wt_bc_sequences = encode_wt_bc.texts_to_sequences(df.webTitle_bodyContent.values)
X_t2s_wt_bc = pad_sequences(wt_bc_sequences, padding='post', truncating='post', maxlen=seq_len_wt_bc)

In [None]:
X_t2s_wt_bc

array([[42542, 37092,    62, ...,     0,     0,     0],
       [26864, 22351,  3413, ...,     0,     0,     0],
       [   79,   576,  2128, ...,     0,     0,     0],
       ...,
       [11541,   662,  1212, ...,     0,     0,     0],
       [ 3002, 10330,   206, ...,     0,     0,     0],
       [12301,   211, 24372, ...,     0,     0,     0]], dtype=int32)

In [None]:
X_t2s_wt_bc.shape

(14983, 59755)