# Preprocessing Cleaned TDS Data for NLP Analysis 
### *Claps as Target Variable*

In [1]:
import pandas as pd
import numpy as np


#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction import text
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
#nltk.download('stopwords')

In [3]:
filename = '../data/tds_cleaned_no_outliers.csv'

In [4]:
df = pd.read_csv(filename).drop(columns = 'Unnamed: 0')

In [5]:
df.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_url,story_url,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs,len_paragraphs,len_title,days_live
0,2020-10-01,Introduction to System Design — 1,"In the series of post on System Design, I am w...",14,0,https://towardsdatascience.com/@mohitdtumce,https://towardsdatascience.com/introduction-to...,4,2,"['Horizontal Scaling vs Vertical Scaling', 'In...",22,"['Mohit Sharma', 'Oct 1, 2020·4 min read', 'In...",3667,33,212
1,2020-10-01,How To Build And Deploy Your Dashboard With Py...,Hands-on Tutorials,203,2,https://towardsdatascience.com/@matgonzalez,https://towardsdatascience.com/how-to-build-an...,5,1,['How To Build And Deploy Your Dashboard With ...,27,"['Matt', 'Oct 1, 2020·5 min read', 'The visual...",4916,77,212
2,2020-10-01,The Magic of Python Context Managers,Getting Started,360,3,https://towardsdatascience.com/@martin.heinz,https://towardsdatascience.com/the-magic-of-py...,8,13,"['The Magic of Python Context Managers', 'What...",46,"['Martin Heinz', 'Oct 1, 2020·8 min read', 'Re...",11644,36,212
3,2020-10-01,Generating Image Segmentation Masks — The Easy...,…in under 5 Minutes,88,1,https://towardsdatascience.com/@abhiroop.talasila,https://towardsdatascience.com/generating-imag...,5,4,['Generating Image Segmentation Masks — The Ea...,19,"['Abhiroop Talasila', 'Oct 1, 2020·5 min read'...",3817,50,212
4,2020-10-01,How to go from Bayes’Theorem to Bayesian Infer...,-,105,1,https://towardsdatascience.com/@jimip6c12,https://towardsdatascience.com/how-to-go-from-...,10,5,['How to go from Bayes’Theorem to Bayesian Inf...,49,"['JimSpark', 'Oct 1, 2020·10 min read', 'When ...",13662,50,212


In [6]:
df.columns

Index(['date', 'title', 'subtitle', 'claps', 'responses', 'author_url',
       'story_url', 'reading_time (mins)', 'number_sections', 'section_titles',
       'number_paragraphs', 'paragraphs', 'len_paragraphs', 'len_title',
       'days_live'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9290 entries, 0 to 9289
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   date                 9290 non-null   object
 1   title                9290 non-null   object
 2   subtitle             9290 non-null   object
 3   claps                9290 non-null   int64 
 4   responses            9290 non-null   int64 
 5   author_url           9290 non-null   object
 6   story_url            9290 non-null   object
 7   reading_time (mins)  9290 non-null   int64 
 8   number_sections      9290 non-null   int64 
 9   section_titles       9290 non-null   object
 10  number_paragraphs    9290 non-null   int64 
 11  paragraphs           9290 non-null   object
 12  len_paragraphs       9290 non-null   int64 
 13  len_title            9290 non-null   int64 
 14  days_live            9290 non-null   int64 
dtypes: int64(8), object(7)
memory usage: 1.1+ MB


In [8]:
#remove non-text, non-Y columns
df = df.drop(columns=['date', 'responses', 'author_url',
       'story_url', 'reading_time (mins)', 'number_sections', 
       'number_paragraphs'])

#### Combine Title & Subtitle

In [9]:
df['combined_title'] = (df['title'] + ' ' + df['subtitle'])

In [10]:
df = df.drop(columns = ['title', 'subtitle']).reset_index(drop=True)

#### Remove Punctuation & Make Lowercase

In [11]:
text_cols = ['section_titles', 'paragraphs', 'combined_title']

In [12]:
for col in text_cols:
    df[col] = df[col].map(lambda x: re.sub("[''·,\.!?'-]", '', x).lower())

#### Remove the 'Min Read' Phrase at the Beginning of Each Paragraph

In [13]:
#remove all text up to and including 'min read' in the beginning of the paragraphs
df['paragraphs'] = df['paragraphs'].map(lambda x: (x[x.index('min read')+9:] if (x.count('min read') > 0) else x))
                     

#### Remove the Brackets Surrounding The Section Titles

In [14]:
#remove brackets surrounding the section titles
df['section_titles'] = df['section_titles'].map(lambda x: x.replace("[",""))
df['section_titles'] = df['section_titles'].map(lambda x: x.replace("]",""))

In [15]:
df.head()

Unnamed: 0,claps,section_titles,paragraphs,len_paragraphs,len_title,days_live,combined_title
0,14,horizontal scaling vs vertical scaling in hous...,in the series of posts on system design i am w...,3667,33,212,introduction to system design — 1 in the serie...
1,203,how to build and deploy your dashboard with py...,the visualization of the data allows us to qui...,4916,77,212,how to build and deploy your dashboard with py...
2,360,the magic of python context managers what is c...,resource management is one of those things you...,11644,36,212,the magic of python context managers getting s...
3,88,generating image segmentation masks — the easy...,if you’re reading this then you probably know ...,3817,50,212,generating image segmentation masks — the easy...
4,105,how to go from bayes’theorem to bayesian infer...,when i was a statistics rookie and tried to le...,13662,50,212,how to go from bayes’theorem to bayesian infer...


In [16]:
df['paragraphs'][15][:1000]

'those working with neural networks know how complicated object detection techniques can be it is no wonder there is no straight forward resource for training them you are always required to convert your data to a cocolike json or some other unwanted format it is never a plug and play experience moreover no diagram thoroughly explains faster rcnn or yolo as there is for unet or resnet there are just too many details while these models are quite messy the explanation for their lack of simplicity is quite straight forward it fits in a single sentence: neural networks have fixedsized outputs in object detection you can’t know a priori how many objects there are in a scene there might be one two twelve or none the following images all have the same resolution but feature different numbers of objects the one million dollar question is: how can we build variablesized outputs out of fixedsized networks plus how are we supposed to train a variable number of answers and loss terms how can we pe

#### Tokenize Text

In [17]:
w_tokenizer = WhitespaceTokenizer()

In [18]:
for col in text_cols:
    df[col] = df[col].apply(lambda x: w_tokenizer.tokenize(x))

In [19]:
df.head()

Unnamed: 0,claps,section_titles,paragraphs,len_paragraphs,len_title,days_live,combined_title
0,14,"[horizontal, scaling, vs, vertical, scaling, i...","[in, the, series, of, posts, on, system, desig...",3667,33,212,"[introduction, to, system, design, —, 1, in, t..."
1,203,"[how, to, build, and, deploy, your, dashboard,...","[the, visualization, of, the, data, allows, us...",4916,77,212,"[how, to, build, and, deploy, your, dashboard,..."
2,360,"[the, magic, of, python, context, managers, wh...","[resource, management, is, one, of, those, thi...",11644,36,212,"[the, magic, of, python, context, managers, ge..."
3,88,"[generating, image, segmentation, masks, —, th...","[if, you’re, reading, this, then, you, probabl...",3817,50,212,"[generating, image, segmentation, masks, —, th..."
4,105,"[how, to, go, from, bayes’theorem, to, bayesia...","[when, i, was, a, statistics, rookie, and, tri...",13662,50,212,"[how, to, go, from, bayes’theorem, to, bayesia..."


#### Remove Special Characters 

In [20]:
for col in text_cols:
    df[col] = df[col].replace('[^\w]','',regex=True)

In [21]:
df.head()

Unnamed: 0,claps,section_titles,paragraphs,len_paragraphs,len_title,days_live,combined_title
0,14,"[horizontal, scaling, vs, vertical, scaling, i...","[in, the, series, of, posts, on, system, desig...",3667,33,212,"[introduction, to, system, design, —, 1, in, t..."
1,203,"[how, to, build, and, deploy, your, dashboard,...","[the, visualization, of, the, data, allows, us...",4916,77,212,"[how, to, build, and, deploy, your, dashboard,..."
2,360,"[the, magic, of, python, context, managers, wh...","[resource, management, is, one, of, those, thi...",11644,36,212,"[the, magic, of, python, context, managers, ge..."
3,88,"[generating, image, segmentation, masks, —, th...","[if, you’re, reading, this, then, you, probabl...",3817,50,212,"[generating, image, segmentation, masks, —, th..."
4,105,"[how, to, go, from, bayes’theorem, to, bayesia...","[when, i, was, a, statistics, rookie, and, tri...",13662,50,212,"[how, to, go, from, bayes’theorem, to, bayesia..."


#### Remove English Stopwords

In [22]:
stop_words = set(stopwords.words('english')) 

In [23]:
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [24]:
for col in text_cols:
    df[col] = df[col].apply(remove_stopwords)

#### Lemmatize

Uncomment this section to save the file with lemmatized text.

In [25]:
# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(w) for w in text]

In [26]:
# for col in text_cols:
#     df[col] = df[col].apply(lemmatize_text)

In [27]:
# save as lemmatized
# output_filename = '../data/tds_nltk_lemmatized_preproc.csv'

#### Porter Stemmer
Uncomment this section to save the file with stemmed text.

In [28]:
stemmer = PorterStemmer()
def stem_text(text):
    return [stemmer.stem(w) for w in text]

In [29]:
for col in text_cols:
    df[col] = df[col].apply(stem_text)

In [37]:
output_filename = '../data/tds_nltk_stemmed_preproc.csv'

#### No Stemming
Uncomment this section to save the file without any lemmatization or stemmed. 

In [31]:
# #output filename for no lemmatizing or stemming
# output_filename = '../data/tds_nltk_nostem_preproc_strings.csv'

#### Combine Text Columns

In [32]:
df['text'] = df['combined_title'] + df['section_titles'] + df['paragraphs']

In [33]:
tds = df.drop(columns = ['section_titles', 'paragraphs', 'combined_title'])

In [34]:
tds['text'] = [' '.join(x) for x in tds['text']]

In [35]:
tds.head()

Unnamed: 0,claps,len_paragraphs,len_title,days_live,text
0,14,3667,33,212,introduct system design — 1 seri post system d...
1,203,4916,77,212,build deploy dashboard python googl sheet vuej...
2,360,11644,36,212,magic python context manag get start magic pyt...
3,88,3817,50,212,gener imag segment mask — easi way …in 5 minut...
4,105,13662,50,212,go bayes’theorem bayesian infer go bayes’theor...


#### Save Pre-Processed File

In [38]:
tds.to_csv(output_filename)

In [41]:
tds['text'][15][104:1100]

'work neural network know complic object detect techniqu wonder straight forward resourc train alway requir convert data cocolik json unwant format never plug play experi moreov diagram thoroughli explain faster rcnn yolo unet resnet mani detail model quit messi explan lack simplic quit straight forward fit singl sentence: neural network fixeds output object detect can’t know priori mani object scene might one two twelv none follow imag resolut featur differ number object one million dollar question is: build variables output fixeds network plu suppos train variabl number answer loss term penal wrong predict creat output vari size two approach domin literature: “one size fit all” approach output broad suffic applic “lookahead” idea search regionsofinterest classifi made term 😄 practic known “onestage” “twostage” approach tad less selfexplanatori overfeat yolo ssd retinanet etc can’t variables output shall return output larg alway larger need prune excess whole idea take greedi rout '