# Preprocessing Cleaned TDS Data for NLP Analysis 
### *Log_Claps as Target Variable*

In [1]:
import pandas as pd
import numpy as np

#nlp
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer, WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction import text
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelinsler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
filename = 'data/cleaned_with_log_claps.csv'

In [4]:
df = pd.read_csv(filename)

In [5]:
df.head()

Unnamed: 0,date,title,subtitle,claps,responses,author_url,story_url,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs,log_claps
0,01/01/2021,7 Most Recommended Skills to Learn in 2021 to ...,Recommended by some of the largest…,1000,10,https://towardsdatascience.com/@terenceshin,https://towardsdatascience.com/7-most-recommen...,6,11,['7 Most Recommended Skills to Learn in 2021 t...,36,"['Terence Shin', 'Jan 1·6 min read', 'Happy Ne...",6.907755
1,01/01/2021,The Ultimate Guide to Acing Coding Interviews ...,Data Science Interview,489,4,https://towardsdatascience.com/@emmading,https://towardsdatascience.com/the-ultimate-gu...,11,12,['The Ultimate Guide to Acing Coding Interview...,42,"['Emma Ding', 'Jan 1·11 min read', 'Written by...",6.192362
2,01/01/2021,Shakespeare versus Eminem— who’s the better ly...,"He is known for his poetry, his writings on life…",139,2,https://towardsdatascience.com/@jeroenvanzeeland,https://towardsdatascience.com/shakespeare-ver...,9,13,['Shakespeare versus Eminem—who’s the better l...,64,"['Jeroen van Zeeland', 'Jan 1·9 min read', 'Da...",4.934474
3,01/01/2021,Customer Segmentation in Online Retail,A detailed step-by-step explanation on perform...,159,1,https://towardsdatascience.com/@rahulkhandelwal,https://towardsdatascience.com/customer-segmen...,19,15,"['Customer Segmentation in Online Retail', 'Un...",93,"['Rahul Khandelwal', 'Jan 1·19 min read', 'In ...",5.068904
4,01/01/2021,Implementing VisualTtransformer in PyTorch,"Hi guys, happy new year! Today we are going to...",133,2,https://towardsdatascience.com/@FrancescoZ,https://towardsdatascience.com/implementing-vi...,6,6,['Implementing Vision Transformer (ViT) in PyT...,60,"['Francesco Zuppichini', 'Jan 1·6 min read', '...",4.890349


In [6]:
df.columns

Index(['date', 'title', 'subtitle', 'claps', 'responses', 'author_url',
       'story_url', 'reading_time (mins)', 'number_sections', 'section_titles',
       'number_paragraphs', 'paragraphs', 'log_claps'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9804 entries, 0 to 9803
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   date                 9804 non-null   object 
 1   title                9804 non-null   object 
 2   subtitle             9804 non-null   object 
 3   claps                9804 non-null   int64  
 4   responses            9804 non-null   object 
 5   author_url           9804 non-null   object 
 6   story_url            9804 non-null   object 
 7   reading_time (mins)  9804 non-null   int64  
 8   number_sections      9804 non-null   int64  
 9   section_titles       9804 non-null   object 
 10  number_paragraphs    9804 non-null   int64  
 11  paragraphs           9804 non-null   object 
 12  log_claps            9804 non-null   float64
dtypes: float64(1), int64(4), object(8)
memory usage: 995.8+ KB


In [8]:
#remove non-text, non-Y columns
#in this case, we are modeling log_claps so we can remove claps
df = df.drop(columns=['date', 'responses', 'author_url',
       'story_url', 'reading_time (mins)', 'number_sections', 
       'number_paragraphs', 'claps'])

#### Combine Title & Subtitle

In [9]:
df['combined_title'] = (df['title'] + ' ' + df['subtitle'])

In [10]:
df = df.drop(columns = ['title', 'subtitle']).reset_index(drop=True)

#### Remove Punctuation & Make Lowercase

In [11]:
text_cols = ['section_titles', 'paragraphs', 'combined_title']

In [12]:
for col in text_cols:
    df[col] = df[col].map(lambda x: re.sub("[''·,\.!?'-]", '', x).lower())

#### Remove the 'Min Read' Phrase at the Beginning of Each Paragraph

In [13]:
#remove all text up to and including 'min read' in the beginning of the paragraphs
df['paragraphs'] = df['paragraphs'].map(lambda x: (x[x.index('min read')+9:] if (x.count('min read') > 0) else x))
                     

#### Remove the Brackets Surrounding The Section Titles

In [14]:
#remove brackets surrounding the section titles
df['section_titles'] = df['section_titles'].map(lambda x: x.replace("[",""))
df['section_titles'] = df['section_titles'].map(lambda x: x.replace("]",""))

In [15]:
df.head()

Unnamed: 0,section_titles,paragraphs,log_claps,combined_title
0,7 most recommended skills to learn in 2021 to ...,happy new year to kick off 2021 i wanted to sh...,6.907755,7 most recommended skills to learn in 2021 to ...
1,the ultimate guide to acing coding interviews ...,written by emma ding and rob wang data science...,6.192362,the ultimate guide to acing coding interviews ...
2,shakespeare versus eminem—who’s the better lyr...,data science has crept into every conceivable ...,4.934474,shakespeare versus eminem— who’s the better ly...
3,customer segmentation in online retail underst...,in this article i am going to write about how ...,5.068904,customer segmentation in online retail a detai...
4,implementing vision transformer (vit) in pytor...,hi guys happy new year today we are going to i...,4.890349,implementing visualttransformer in pytorch hi ...


#### Tokenize Text

In [16]:
w_tokenizer = WhitespaceTokenizer()

In [17]:
for col in text_cols:
    df[col] = df[col].apply(lambda x: w_tokenizer.tokenize(x))

In [18]:
df.head()

Unnamed: 0,section_titles,paragraphs,log_claps,combined_title
0,"[7, most, recommended, skills, to, learn, in, ...","[happy, new, year, to, kick, off, 2021, i, wan...",6.907755,"[7, most, recommended, skills, to, learn, in, ..."
1,"[the, ultimate, guide, to, acing, coding, inte...","[written, by, emma, ding, and, rob, wang, data...",6.192362,"[the, ultimate, guide, to, acing, coding, inte..."
2,"[shakespeare, versus, eminem—who’s, the, bette...","[data, science, has, crept, into, every, conce...",4.934474,"[shakespeare, versus, eminem—, who’s, the, bet..."
3,"[customer, segmentation, in, online, retail, u...","[in, this, article, i, am, going, to, write, a...",5.068904,"[customer, segmentation, in, online, retail, a..."
4,"[implementing, vision, transformer, (vit), in,...","[hi, guys, happy, new, year, today, we, are, g...",4.890349,"[implementing, visualttransformer, in, pytorch..."


#### Remove Special Characters 

In [19]:
for col in text_cols:
    df[col] = df[col].replace('[^\w]','',regex=True)

In [20]:
df.head()

Unnamed: 0,section_titles,paragraphs,log_claps,combined_title
0,"[7, most, recommended, skills, to, learn, in, ...","[happy, new, year, to, kick, off, 2021, i, wan...",6.907755,"[7, most, recommended, skills, to, learn, in, ..."
1,"[the, ultimate, guide, to, acing, coding, inte...","[written, by, emma, ding, and, rob, wang, data...",6.192362,"[the, ultimate, guide, to, acing, coding, inte..."
2,"[shakespeare, versus, eminem—who’s, the, bette...","[data, science, has, crept, into, every, conce...",4.934474,"[shakespeare, versus, eminem—, who’s, the, bet..."
3,"[customer, segmentation, in, online, retail, u...","[in, this, article, i, am, going, to, write, a...",5.068904,"[customer, segmentation, in, online, retail, a..."
4,"[implementing, vision, transformer, (vit), in,...","[hi, guys, happy, new, year, today, we, are, g...",4.890349,"[implementing, visualttransformer, in, pytorch..."


#### Remove English Stopwords

In [21]:
stop_words = set(stopwords.words('english')) 

In [22]:
def remove_stopwords(text):
    return [word for word in text if word not in stop_words]

In [23]:
for col in text_cols:
    df[col] = df[col].apply(remove_stopwords)

#### Lemmatize

Uncomment this section to save the file with lemmatized text.

In [24]:
# def lemmatize_text(text):
#     lemmatizer = WordNetLemmatizer()
#     return [lemmatizer.lemmatize(w) for w in text]

In [25]:
# for col in text_cols:
#     df[col] = df[col].apply(lemmatize_text)

In [26]:
# save as lemmatized
# output_filename = 'data/nlp_nltk_lemmatized_preproc_log.csv'

#### Porter Stemmer
Uncomment this section to save the file with stemmed text.

In [27]:
# def stem_text(text):
#     stemmer = PorterStemmer()
#     return [stemmer.stem(w) for w in text]

In [28]:
# for col in text_cols:
#     df[col] = df[col].apply(stem_text)

In [29]:
# output_filename = 'data/nlp_nltk_stemmed_preproc_log.csv'

#### No Stemming
Uncomment this section to save the file without any lemmatization or stemming

In [30]:
#output filename for no lemmatizing or stemming
output_filename = 'data/nlp_nltk_nostem_preproc_strings_log.csv'

#### Combine Text Columns

In [31]:
df['text'] = df['combined_title'] + df['section_titles'] + df['paragraphs']

In [32]:
tds = df.drop(columns = ['section_titles', 'paragraphs', 'combined_title'])

In [33]:
tds.head()

Unnamed: 0,log_claps,text
0,6.907755,"[7, recommended, skills, learn, 2021, data, sc..."
1,6.192362,"[ultimate, guide, acing, coding, interviews, d..."
2,4.934474,"[shakespeare, versus, eminem—, who’s, better, ..."
3,5.068904,"[customer, segmentation, online, retail, detai..."
4,4.890349,"[implementing, visualttransformer, pytorch, hi..."


#### Save Pre-Processed File

In [34]:
tds.to_csv(output_filename)