# Towards Data Science Blog Posts: Data Cleaning
### *Cleaning Data Scraped From TowardsDataScience.com*

In [34]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

#### Pull In Scraped Data

In [35]:
# os.listdir returns names in arbitrary order; sort them so the raw files are
# processed (and later concatenated) in the same order on every run.
raw_files = sorted(os.listdir('../data/raw/'))

In [36]:
# Drop the notebook checkpoint entry from the listing, if present, so it is
# not handed to the CSV reader. An explicit membership test replaces the bare
# `except`, which would also have hidden unrelated errors.
if '.ipynb_checkpoints' in raw_files:
    raw_files.remove('.ipynb_checkpoints')

#### Clean Scraped Data

In [37]:
def replace_K(string_in):
    """Convert a clap count such as ``'1.2K'`` or ``'489'`` to an integer.

    Parameters
    ----------
    string_in : str or number
        Clap count as scraped. A string containing ``'K'`` is read as
        thousands. Values that are already numeric (e.g. when the column was
        parsed with an integer dtype) are accepted as well.

    Returns
    -------
    int
        The clap count as a plain integer.

    Raises
    ------
    ValueError
        If a string cannot be parsed as a number, or the value is NaN.
    """
    # Already-numeric input has no string methods; convert it directly.
    if not isinstance(string_in, str):
        return int(string_in)
    if 'K' in string_in:
        # round() rather than truncation, so a product that lands just below
        # the intended integer in floating point is not cut down by one.
        return round(float(string_in.replace('K', '')) * 1000)
    return int(string_in)

#### Convert to DataFrame

In [38]:
def process_raw(file):
    """Read one raw scrape file and return it with integer clap counts.

    Parameters
    ----------
    file : str
        Path to a tab-separated raw file containing a ``claps`` column.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``claps`` converted by ``replace_K``.
    """
    # Force `claps` to be read as text: left to dtype inference, a file in
    # which no count carries a 'K' suffix comes back as integers, and the
    # string handling in replace_K (`.count`, `.replace`) fails on those.
    df = pd.read_csv(file, sep='\t', dtype={'claps': str})
    # convert K claps to integer
    df['claps'] = df['claps'].apply(replace_K)
    return df


In [39]:
# Load and clean every raw file, keeping one DataFrame per file.
df_list = [
    process_raw(f'../data/raw/{raw_name}')
    for raw_name in raw_files
]

#### Concatenate Files Into One Large DataFrame

In [40]:
# Stack the per-file frames and renumber the rows 0..n-1.
df = pd.concat(objs=df_list, ignore_index=True)

In [41]:
# Sanity check: (rows, columns) of the combined DataFrame.
df.shape

(9804, 12)

#### Save Cleaned File

In [32]:
# Destination path for the combined, cleaned data.
output_filename = '../data/tds_cleaned.csv'

In [33]:
# Write the combined frame as CSV, leaving out the row index.
df.to_csv(path_or_buf=output_filename, index=False)

In [52]:
# Preview the first ten rows of one raw file, with the two URL columns
# dropped from the display.
(
    pd.read_csv('../data/raw/2021_04.csv', sep='\t')
    .drop(columns=['author_url', 'story_url'])
    .head(10)
)

Unnamed: 0,date,title,subtitle,claps,responses,reading_time (mins),number_sections,section_titles,number_paragraphs,paragraphs
0,01/01/2021,7 Most Recommended Skills to Learn in 2021 to ...,Recommended by some of the largest…,1K,10,6,11,['7 Most Recommended Skills to Learn in 2021 t...,36,"['Terence Shin', 'Jan 1·6 min read', 'Happy Ne..."
1,01/01/2021,The Ultimate Guide to Acing Coding Interviews ...,Data Science Interview,489,4,11,12,['The Ultimate Guide to Acing Coding Interview...,42,"['Emma Ding', 'Jan 1·11 min read', 'Written by..."
2,01/01/2021,Shakespeare versus Eminem— who’s the better ly...,"He is known for his poetry, his writings on life…",139,2,9,13,['Shakespeare versus Eminem—who’s the better l...,64,"['Jeroen van Zeeland', 'Jan 1·9 min read', 'Da..."
3,01/01/2021,Customer Segmentation in Online Retail,A detailed step-by-step explanation on perform...,159,1,19,15,"['Customer Segmentation in Online Retail', 'Un...",93,"['Rahul Khandelwal', 'Jan 1·19 min read', 'In ..."
4,01/01/2021,Implementing VisualTtransformer in PyTorch,"Hi guys, happy new year! Today we are going to...",133,2,6,6,['Implementing Vision Transformer (ViT) in PyT...,60,"['Francesco Zuppichini', 'Jan 1·6 min read', '..."
5,01/01/2021,Stock Price Analysis with Pandas and Altair,Practical guide for Pandas and Altair,92,0 responses,5,1,['Stock Price Analysis with Pandas and Altair'],29,"['Soner Yıldırım', 'Jan 1·5 min read', 'Stock ..."
6,01/01/2021,Optimal Threshold for Imbalanced Classification,Hands-on Tutorial,58,1,9,1,['Optimal Threshold for Imbalanced Classificat...,40,"['Audhi Aprilliant', 'Jan 1·9 min read', 'Clas..."
7,01/01/2021,Creating Abstract Art with StyleGAN2 ADA,How I used Adaptive Discriminator Augmentation...,107,2,11,9,"['Creating Abstract Art with StyleGAN2 ADA', '...",52,"['Robert A. Gonsalves', 'Jan 1·11 min read', ""..."
8,01/01/2021,Natural Language Generation Part 2: GPT2 and H...,Learn to use Huggingface and GPT-2 to train a…,24,0 responses,8,1,['Natural Language Generation Part 2: GPT2 and...,26,"['George Dittmar', 'Jan 1·8 min read', 'So it’..."
9,01/01/2021,Underspecification: The Dangerously Underdiscu...,-,155,2,7,1,['Underspecification: The Dangerously Underdis...,35,"['Andre Ye', 'Jan 1·7 min read', 'All machine ..."
