# Cleaning Data Scraped From TowardsDataScience.com

In [1]:
import pandas as pd
import numpy as np
import os

import warnings
warnings.filterwarnings('ignore')

#### Pull In Scraped Data

In [2]:
# List every entry in the raw-data directory. os.listdir returns names in
# arbitrary order, so sort them to keep the row order of the combined
# DataFrame reproducible from run to run.
raw_files = sorted(os.listdir('data/raw/'))

In [3]:
# '.ipynb_checkpoints' is not a scraped data file; drop it from the listing
# so it is not handed to the CSV reader along with the real files.
try:
    raw_files.remove('.ipynb_checkpoints')
except ValueError:
    # list.remove raises ValueError when the entry is absent - nothing to do.
    pass

#### Clean Scraped Data

In [4]:
# a function to convert K claps to an integer
def replace_K(string_in):
    """Convert a clap count such as ``'1.2K'`` or ``'87'`` to an integer.

    Parameters
    ----------
    string_in : str or int
        The clap count. A ``K``/``k`` marks thousands (``'1.2K'`` -> 1200).
        Values that are already integers (e.g. when pandas parsed an
        all-numeric column as ints) are accepted and returned as ``int``.

    Returns
    -------
    int
        The clap count as a plain integer.

    Raises
    ------
    ValueError
        If the value cannot be interpreted as a number.
    """
    # str() lets already-numeric values through instead of failing on the
    # missing string methods; upper() makes the 'k' suffix case-insensitive.
    text = str(string_in).strip().upper()
    if 'K' in text:
        # round() rather than int(): the float product can fall just below
        # the true value (1.001 * 1000 == 1000.9999999999999) and int()
        # would truncate it to the wrong count.
        return round(float(text.replace('K', '')) * 1000)
    return int(text)

#### Convert to DataFrame

In [5]:
def process_raw(file):
    """Load one tab-separated raw file and return it with integer claps.

    Parameters
    ----------
    file : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        The file contents with the ``claps`` column passed through
        ``replace_K`` element by element.
    """
    raw = pd.read_csv(file, sep='\t')
    # Replace the textual clap counts ("1.2K") with their integer values.
    return raw.assign(claps=raw['claps'].apply(replace_K))


In [6]:
# Build one cleaned DataFrame per raw file found in the data directory.
df_list = [
    process_raw(f'./data/raw/{raw_name}')
    for raw_name in raw_files
]

#### Concatenate Files Into One Large DataFrame

In [7]:
# Stack the per-file frames row-wise and renumber the index from 0.
df = pd.concat(objs=df_list, axis=0, ignore_index=True)

#### Save Cleaned File

In [8]:
# Destination for the full cleaned data set (outliers still included).
output_filename = "data/cleaned_with_outliers.csv"

In [9]:
# Write the cleaned data to disk without the DataFrame index column.
df.to_csv(path_or_buf=output_filename, index=False)

#### Create Log Claps

In [10]:
# Prior to removing the outliers, add a log_claps column (natural log of
# claps) so a version of the df that uses log_claps as the target can be saved.
# NOTE(review): np.log returns -inf for a value of 0, so any row with zero
# claps would get log_claps == -inf - confirm the data has no such rows.

df['log_claps'] = np.log(df['claps'])

In [11]:
# Save the version of the data that carries the log_claps column.
output_filename = "data/cleaned_with_log_claps.csv"
df.to_csv(path_or_buf=output_filename, index=False)

In [12]:
# Remove the log_claps column again (in place) before the outlier filtering.
del df['log_claps']

#### Remove Outliers

In [13]:
# Numeric columns that are screened for outliers.
num_columns = [
    'number_paragraphs',
    'reading_time (mins)',
    'number_sections',
    'claps',
]

In [14]:
# Keep only rows at or below mean + 3 * std for each numeric column in turn.
# Each cutoff is computed on the frame as already filtered by the columns
# processed before it, so the result depends on the order of num_columns.
for col in num_columns:
    values = df[col]
    cutoff = values.mean() + 3 * values.std()
    df = df[values <= cutoff]

In [15]:
# Save the outlier-free version of the data, again without the index column.
output_filename = "data/cleaned_no_outliers.csv"
df.to_csv(path_or_buf=output_filename, index=False)