# Data Cleaning - comments.tsv

Description:
- Cleans the raw comments dataset so that it is ready for sentiment analysis with VaderSentiment and TextBlob

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
import re

## Load Dataset

In [2]:
# Read the raw, tab-separated comments export into a DataFrame
raw_comments_path = 'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/comments.txt'
df = pd.read_csv(raw_comments_path, sep='\t')

# Preview the first three rows to sanity-check the load
df.head(3)

Unnamed: 0,position,post_id,post_by,post_text,post_published,comment_id,comment_by,is_reply,comment_message,comment_published,comment_like_count,attachment_type,attachment_url
0,20_0,155027942462_10157280442467463,f0a137b6d4a4b4e94cc04bea6ac43d203c12d679,Let us help keep you in control of your money ...,2019-05-31T13:42:05+0000,10157280442467463_10157324996792463,da39a3ee5e6b4b0d3255bfef95601890afd80709,1,Why?,2019-06-18T00:08:26+0000,0,,
1,20_1,155027942462_10157280442467463,f0a137b6d4a4b4e94cc04bea6ac43d203c12d679,Let us help keep you in control of your money ...,2019-05-31T13:42:05+0000,10157280442467463_10157291740937463,da39a3ee5e6b4b0d3255bfef95601890afd80709,0,The worst card ever it’s lawsuit time,2019-06-04T19:53:28+0000,6,,
2,32_0,155027942462_10157236349992463,f0a137b6d4a4b4e94cc04bea6ac43d203c12d679,Rise & Grind ☀️ / Plan today for your success ...,2019-05-13T15:41:07+0000,10157236349992463_10157237648947463,da39a3ee5e6b4b0d3255bfef95601890afd80709,0,I think some needs to hear this. Over the year...,2019-05-14T03:28:34+0000,0,,


## Data Pre-processing

In [3]:
# Columns treated as unnecessary for the cleaned dataset
drop_columns = [
    'position',
    'post_id',
    'post_by',
    'post_text',
    'post_published',
    'comment_id',
    'attachment_url',
]

# Remove them by name; the result replaces the working frame
df = df.drop(columns=drop_columns)

In [4]:
# Count the missing values in each remaining column
null_counts = df.isnull().sum()
null_counts

comment_by                0
is_reply                  0
comment_message         216
comment_published         0
comment_like_count        0
attachment_type       21520
dtype: int64

In [5]:
# Index labels of the comments whose author is recorded as 'pageowner'.
# NOTE(review): the rows previewed after loading show hash-like comment_by
# values; confirm that page-owner comments really carry the literal label
# 'pageowner'.
pageowner_rows = df.loc[df['comment_by'] == 'pageowner'].index

# In one pass: remove the page-owner comments, discard the author column,
# and drop every row that has no comment text.
df = (
    df.drop(pageowner_rows)
      .drop(columns=['comment_by'])
      .dropna(subset=['comment_message'])
)

In [6]:
# URL matcher (http://, https://, www., or "domain.tld/" prefixes), compiled
# once when this cell runs instead of being handed to re.sub on every call.
_URL_PATTERN = re.compile(r'''(?i)\b((?:http[s]?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')


def delete_url(text, replacement='None'):
    '''
    Replace every URL-like substring of `text` with `replacement`.

    This function does not delete rows. It only rewrites the string:
    a comment that consists solely of a URL comes back as exactly
    `replacement`, while a comment that merely contains a URL keeps its
    other words with `replacement` standing in for the URL.

    Parameters
    ----------
    text : str
        The comment text to scan. Missing values must be removed before
        calling, because the regular expression only accepts strings.
    replacement : str, default 'None'
        Text substituted for each URL. It is inserted literally, so
        backslashes in it are not interpreted as group references.

    Returns
    -------
    str
        `text` with each URL-like match (http, https, www, or a
        "domain.tld/" prefix) replaced.
    '''
    # A callable replacement makes re insert the string verbatim.
    return _URL_PATTERN.sub(lambda _match: replacement, text)

In [7]:
# Replace URLs inside every comment with the 'None' placeholder
cleaned_messages = df['comment_message'].apply(delete_url)
df['comment_message'] = cleaned_messages

# Drop the comments that are now exactly 'None', i.e. nothing but a URL.
# NOTE: comments that only contain a URL among other words are kept, and
# the placeholder text 'None' remains inside their message.
url_only_rows = df.loc[cleaned_messages == 'None'].index
df = df.drop(url_only_rows)
df.head()

Unnamed: 0,is_reply,comment_message,comment_published,comment_like_count,attachment_type
0,1,Why?,2019-06-18T00:08:26+0000,0,
1,0,The worst card ever it’s lawsuit time,2019-06-04T19:53:28+0000,6,
2,0,I think some needs to hear this. Over the year...,2019-05-14T03:28:34+0000,0,
3,0,"How long does it take to get refunded money, P...",2019-04-20T20:44:07+0000,1,
10,0,REALLY? Recipes From Heaven,2019-03-15T22:37:37+0000,0,


## Save as commentsCleaned_VSent-TBlob.tsv

In [8]:
# Write the cleaned comments to a tab-separated file.
# The DataFrame index is written too, as the first (unnamed) column.
cleaned_comments_path = 'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/commentsCleaned_VSent-TBlob.tsv'
df.to_csv(cleaned_comments_path, sep='\t')