# Data Cleaning - comments.tab

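Description: Cleans the `comment_message` text of the comments dataset. Each comment is tokenized, stripped of non-letter characters, lowercased, filtered for English stopwords, and lemmatized; the result is saved as `comments_cleaned.tsv`.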

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
import re

### NLP Libraries

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Load Dataset

In [3]:
# Read the tab-separated comments export into a DataFrame
df = pd.read_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/comments_no_null.tsv',
    sep='\t',
)
# Preview the first three rows to confirm the columns loaded as expected
df.head(3)

Unnamed: 0.1,Unnamed: 0,is_reply,comment_message,comment_published,comment_like_count,attachment_type
0,0,1,Why?,2019-06-18T00:08:26+0000,0,
1,1,0,The worst card ever it’s lawsuit time,2019-06-04T19:53:28+0000,6,
2,2,0,I think some needs to hear this. Over the year...,2019-05-14T03:28:34+0000,0,


## Data Pre-processing

In [4]:
# Drop unnecessary columns.
# Every 'Unnamed: ...' column is a leftover index written by an earlier save;
# the preview of the loaded frame shows two of them ('Unnamed: 0.1' and
# 'Unnamed: 0'). Selecting them by prefix removes both, and also keeps this
# cell safe to re-run: a hard-coded label raises KeyError once it is gone.
drop_columns = [col for col in df.columns if str(col).startswith('Unnamed:')]
df = df.drop(columns=drop_columns)

In [5]:
# Split each comment string into a list of word tokens
df['comment_message'] = df['comment_message'].apply(word_tokenize)

In [6]:
def remove_nonalpha(text):
    '''
    Delete every character that is not an ASCII letter (a-z, A-Z).

    Digits, punctuation, whitespace and non-ASCII letters are all removed,
    so the result may be an empty string.

    Parameters:
        text: the string to clean.

    Returns:
        A new string containing only the ASCII letters of `text`, in their
        original order and case.
    '''
    non_letters = re.compile('[^a-zA-Z]')
    letters_only = non_letters.sub('', text)
    return letters_only

In [7]:
# Strip non-letter characters from every token of every comment.
# Tokens made up only of digits/punctuation become empty strings here.
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [8]:
# Discard tokens that are empty (falsy), keeping the rest in order
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [9]:
# Lowercase every token so later comparisons are case-insensitive
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [10]:
# Remove stopwords.
# `stop_words` is kept as the same sorted list as before; the membership test
# uses a frozenset built from it, so each lookup is a hash probe rather than a
# scan of the whole list for every token of every comment.
# NOTE(review): tokens have already had apostrophes stripped by the non-alpha
# step, so any stopword entry that contains an apostrophe cannot match here;
# confirm whether that is intended.
stop_words = sorted(stopwords.words('english'))
stop_word_lookup = frozenset(stop_words)
df['comment_message'] = df['comment_message'].apply(
    lambda list_words: [word for word in list_words if word not in stop_word_lookup]
)

In [11]:
# Lemmatize: replace each token with its WordNet base form so that inflected
# forms of the same word collapse to one token. Token order is unchanged.
# NOTE(review): lemmatize() is called without a part-of-speech argument, so it
# uses the library default; confirm that default suits this text.
lem = WordNetLemmatizer()
df['comment_message'] = df['comment_message'].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)

## Save as comments_cleaned.tsv

In [12]:
# index=False: do not write the DataFrame index as an extra unnamed column.
# Without it, the index comes back as an 'Unnamed: 0' column the next time the
# file is read, which then has to be dropped again.
# NOTE: 'comment_message' holds Python lists at this point, so each cell is
# written as the list's string form and must be parsed back when reloaded.
df.to_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/comments_cleaned.tsv',
    sep='\t',
    index=False,
)