# Data Cleaning - fullstat

Description: 
- fullstat is split into fullstat_labeled and fullstat_without
- cleaned fullstat_labeled and fullstat_without ready for text classification

## Import Libraries

### Main Libraries

In [1]:
import pandas as pd
import numpy as np
import re

### NLP Libraries

In [2]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Load Dataset (fullstat_labeled)

In [3]:
# Read the labeled posts from the tab-separated export.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_labeled = pd.read_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/fullstat_labeled.tsv',
    sep='\t',
)
# Preview the first rows
df_labeled.head()

Unnamed: 0.1,Unnamed: 0,type,by,post_id,post_link,post_message,picture,full_picture,link,link_domain,...,comments_base,comments_replies,comment_likes_count,rea_LOVE,rea_WOW,rea_HAHA,rea_SAD,rea_ANGRY,rea_THANKFUL,target
0,0,photo,post_page_155027942462,155027942462_10157348020627463,https://www.facebook.com/155027942462/posts/10...,Paying your bills on auto-pay ensures that bil...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t1.0-9/p720x72...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,...,0,0,0,0,0,0,0,0,0,App Update
1,1,photo,post_page_155027942462,155027942462_10157333387457463,https://www.facebook.com/155027942462/posts/10...,Raise your hand if you are excited for the fir...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t1.0-9/p720x72...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,...,0,0,0,4,1,0,0,1,0,Engagement
2,2,photo,post_page_155027942462,155027942462_10157330985232463,https://www.facebook.com/155027942462/posts/10...,Couples who save together stay forever 🥰 Here...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t1.0-9/p720x72...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,...,0,0,0,0,0,0,0,1,0,Engagement
3,3,photo,post_page_155027942462,155027942462_10157323881577463,https://www.facebook.com/155027942462/posts/10...,In case you’ve forgotten: Make saving a daily ...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t1.0-9/p720x72...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,...,0,0,0,0,0,0,0,1,0,Engagement
4,4,video,post_page_155027942462,155027942462_10157315990422463,https://www.facebook.com/155027942462/posts/10...,Father’s Day is approaching. We got smart ways...,https://external.xx.fbcdn.net/safe_image.php?d...,https://external.xx.fbcdn.net/safe_image.php?d...,https://go.greendot.com/blog/smart-fathers-day...,go.greendot.com,...,0,0,0,0,0,0,0,3,0,Engagement


## Data Pre-processing

In [4]:
# Columns to discard before the text is cleaned
drop_columns = [
    'Unnamed: 0',
    'post_id', 'post_link',
    'picture', 'full_picture',
    'link', 'link_domain',
    'post_published_unix', 'post_published_sql',
]
# NOTE(review): the preview of df_labeled also lists an 'Unnamed: 0.1' column,
# which is not in this list and therefore stays — confirm that is intended.
df_labeled = df_labeled.drop(columns=drop_columns)

In [5]:
# Split every post into a list of tokens with NLTK's word tokenizer.
# NOTE(review): word_tokenize is given each cell value as-is; a missing
# post_message (NaN) would presumably fail here — confirm the column has no nulls.
df_labeled['post_message'] = df_labeled['post_message'].apply(word_tokenize)

In [6]:
def remove_nonalpha(text):
    '''
    Return `text` with every character outside a-z / A-Z deleted.

    Digits, punctuation, whitespace, emoji and accented letters are all
    removed; a string with no ASCII letters becomes the empty string.
    '''
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub('', text)

In [7]:
# Apply remove_nonalpha to every token of every post; tokens made up only of
# non-letters become empty strings at this point.
df_labeled['post_message'] = df_labeled['post_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [8]:
# Keep only the non-empty tokens of each post
df_labeled['post_message'] = df_labeled['post_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [9]:
# Lowercase every token
df_labeled['post_message'] = df_labeled['post_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [10]:
# Remove stopwords
# The stopwords are held in a set: the membership test below runs once per
# token, and a set lookup is constant-time whereas a list is scanned linearly.
# The filtered result is the same as with the list.
stop_words = set(stopwords.words('english'))
df_labeled['post_message'] = df_labeled['post_message'].apply(
    lambda list_words: [word for word in list_words if word not in stop_words]
)

In [11]:
# Lemmatize every token so that inflected forms of a word map to one base form.
# lemmatize() is called without a POS tag, so the lemmatizer's default is used.
lem = WordNetLemmatizer()
df_labeled['post_message'] = df_labeled['post_message'].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)

## Save as fullstat_withLabels_cleaned.tsv

In [12]:
# Write the cleaned labeled frame as a tab-separated file.
# NOTE(review): post_message holds Python lists at this point, so the file will
# presumably contain their string form — confirm how downstream code reads it back.
df_labeled.to_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/fullstat_withLabels_cleaned.tsv',
    sep='\t',
)

----------

## Load Dataset (fullstat_without)

In [13]:
# Read the posts that have no label from the tab-separated export.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_without = pd.read_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/fullstat_without.tsv',
    sep='\t',
)
# Preview the first rows
df_without.head()

Unnamed: 0,post_id,type,by,post_link,post_message,picture,full_picture,link,link_domain,post_published,...,comments_retrieved,comments_base,comments_replies,comment_likes_count,rea_LOVE,rea_WOW,rea_HAHA,rea_SAD,rea_ANGRY,rea_THANKFUL
0,155027942462_10156111760852463,photo,post_page_155027942462,https://www.facebook.com/155027942462/posts/10...,😎 / @taylorlatimerofficial loves that you can ...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t31.0-8/p720x7...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,2018-02-14T22:46:01+0000,...,69,20,49,45,0,1,0,0,7,0
1,155027942462_10156293436177463,video,post_page_155027942462,https://www.facebook.com/GreenDot/videos/10156...,Here’s a little #MondayMotivation from us to y...,https://scontent.xx.fbcdn.net/v/t15.5256-10/p1...,https://scontent.xx.fbcdn.net/v/t15.5256-10/27...,https://www.facebook.com/GreenDot/videos/10156...,facebook.com,2018-04-16T16:00:34+0000,...,8,8,0,0,4,0,0,0,0,0
2,155027942462_10156639430782463,photo,post_page_155027942462,https://www.facebook.com/155027942462/posts/10...,How are you celebrating Labor Day? Tell us in ...,https://scontent.xx.fbcdn.net/v/t1.0-0/p130x13...,https://scontent.xx.fbcdn.net/v/t1.0-9/p720x72...,https://www.facebook.com/GreenDot/photos/a.440...,facebook.com,2018-09-03T17:23:05+0000,...,3,3,0,0,0,0,0,0,0,0
3,155027942462_10156454906152463,link,post_page_155027942462,https://www.facebook.com/155027942462/posts/10...,Budget. Personal Savings Plans. Prepaid Debit ...,https://external.xx.fbcdn.net/safe_image.php?d...,https://external.xx.fbcdn.net/safe_image.php?d...,https://go.greendot.com/blog/expert-tips-budge...,go.greendot.com,2018-06-21T16:30:02+0000,...,1,1,0,0,0,0,0,1,2,0
4,155027942462_10156386241517463,link,post_page_155027942462,https://www.facebook.com/155027942462/posts/10...,You can do this / Learn how to free yourself f...,https://external.xx.fbcdn.net/safe_image.php?d...,https://external.xx.fbcdn.net/safe_image.php?d...,https://go.greendot.com/blog/free-paycheck-pay...,go.greendot.com,2018-05-25T13:00:09+0000,...,4,4,0,2,0,1,1,0,1,0


## Data Pre-processing

In [14]:
# Columns to discard before the text is cleaned
drop_columns2 = [
    'post_id', 'post_link',
    'picture', 'full_picture',
    'link', 'link_domain',
    'post_published_unix', 'post_published_sql',
]
# NOTE(review): the preview of df_without lists an 'Unnamed: 0' column that is
# not dropped here, although drop_columns for the labeled frame includes it —
# confirm whether the difference is intended.
df_without = df_without.drop(columns=drop_columns2)

In [15]:
# Split every post into a list of tokens with NLTK's word tokenizer.
# NOTE(review): word_tokenize is given each cell value as-is; a missing
# post_message (NaN) would presumably fail here — confirm the column has no nulls.
df_without['post_message'] = df_without['post_message'].apply(word_tokenize)

In [16]:
# Apply remove_nonalpha to every token of every post; tokens made up only of
# non-letters become empty strings at this point.
df_without['post_message'] = df_without['post_message'].apply(
    lambda tokens: list(map(remove_nonalpha, tokens))
)

In [17]:
# Keep only the non-empty tokens of each post
df_without['post_message'] = df_without['post_message'].apply(
    lambda tokens: [token for token in tokens if token]
)

In [18]:
# Lowercase every token
df_without['post_message'] = df_without['post_message'].apply(
    lambda tokens: list(map(str.lower, tokens))
)

In [19]:
# Drop every token found in stop_words (built in the stopword cell of the labeled section)
df_without['post_message'] = df_without['post_message'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [20]:
# Lemmatize every token so that inflected forms of a word map to one base form.
# Reuses the `lem` lemmatizer created in the labeled section; lemmatize() is
# called without a POS tag, so the lemmatizer's default is used.
df_without['post_message'] = df_without['post_message'].apply(
    lambda tokens: list(map(lem.lemmatize, tokens))
)

## Save as fullstat_without_cleaned.tsv

In [21]:
# Write the cleaned frame without labels as a tab-separated file.
# NOTE(review): post_message holds Python lists at this point, so the file will
# presumably contain their string form — confirm how downstream code reads it back.
df_without.to_csv(
    'C:/Users/cherryb/Desktop/Personal Projects/Datasets/Telus - Fintech/cleaned/fullstat_without_cleaned.tsv',
    sep='\t',
)