In [1]:
import pandas as pd
import numpy as np
import re
import emoji

# Cleaning Data

In [2]:
data = pd.read_csv('./data/english_only.csv').drop(columns='Unnamed: 0')
data.head()

Unnamed: 0,text,label
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj
1,@Hispanthicckk Being you makes you look cute||...,intj
2,"I'm like entp but idiotic|||Hey boy, do you wa...",intj
3,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj
4,Frances Farmer Will Have Her Revenge On Seattl...,intj


In [3]:
text_before_cleaning = data['text'].iloc[0][:300]

In [4]:
def clean_text(tweets):
    
    # remove characters separating tweets from each other
    tweets = tweets.replace('|||', '')

    # remove emojis
    tweets = emoji.replace_emoji(tweets, replace='')
    
    # remove links 
    tweets = re.sub(r'http\S+|www\S+|https\S+', '', tweets, flags=re.MULTILINE)
    
    # make lowercase
    tweets = tweets.lower()

    # remove twitter handles
    tweets = re.sub(r'@\w+', '', tweets)

    # remove extra whitespace
    tweets = re.sub(r'\s+', ' ', tweets).strip()
    
    return tweets

In [5]:
text_after_cleaning = clean_text(text_before_cleaning)

In [6]:
print(f'Text before cleaning:\n{text_before_cleaning}')
print('\n\n')
print(f'Text after cleaning:\n{text_after_cleaning}')

Text before cleaning:
@Pericles216 @HierBeforeTheAC @Sachinettiyil The Pope is infallible, this is a catholic dogma 

It doesn’t mean the… https://t.co/qmt0ezk0Ey|||@HierBeforeTheAC @Pericles216 @Sachinettiyil And by perpetually it entails that the church herself can elect new po… https://t.co/OGTxKfUDHQ|||@HierBeforeThe



Text after cleaning:
the pope is infallible, this is a catholic dogma it doesn’t mean the… and by perpetually it entails that the church herself can elect new po…


In [7]:
data['cleaned text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,text,label,cleaned text
0,@Pericles216 @HierBeforeTheAC @Sachinettiyil T...,intj,"the pope is infallible, this is a catholic dog..."
1,@Hispanthicckk Being you makes you look cute||...,intj,"being you makes you look cute on, because then..."
2,"I'm like entp but idiotic|||Hey boy, do you wa...",intj,"i'm like entp but idiotichey boy, do you want ..."
3,@kaeshurr1 Give it to @ZargarShanif ... He has...,intj,give it to ... he has pica since childhood say...
4,Frances Farmer Will Have Her Revenge On Seattl...,intj,frances farmer will have her revenge on seattl...


In [8]:
data_cleaned = data.get(['cleaned text', 'label'])
data_cleaned.head()

Unnamed: 0,cleaned text,label
0,"the pope is infallible, this is a catholic dog...",intj
1,"being you makes you look cute on, because then...",intj
2,"i'm like entp but idiotichey boy, do you want ...",intj
3,give it to ... he has pica since childhood say...,intj
4,frances farmer will have her revenge on seattl...,intj


In [9]:
data_cleaned.to_csv('./data/cleaned.csv')