Code I wrote to prepare Twitter data for text mining, as part of an individual assignment for a graduate-level college course. The data was obtained using the Tweepy Python package. For academic integrity, the full assignment will not be uploaded.

In [1]:
import pandas as pd
import ast
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Location where the final pre-processed text will be written
output_file_path = 'tweets-cleaned.csv'

# Location of the raw tweet data that is about to be pre-processed
input_file_path = 'tweets.csv'


# Load the raw tweet data into a dataframe
df = pd.read_csv(filepath_or_buffer=input_file_path, encoding='utf-8')

# Interpret all strings with a 'b' in front as byte strings
# and then decode using utf-8
def decode_byte(string):
    """Turn the text form of a bytes literal into the text it encodes.

    A value such as ``"b'caf\\xc3\\xa9'"`` is parsed with
    ``ast.literal_eval`` (which only accepts literals and never executes
    code) and the resulting bytes are decoded as UTF-8.

    Values that are not the text form of a bytes literal are returned
    unchanged instead of raising, so the function can be applied to every
    cell of a dataframe: non-string cells (such as the NaN pandas uses for
    an empty CSV field), plain text, and literals of another type all pass
    through untouched.

    Parameters:
        string: the cell value to decode.

    Returns:
        The decoded text, or ``string`` itself when there is nothing to
        decode.

    Raises:
        UnicodeDecodeError: if the bytes literal is not valid UTF-8.
    """
    if not isinstance(string, str):
        # literal_eval only understands strings; leave other cells alone
        return string

    try:
        value = ast.literal_eval(string)
    except (ValueError, TypeError, SyntaxError):
        # Not a Python literal at all, so it is already plain text
        return string

    if isinstance(value, bytes):
        return value.decode('utf8')

    # A literal of some other type (e.g. "123"): keep the original text
    return string

# Decode every cell of the dataframe.
# DataFrame.applymap is deprecated in favour of DataFrame.map since
# pandas 2.1, so use the new name when this pandas version provides it
# and fall back to applymap on older versions.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(decode_byte)
else:
    df = df.applymap(decode_byte)


# Swap the literal text 'None' for Python's None (i.e. a null value)
def replace_none(string):
    """Return None when ``string`` equals the text 'None'.

    Any other value is handed back exactly as it was received.
    """
    return None if string == 'None' else string

# Turn every 'None' cell of the dataframe into a real null.
# DataFrame.applymap is deprecated in favour of DataFrame.map since
# pandas 2.1, so use the new name when this pandas version provides it
# and fall back to applymap on older versions.
if hasattr(pd.DataFrame, 'map'):
    df = df.map(replace_none)
else:
    df = df.applymap(replace_none)


# Create copy of 'Text' called 'Clean_Text' to work on
df['Clean_Text'] = df['Text']


# NOTE: the patterns below are regular expressions, so regex=True is
# passed explicitly to every Series.str.replace call. pandas 2.0 changed
# the default to regex=False, under which the patterns would be searched
# for as literal text and the tweets would silently stay uncleaned.

# Remove Twitter usernames tagged in tweet.
# Usernames are prepended with '@'
df['Clean_Text'] = df['Clean_Text'].str.replace(r'(@[A-Za-z0-9_]+)', '',
                                                regex=True)


# Remove all hyperlinks
df['Clean_Text'] = df['Clean_Text'].str.replace(r'(https?:/?/?\S+)',
                                                '', flags=re.IGNORECASE,
                                                regex=True)

# Replace ampersands special character combinations with 'and'
df['Clean_Text'] = df['Clean_Text'].str.replace(r'&amp;', 'and',
                                                regex=True)


# Remove all punctuation (anything that is neither a word character
# nor whitespace)
df['Clean_Text'] = df['Clean_Text'].str.replace(r'[^\w\s]', '',
                                                regex=True)


# Replace each run of newline characters with a single space
df['Clean_Text'] = df['Clean_Text'].str.replace(r'(\n)+', ' ',
                                                regex=True)


# Remove 'RT' from beginning of tweets if present. This runs before
# lowercasing, so only an upper-case 'RT' is matched.
df['Clean_Text'] = df['Clean_Text'].str.replace(r'(^RT\s)', ' ',
                                                regex=True)


# Remove leading and trailing whitespace
df['Clean_Text'] = df['Clean_Text'].str.strip()


# Make every letter lowercase
df['Clean_Text'] = df['Clean_Text'].str.lower()


# Remove all stop words.
# The English stop word list is built once and kept as a frozenset.
# Previously stopwords.words('english') was called again for every tweet
# and each word was looked up by scanning a list.
_STOP_WORDS = frozenset(stopwords.words('english'))


def filter_stop_words(string):
    """Return ``string`` with English stop words removed.

    The text is split on single spaces, every piece found in the NLTK
    English stop word list is dropped, and the remaining pieces are joined
    back together with single spaces. Pieces are compared exactly as
    given, so the text should already be lower-cased (the script does this
    before calling the function).

    Parameters:
        string: the text to filter.

    Returns:
        The filtered text. Empty pieces produced by consecutive spaces are
        kept, so the result may still contain repeated, leading or
        trailing spaces.
    """
    return ' '.join(w for w in string.split(' ') if w not in _STOP_WORDS)


df['Clean_Text'] = df['Clean_Text'].apply(filter_stop_words)


# Squeeze repeated spaces between words down to a single space
def only_one_space(string):
    """Collapse each run of consecutive spaces in ``string`` to one space.

    Only the space character is affected; other whitespace such as tabs is
    left as it is. A run at the start or end of the text is reduced to a
    single space, not removed.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', string)

# Stop word removal can leave a space at the very start or end of the text
# (for example 'cat  is' becomes 'cat ' once 'is' is dropped).
# only_one_space reduces such a run to one space but does not remove it,
# so strip afterwards. Otherwise the text yields empty tokens when it is
# split on ' ' and a single remaining word is counted as two.
df['Clean_Text'] = df['Clean_Text'].apply(only_one_space).str.strip()


# Remove any text that is only one word long (or empty).
# str.split() with no argument ignores leading, trailing and repeated
# whitespace, so empty pieces are never counted as words.
# .copy() makes the filtered dataframe independent of the original, so the
# column assignments that follow do not raise SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df[df['Clean_Text'].apply(lambda x: len(x.split()) > 1)].copy()


# Reduce every word to its stem using NLTK's Porter stemmer
ps = PorterStemmer()


def stem_words(string):
    """Stem each space-separated word of ``string``.

    The text is split on single spaces, every piece is passed through the
    module-level Porter stemmer ``ps``, and the stemmed pieces are joined
    back together with single spaces.
    """
    stemmed = map(ps.stem, string.split(' '))
    return ' '.join(stemmed)


df['Clean_Text'] = df['Clean_Text'].apply(stem_words)


# Save the pre-processed text to csv while it is still untokenized
df.loc[:, 'Clean_Text'].to_csv(output_file_path, index=False)


# Tokenize: turn each cleaned text into a list of its words
df['Clean_Text'] = df['Clean_Text'].str.split(pat=' ')