In [1]:
import pandas as pd
import numpy as np

In [2]:
# The CSV has no header row: with the default header handling pandas promotes the
# first record to column labels (the first tweet's text becomes a column name) and
# that record is lost from the data. Read every line as data instead.
data = pd.read_csv('Sentiment140-dataset.csv', encoding="ISO-8859-1", header=None)
# Give the text column (the last one, which is what the feature split selects) a
# readable name; the remaining columns keep their positional integer labels.
data = data.rename(columns={data.columns[-1]: 'tweets'})

In [3]:
# If the loaded frame carries this tweet's text as a column label, swap the
# unwieldy label for a short descriptive one. A label that is not present is
# silently ignored by rename().
first_tweet_label = "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D"
data.rename(columns={first_tweet_label: 'tweets'}, inplace=True)

In [4]:
# Flag every row holding at least one missing value, then report how many there are.
rows_with_nulls = data.isnull().any(axis=1)
print(np.sum(rows_with_nulls))

0


In [5]:
# Positional split of the frame: the first column becomes the labels (y) and the
# last column becomes the text that the following cells clean (X).
y = data.iloc[:, 0]
X = data.iloc[:, -1]

In [7]:
import os 
import pickle

# Every intermediate artefact produced by this notebook is written below this folder.
directory = 'processed-data/'

# exist_ok=True creates the folder when it is missing and is a no-op when it is
# already there, so the cell can be re-run safely and there is no gap between
# "check" and "create" for another process to slip into.
os.makedirs(directory, exist_ok=True)

# Persist the labels to the processed-data directory for future use.
y.to_pickle(os.path.join(directory, 'y_labels.pkl'))

In [8]:
# Strip tweet noise in four ordered passes; every pattern deletes whatever it matches.
X = (
    X.replace(r'http\S+|https\S+', '', regex=True)  # URLs
     .replace(r'@\w+', '', regex=True)              # mentions (@user)
     .replace(r'#', '', regex=True)                 # the '#' sign only; the tag word stays
     .replace(r'[^a-zA-Z\s]', '', regex=True)       # everything except letters and whitespace
)

In [9]:
# Lowercase every tweet so that tokens differing only in case compare equal
# in the later stop-word, stemming and lemmatization steps.
X = X.str.lower()

In [10]:
from nltk.tokenize import word_tokenize
# Break each cleaned tweet into a list of word tokens and cache the result on disk.
tokenized_path = os.path.join(directory, 'X_tokenized.pkl')
X_tokenized = X.apply(word_tokenize)
X_tokenized.to_pickle(tokenized_path)

In [11]:
from nltk.corpus import stopwords
# A set gives constant-time membership tests for the per-token lookup below.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens of one tweet with English stop words removed, order preserved."""
    return [token for token in tokens if token not in stop_words]


X_filtered = X_tokenized.apply(_without_stop_words)
X_filtered.to_pickle(os.path.join(directory, 'X_filtered.pkl'))

In [12]:
from nltk.stem import PorterStemmer
# One shared Porter stemmer instance is reused for every token.
stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the stemmed form of every token in one tweet's token list."""
    return [stemmer.stem(token) for token in tokens]


X_stemmed = X_filtered.apply(_stem_tokens)
X_stemmed.to_pickle(os.path.join(directory, 'X_stemmed.pkl'))

In [13]:
import nltk 
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Single WordNet lemmatizer instance, created once here and reused by the
# lemmatization cell further down.
lemmatizer = WordNetLemmatizer()

In [14]:
from functools import lru_cache


# Map a word's part of speech (POS) to the constant that WordNetLemmatizer accepts.
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own (no sentence context), so the result depends
    only on the word itself. That makes it safe to memoize: each distinct word
    is tagged once and repeated occurrences are served from the cache, which
    grows with the number of distinct words seen.

    Parameters
    ----------
    word : str
        The token to classify.

    Returns
    -------
    One of wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV; wordnet.NOUN
    is the fallback for any tag outside the mapping.
    """
    # pos_tag returns a list of (word, tag) tuples, e.g. [('running', 'VBG')];
    # [0][1][0] takes the first letter of that tag, so 'VBG' becomes 'V'.
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}  # mapping
    return tag_dict.get(tag, wordnet.NOUN)

In [15]:
# Lemmatize token lists, not raw strings. X holds one plain string per tweet, so
# iterating over its elements would walk through individual characters and
# "lemmatize" single letters. X_filtered holds the stop-word-free token lists,
# the same input the stemming step uses.
X_lemmatized = X_filtered.apply(
    lambda tokens: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
)
X_lemmatized.to_pickle(os.path.join(directory, 'X_lemmatized.pkl'))