In [1]:
import pandas as pd
import numpy as np

In [2]:
# The CSV has no header row. With pandas' default header inference the first
# record is consumed as column labels and one tweet is silently dropped.
# header=None keeps every row as data; columns get integer labels and are
# selected by position further down.
data = pd.read_csv('Sentiment140-dataset.csv', encoding="ISO-8859-1", header=None)

In [3]:
# Label the text column 'tweets'. The column is selected by position (the last
# column, the same one used as X below) instead of by a hard-coded copy of its
# current label, which silently does nothing if the label differs by even one
# character. rename() returns a new frame, so re-running this cell is harmless.
data = data.rename(columns={data.columns[-1]: 'tweets'})

In [4]:
# Flag every row that has at least one missing value, then print how many there are.
rows_with_missing = data.isnull().any(axis=1)
print(rows_with_missing.sum())

0


In [5]:
# Positional split: the last column is the text that gets cleaned below (X),
# the first column is kept as y (presumably the sentiment label - confirm).
X, y = data.iloc[:, -1], data.iloc[:, 0]

In [6]:
# Regex clean-up, applied strictly in this order. The catch-all rule must run
# last: it would otherwise delete '@' first and leave the mention names behind.
noise_patterns = (
    r'http\S+|https\S+',  # URLs
    r'@\w+',              # mentions (@user)
    r'#',                 # hashtag marker (the tag word itself is kept)
    r'[^a-zA-Z\s]',       # anything that is not a letter or whitespace
)
for pattern in noise_patterns:
    X = X.replace(pattern, '', regex=True)

In [7]:
X = X.str.lower() # lowercase every tweet so tokens compare case-insensitively (e.g. against the lowercase stop-word list)

In [8]:
from nltk.tokenize import word_tokenize

# Turn each cleaned tweet string into a list of word tokens.
X_tokenized = X.map(word_tokenize)

In [13]:
from nltk.corpus import stopwords

# A set gives constant-time membership checks for the per-token filter.
stop_words = set(stopwords.words('english'))


def _without_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, keeping their order."""
    return [token for token in tokens if token not in stop_words]


X_filtered = X_tokenized.apply(_without_stopwords)

In [14]:
from nltk.stem import PorterStemmer

# One shared stemmer instance; every token of every tweet is stemmed.
stemmer = PorterStemmer()
X_stemmed = X_filtered.apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [15]:
import nltk

# Fetch the NLTK resources for the lemmatisation step: the WordNet data, its
# multilingual add-on, and the tagger model (presumably what nltk.pos_tag loads).
for resource in ('wordnet', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Single lemmatizer instance reused for every token.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /home/slavka/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/slavka/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/slavka/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [16]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatising ``word``.

    The word is tagged on its own with ``nltk.pos_tag``, which returns a list
    of ``(word, tag)`` pairs such as ``('running', 'VBG')``. Only the first
    letter of the tag is looked at: J -> adjective, N -> noun, V -> verb,
    R -> adverb. Any other tag falls back to noun.
    """
    _, raw_tag = nltk.pos_tag([word])[0]
    initial = raw_tag[0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return pos_by_initial.get(initial, wordnet.NOUN)

In [18]:
# Lemmatise token by token, using each token's POS so verbs/adjectives reduce
# correctly. The input must be the token lists, not the cleaned strings in X:
# iterating a string yields single characters, so lemmatising X would process
# letters instead of words. X_filtered is the same input the stemming step uses.
X_lemmatized = X_filtered.apply(
    lambda tokens: [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in tokens]
)