In [8]:
# Primary Libraries
import pandas as pd
import numpy as np
import warnings
import re

# text processing libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK corpora this notebook relies on: the stop word list plus the
# WordNet data (and its multilingual companion) needed by WordNetLemmatizer.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# English stop words held in a set so membership checks are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/sameep/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sameep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/sameep/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Notebook-wide configuration.

# Column names applied to the header-less CSV files when they are loaded.
COLUMNS = [
    'id',
    'company',
    'sentiment',
    'tweet',
]

# Single-letter colour codes (presumably for plots; not used in the cells shown here).
COLORS = [
    'r',
    'g',
    'b',
    'y',
]

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
# Read both splits. header=None makes pandas treat the first line as data, so
# the column labels come from COLUMNS; the "id" column is discarded right away.
X_train = pd.read_csv(
    "./Dataset/twitter_training.csv",
    header=None,
    names=COLUMNS,
).drop(columns='id')

X_test = pd.read_csv(
    "./Dataset/twitter_validation.csv",
    header=None,
    names=COLUMNS,
).drop(columns='id')

# Preview the first training rows.
X_train.head()

Unnamed: 0,company,sentiment,tweet
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [4]:
# Drop every training row that has a missing value in any column.
X_train.dropna(inplace = True)

# Row count after the drop; the cleaning loops in later cells iterate up to it.
NUM_RECORDS = X_train.shape[0]

# Kept as the cell's LAST expression so the notebook actually renders the
# per-column null counts (a mid-cell expression is evaluated and discarded).
# Every count should be zero after the dropna above.
X_train.isnull().sum()

In [5]:
# Remove user mentions and hashtag markers.
#
# Handles may contain underscores, so "_" is part of the mention pattern;
# without it "@user_name" would leave "_name" behind as a junk token.
# Only the "#" character is removed, the hashtag's word itself is kept.
mention_pattern = re.compile(r'@[A-Za-z0-9_]+')

# Iterate over the column itself rather than range(NUM_RECORDS) so the cell
# does not depend on a row count computed in a different cell.
cleaned_tweets = [
    mention_pattern.sub('', tweet).replace('#', '')
    for tweet in X_train['tweet']
]

# A plain list is assigned positionally, row by row; this avoids building a
# fixed-width NumPy string array sized to the longest tweet.
X_train['cleaned_tweets'] = cleaned_tweets

# NOTE: the raw column is removed here, so this cell can only be run once per
# data load (a second run would fail looking up 'tweet').
X_train.drop('tweet', axis = 1, inplace = True)
X_train.head()

Unnamed: 0,company,sentiment,cleaned_tweets
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [6]:
# Text conversion: keep only ASCII letters and spaces, then lower-case.
#
# Line breaks, tabs and other whitespace are first turned into a single space.
# Deleting them outright (as the letters-only filter would) fuses the words on
# either side, e.g. "good\nbad" -> "goodbad".
whitespace_pattern = re.compile(r'\s+')

# Everything that is not a-z, A-Z or a space is deleted: digits, punctuation,
# emoji and any non-ASCII characters.
non_letter_pattern = re.compile(r'[^a-z A-Z]')

# Iterate over the column itself rather than range(NUM_RECORDS) so the cell
# does not depend on a row count computed in a different cell.
cleaned_tweets = [
    non_letter_pattern.sub('', whitespace_pattern.sub(' ', tweet)).lower()
    for tweet in X_train['cleaned_tweets']
]

# A plain list is assigned positionally, row by row.
X_train['cleaned_tweets'] = cleaned_tweets
X_train.head()

Unnamed: 0,company,sentiment,cleaned_tweets
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,i am coming to the borders and i will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you all
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands and i will murder y...


In [9]:
# Stop-word removal followed by stemming and lemmatization of every tweet.
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
cleaned_tweets = []

for row in range(NUM_RECORDS):
    # whitespace tokenization of the already lower-cased, letters-only text
    tokens = X_train['cleaned_tweets'].iloc[row].split()

    # discard stop words
    kept_tokens = [token for token in tokens if token.lower() not in stop_words]

    # Porter-stem each remaining token, then lemmatize the resulting stem
    normalized_tokens = [lemmatizer.lemmatize(ps.stem(token)) for token in kept_tokens]

    # stitch the tokens back into one space-separated string
    cleaned_tweets.append(" ".join(normalized_tokens).strip())

X_train['cleaned_tweets'] = np.array(cleaned_tweets)
X_train.head()

Unnamed: 0,company,sentiment,cleaned_tweets
0,Borderlands,Positive,im get borderland murder
1,Borderlands,Positive,come border kill
2,Borderlands,Positive,im get borderland kill
3,Borderlands,Positive,im come borderland murder
4,Borderlands,Positive,im get borderland murder
