# Import dependencies

In [3]:
import pandas as pd
import numpy as np
import json
import io
import datetime as dt
import string
import unicodedata

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

import spacy
nlp = spacy.load('en_core', parse=True, tag=True, entity=True)

import re
from bs4 import BeautifulSoup


from gensim import corpora, models, similarities

#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, #entity=True)

# Read in data

In [4]:
#create dataframe from input data; identify the column with the tweet text
df=pd.read_csv('tweets.csv')
print('Number of observations are: '+str(len(df)))

Number of observations are: 14640


In [5]:
#remove any rows that has no tweet text
df=df.text.dropna()
df = df.reset_index(drop=True)
print('Number of observations are: '+str(len(df)))

Number of observations are: 14640


In [6]:
#Create dictionary of all tweets
tweet_dictionary = {}
i = 0
for line in df:
        tweet_dictionary[i] = line.lower()
        i += 1
print(tweet_dictionary[1])

@virginamerica plus you've added commercials to the experience... tacky.


# Data Preprocessing I

Remove HTML Links

In [7]:
def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

In [8]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=strip_links(tweet_dictionary[i])

Remove mentions

In [9]:
def strip_mentions(text):
    entity_prefixes = ['@']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [10]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=strip_mentions(tweet_dictionary[i])

Remove hashtags

In [11]:
def strip_hashtags(text):
    entity_prefixes = ['#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

In [12]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=strip_hashtags(tweet_dictionary[i])

Remove retweet (RT) designation

In [13]:
for i in range(0,len(df)):
    tweet_dictionary[i] = tweet_dictionary[i].replace('RT', '')

In [14]:
def strip_all_entities(text):
    entity_prefixes = ['@','#']
    for separator in  string.punctuation:
        if separator not in entity_prefixes :
            text = text.replace(separator,' ')
    words = []
    for word in text.split():
        word = word.strip()
        if word:
            if word[0] not in entity_prefixes:
                words.append(word)
    return ' '.join(words)

# Data Preprocessing II

Remove spcial or accented characters (and maybe numbers)

In [15]:
def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [16]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=remove_special_characters(tweet_dictionary[i], 
                          remove_digits=True)

Stopwords

In [17]:
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [18]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=remove_stopwords(tweet_dictionary[i])

Stemming / Lemming

In [19]:
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [62]:
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [63]:
for i in range(0,len(tweet_dictionary)):
    tweet_dictionary[i]=lemmatize_text(tweet_dictionary[i])

Tokenization

In [20]:
#Create corupus of all words
words_corpus = []
for i in range(0,len(tweet_dictionary)):
        words_corpus.append(tweet_dictionary[i].lower().split())
print(words_corpus)



In [21]:
dictionary = corpora.Dictionary(words_corpus)
print(dictionary)
corpus = [dictionary.doc2bow(text) for text in words_corpus]

Dictionary(11153 unique tokens: ['said', 'added', 'commercials', 'experience', 'plus']...)


# Data Analysis

Part of Speech Tagging

In [23]:
sentence = 'London is the capital and most populous city of England and the United Kingdom'

In [24]:
sentence_nlp = nlp(sentence)

# POS tagging with Spacy 
spacy_pos_tagged = [(word, word.tag_, word.pos_) for word in sentence_nlp]
pd.DataFrame(spacy_pos_tagged, columns=['Word', 'POS tag', 'Tag type'])

# POS tagging with nltk
#nltk_pos_tagged = nltk.pos_tag(sentence.split())
#pd.DataFrame(nltk_pos_tagged, columns=['Word', 'POS tag'])

Unnamed: 0,Word,POS tag,Tag type
0,London,NNP,PROPN
1,is,VBZ,VERB
2,the,DT,DET
3,capital,NN,NOUN
4,and,CC,CCONJ
5,most,RBS,ADV
6,populous,JJ,ADJ
7,city,NN,NOUN
8,of,IN,ADP
9,England,NNP,PROPN


Dependency Graph

In [27]:
from spacy import displacy
displacy.render(sentence_nlp, jupyter=True, 
                options={'distance': 110,
                         'arrow_stroke': 2,
                         'arrow_width': 8})

Named Entity Recognition

In [26]:
# print named entities in article
print([(word, word.ent_type_) for word in sentence_nlp if word.ent_type_])

# visualize named entities
displacy.render(sentence_nlp, style='ent', jupyter=True)

[(London, 'GPE'), (England, 'GPE'), (the, 'GPE'), (United, 'GPE'), (Kingdom, 'GPE')]


Similarity

In [104]:
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
query = "apple banana strawberry"
vec_bow = dictionary.doc2bow(query.lower().split())
vec_lsi = lsi[vec_bow]
print(vec_lsi)
index = similarities.MatrixSimilarity(lsi[corpus])
sims = index[vec_lsi]

[(0, 0.0004018640463104192), (1, -5.492274233986752e-05)]


Sentiment

In [105]:
#find sentiment vader
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [106]:
snt = analyser.polarity_scores('This is an examle of a happy tweet')
print(snt['compound'])
      

0.5719
