# NLP Testing

In [115]:
import numpy as np
import re
import pandas as pd


In [116]:
dat = pd.read_csv("../datafiles/twitter_sentiment_data.csv")
dat.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


Here's a plan:

- Basics: tokenization, corpuses
- Lemmatization
- Stopwords
- Other cleaning (URLs, etc)

- Embeddings
  - Onehot
  - Word2vec

- Sentiments
- Clustering
- ML model (RF or keras)

## Text processing

In [117]:
tweet = dat.message[1]
print(tweet)

RT @NatGeoChannel: Watch #BeforeTheFlood right here, as @LeoDiCaprio travels the world to tackle climate change https://t.co/LkDehj3tNn httÃ¢â‚¬Â¦


### Cleaning

In [118]:
import re

tweet = re.sub(r"https?://\S+", "", tweet)  # Replace links with '#url'
print(tweet)

RT @NatGeoChannel: Watch #BeforeTheFlood right here, as @LeoDiCaprio travels the world to tackle climate change  httÃ¢â‚¬Â¦


In [119]:
tweet = re.sub(r"RT ", "", tweet)  # Remove RT tag
print(tweet)

@NatGeoChannel: Watch #BeforeTheFlood right here, as @LeoDiCaprio travels the world to tackle climate change  httÃ¢â‚¬Â¦


In [120]:
tweet = re.sub(r"@\w+", "", tweet) ## Remove @users
print(tweet)

: Watch #BeforeTheFlood right here, as  travels the world to tackle climate change  httÃ¢â‚¬Â¦


In [121]:
tweet = re.sub(r"#\S+", "",tweet)
print(tweet)

: Watch  right here, as  travels the world to tackle climate change  httÃ¢â‚¬Â¦


### Tokenization

In [144]:
from nltk.tokenize import TweetTokenizer

tokens = TweetTokenizer().tokenize(tweet)
print(tokens)

[':', 'Watch', 'right', 'here', ',', 'as', 'travels', 'the', 'world', 'to', 'tackle', 'climate', 'change', 'httÃ', '¢', 'â', '‚', '¬', 'Â', '¦']


### Remove junk

In [145]:
nltk.download('words')

[nltk_data] Downloading package words to /Users/u0784726/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [148]:
from nltk.corpus import words

nltk.download('words')

vocab_temp =  ['hello world'] * 100
keep_words = words.words() 

[x for x in vocab_temp if x in keep_words]
[x for x in vocab_temp if x in tokens]

[nltk_data] Downloading package words to /Users/u0784726/nltk_data...
[nltk_data]   Package words is already up-to-date!


NameError: name 'x' is not defined

In [153]:
for token in tokens:
    if token.lower() in keep_words:
        print(token)



Watch
right
here
as
the
world
to
tackle
climate
change


In [154]:
keep_words

['A',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'Aani',
 'aardvark',
 'aardwolf',
 'Aaron',
 'Aaronic',
 'Aaronical',
 'Aaronite',
 'Aaronitic',
 'Aaru',
 'Ab',
 'aba',
 'Ababdeh',
 'Ababua',
 'abac',
 'abaca',
 'abacate',
 'abacay',
 'abacinate',
 'abacination',
 'abaciscus',
 'abacist',
 'aback',
 'abactinal',
 'abactinally',
 'abaction',
 'abactor',
 'abaculus',
 'abacus',
 'Abadite',
 'abaff',
 'abaft',
 'abaisance',
 'abaiser',
 'abaissed',
 'abalienate',
 'abalienation',
 'abalone',
 'Abama',
 'abampere',
 'abandon',
 'abandonable',
 'abandoned',
 'abandonedly',
 'abandonee',
 'abandoner',
 'abandonment',
 'Abanic',
 'Abantes',
 'abaptiston',
 'Abarambo',
 'Abaris',
 'abarthrosis',
 'abarticular',
 'abarticulation',
 'abas',
 'abase',
 'abased',
 'abasedly',
 'abasedness',
 'abasement',
 'abaser',
 'Abasgi',
 'abash',
 'abashed',
 'abashedly',
 'abashedness',
 'abashless',
 'abashlessly',
 'abashment',
 'abasia',
 'abasic',
 'abask',
 'Abassin',
 'abastardize',
 'abatable',
 'abate

### Stemming

In [123]:
## First time - downloads dictionary
# import nltk
# nltk.download("punkt")

In [124]:
from nltk.stem import PorterStemmer
# Initialize Python porter stemmer
ps = PorterStemmer()
# Example inflections to reduce
example_words = ["program","programming","programer","programs","programmed"]

In [125]:
for word in example_words:
    print(f"{word}: {ps.stem(word)}")

program: program
programming: program
programer: program
programs: program
programmed: program


In [143]:
for word in tokens:
    print(f"{word}: {ps.stem(word)}")

:: :
Watch: watch
right: right
here: here
,: ,
as: as
travels: travel
the: the
world: world
to: to
tackle: tackl
climate: climat
change: chang
httÃ: httã
¢: ¢
â: â
‚: ‚
¬: ¬
Â: â
¦: ¦


### Lemmatization

Will need to download `en_core_web_sm`. ???

In [126]:
import spacy
nlp = spacy.load('en_core_web_sm')

s = "I saw two mice today!"
print(s)
print(" ".join([token.lemma_ for token in nlp(s)]))

I saw two mice today!
I see two mouse today !


In [127]:
tweet2 = [token.lemma_ for token in nlp(tweet)]
print(tweet2)

[':', 'watch', ' ', 'right', 'here', ',', 'as', ' ', 'travel', 'the', 'world', 'to', 'tackle', 'climate', 'change', ' ', 'httÃ¢â‚¬Â', '¦']


In [128]:
tweet

': Watch  right here, as  travels the world to tackle climate change  httÃ¢â‚¬Â¦'

### Stopwords

In [131]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/u0784726/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [135]:
from nltk.corpus import stopwords
stop = set(stopwords.words("english"))

In [136]:
# Remove stop words
tokens_wo_stopwords = [t for t in tokens if t not in stop]
print("Text without stop words:", " ".join(tokens_wo_stopwords))

Text without stop words: : Watch right , travels world tackle climate change httÃ ¢ â ‚ ¬ Â ¦


In [139]:
from nltk.corpus import stopwords,words

nltk.download('words')

vocab_temp =  ['hello world'] * 100
keep_words = words.words() 

[x for x in vocab_temp if x in keep_words]

[nltk_data] Downloading package words to /Users/u0784726/nltk_data...
[nltk_data]   Package words is already up-to-date!


NameError: name 'x' is not defined

In [142]:
for word in tokens:
    print(word)
    if word in keep_words:
        print(word)



:
Watch
right
right
here
here
,
as
as
travels
the
the
world
world
to
to
tackle
tackle
climate
climate
change
change
httÃ
¢
â
‚
¬
Â
¦


In [18]:
### Tokenization
from nltk.tokenize import TweetTokenizer

s = "This system combines #solar with #wind turbines. #ActOnClimate now. #Capitalism #climate #economics"
tokens = TweetTokenizer().tokenize(dat.message[3])
print(tokens)

['RT', '@Mick_Fanning', ':', 'Just', 'watched', 'this', 'amazing', 'documentary', 'by', 'leonardodicaprio', 'on', 'climate', 'change', '.', 'We', 'all', 'think', 'thisÃ', '¢', 'â', '‚', '¬', 'Â', '¦', 'https://t.co/kNSTE8K8im']


In [None]:
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize

from nltk.corpus import stopwords
stop = set(stopwords.words("english"))

import spacy
nlp = spacy.load('en_core_web_sm')