# Imports

In [None]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

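## uncomment and run the downloads below if this is your first
## time running (NLTK >= 3.9 looks for the resources named
## 'punkt_tab' and 'averaged_perceptron_tagger_eng' instead)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')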

## sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## specify to display the result of every expression in a cell
## and not just the last one (IPython's default is "last_expr")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
## spacy --- if you get an error at the import / load step
## need to download en_core_web_sm - try running this in terminal:
## python -m spacy download en_core_web_sm
import spacy
import en_core_web_sm

## load the small English pipeline a single time; `sp` and `nlp`
## are two names for the same pipeline object (loading the same
## model twice only costs extra time and memory)
nlp = en_core_web_sm.load()
sp = nlp

# Load data 

In [None]:
## if working from within the repo, can use this relative path
## (the data is a zip archive that is passed straight to pandas --
## presumably it holds a single csv file, TODO confirm)
path_todata = "../../public_data/airbnb_text.zip"

## load data into a dataframe
ab = pd.read_csv(path_todata)
## preview the first rows, then summarize the columns
ab.head()
ab.info()

# 1. Text mining

## Manual approach 1: look for a single word

Here, we're going to create a variable `is_cozy` that indicates whether the listing contains the word "cozy".

Then, we're going to summarize the percentage of listings that contain that word by borough -- `neighbourhood_group`

## Manual approach 2: score based on dictionary of words

In [None]:
## construct dictionary: each key is a size category and each
## value is the list of (upper-case) words that signal it
space_indicators = {}
space_indicators['small'] = ['COZY', 'COMFY', 'LITTLE', 'SMALL']
space_indicators['large'] = ['SPACIOUS', 'LARGE', 'HUGE', 'GIANT']

## look at the words for one category
space_indicators['small']




# 2. Part of speech tagging

Tag the likely part of speech of each token in a sentence.

In [None]:
## specify example sentence to tag; it mixes lower-case words
## with capitalized tokens (LES, Chinatown)
example_for_tag = "This is a chill apt next to the subway in LES Chinatown"
example_for_tag

In [None]:
## try part of speech tagging using nltk


In [None]:
## use a list comprehension to extract proper nouns (NNP):
## for each (word, tag) tuple, first check if the second
## element in the tuple (the tag) is equal to NNP;
## if so, return the first element in the tuple (the
## actual word)


# 3. Named Entity Recognition



In [None]:
## modified from a real tweet

## tweet text (multi-line string); it mentions a company (AOL),
## a year (2000) and a dollar amount ($50)
m_tweet = """When AOL was founded in the year 2000 I remember raising my hand
and asking 'well...if you're using my data - where is my $50' 
and no one had a good answer...and there still isn't one.
"""

In [None]:
## ner (named entity recognition) for one tweet

In [None]:
## ner (named entity recognition) for multiple strings

# 4. Sentiment analysis

### Using the default scorer on a few example phrases

In [None]:
## initialize a scorer (an instance of the
## SentimentIntensityAnalyzer class imported from vaderSentiment)
sent_obj = SentimentIntensityAnalyzer()
## check what type of object the scorer is
print(type(sent_obj))
## score one listing: this is the text to score
practice_listing = "NICE AND COZY LITTLE APT AVAILABLE"


In [None]:
## same listing text as before, plus a second sentence
## containing the word "terrible"; to be scored as well
practice_listing_2 = "NICE AND COZY LITTLE APT AVAILABLE. REALLY TERRIBLE VIEW."



In [None]:
## same listing text as before, plus a sentence about rats;
## that is bad news for a renter, but the word might not be
## in the scoring dictionary
practice_listing_3 = "NICE AND COZY LITTLE APT AVAILABLE. HAS RATS THOUGH."


In [None]:
## summarize all 3: print each listing next to its score
## (expects sentiment_example, sentiment_example_2 and
## sentiment_example_3 to have been created in earlier cells)
print(f"String: {practice_listing} scored as:\n{sentiment_example!s}")
print(f"String: {practice_listing_2} scored as:\n{sentiment_example_2!s}")
print(f"String: {practice_listing_3} scored as:\n{sentiment_example_3!s}")


### Updating the dictionary with manually-added words

In [None]:
## check the type of the scorer's lexicon attribute
print(type(sent_obj.lexicon))

In [None]:
## lexicon is a dictionary where the key
## is the word and the value is that word's
## score (negative score = negative sentiment)
## here, i'm looking up the score of the word "aversion"
## to use as a benchmark for how negative the
## rodent / pest words should be
sent_obj.lexicon['aversion']

In [None]:
## create a dictionary that maps each pest word
## to the same negative score
pest_words = {
    pest: -1.9
    for pest in ('rat', 'rats', 'mice', 'mouse', 'roach', 'cockroach')
}


## initialize a new sentiment object
## so that we don't alter the old one;
## use .update() on its lexicon to add the new words
