# Notebook for topic modeling (partial)

# Imports

In [45]:
## load packages 
import pandas as pd
import re
import numpy as np

## nltk imports
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


# Load data

In [7]:
## read the zipped listings CSV into a DataFrame
ab = pd.read_csv(filepath_or_buffer="../public_data/airbnb_text.zip")

## preview the first five rows
ab.head(n=5)

Unnamed: 0,id,name,name_upper,neighbourhood_group,price
0,2539,Clean & quiet apt home by the park,CLEAN & QUIET APT HOME BY THE PARK,Brooklyn,149
1,2595,Skylit Midtown Castle,SKYLIT MIDTOWN CASTLE,Manhattan,225
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,150
3,3831,Cozy Entire Floor of Brownstone,COZY ENTIRE FLOOR OF BROWNSTONE,Brooklyn,89
4,5022,Entire Apt: Spacious Studio/Loft by central park,ENTIRE APT: SPACIOUS STUDIO/LOFT BY CENTRAL PARK,Manhattan,80


# Preprocess prior to creating a document-term matrix

## Step 1- load stopwords and augment with our own custom ones

In [12]:
## NLTK's built-in English stopword list
list_stopwords = stopwords.words("english")

## corpus-specific terms to drop; some entries are multi-word phrases
custom_words_toadd = ['apartment', 'new york', 'nyc',
                      'bronx', 'brooklyn',
                      'manhattan', 'queens',
                      'staten island']

## the filtering step compares one token at a time against this list, so a
## phrase such as 'new york' can never equal a single token. Also add each
## phrase's component words ('new', 'york', 'staten', 'island') so the
## phrases are actually removed. Trade-off: those words are then dropped
## wherever they occur, not only inside the phrase.
custom_tokens_toadd = [token
                       for phrase in custom_words_toadd
                       for token in phrase.split()
                       if token not in custom_words_toadd]

## original entries are kept, so this list is a superset of the previous one
list_stopwords_new = list_stopwords + custom_words_toadd + custom_tokens_toadd


## Step 2- remove stopwords from lowercase version of corpus


In [42]:
## convert to lowercase and a list; missing names become empty strings so
## every element is a str the tokenizer can accept (without this, a missing
## name stays NaN and tokenizing it fails when looping over the full corpus)
corpus_lower = ab["name"].fillna("").str.lower().to_list()
corpus_lower[0:5]

## tokenize one example listing with wordpunct_tokenize and keep only the
## tokens that are not in the augmented stopword list
example_listing = corpus_lower[3]
nostop_listing = [word for word in wordpunct_tokenize(example_listing)
                  if word not in list_stopwords_new]
nostop_listing

['clean & quiet apt home by the park',
 'skylit midtown castle',
 'the village of harlem....new york !',
 'cozy entire floor of brownstone',
 'entire apt: spacious studio/loft by central park']

['cozy', 'entire', 'floor', 'brownstone']

## Step 3- stem and remove non-alpha

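Here we keep only purely alphabetic tokens that are longer than two characters, and then stem them. In other contexts, we may want to leave digits in.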

In [43]:
## create the Porter stemmer once so it can be reused
porter = PorterStemmer()

## for a single listing: keep tokens that are purely alphabetic and longer
## than two characters, then stem each surviving token
example_listing_preprocess = list(
    map(porter.stem,
        filter(lambda tok: tok.isalpha() and len(tok) > 2,
               nostop_listing)))

example_listing_preprocess

['cozi', 'entir', 'floor', 'brownston']

In [32]:
## compare the raw example listing with its preprocessed (stemmed) tokens
## NOTE(review): the saved output for this cell shows example_listing as a
## list of tokens, but the cell that assigns it (example_listing =
## corpus_lower[3]) makes it a string, and this cell's execution count (32)
## is lower than that cell's (42) -- the output looks stale; re-run the
## notebook top to bottom to confirm
example_listing
example_listing_preprocess

['cozy', 'entire', 'floor', 'of', 'brownstone']

['cozi', 'entir', 'floor', 'brownston']

# Activity

- Embed steps two and three into one or two functions
- Apply the function to all the texts in `corpus_lower`