In [1]:
import json
import pandas as pd

import string
from nltk.corpus import stopwords

import gensim
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.tokenize import TweetTokenizer, RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
# Stopwords
stopwords = list(set(gensim.parsing.preprocessing.STOPWORDS))  # here I use gensim stop words
add_stop = ['said', 'say', 'like', 'cnn', 'thi', 'try', 'menu', 'dish', 'special', 'fyi', 'good', 'nyc', 'think']
rmv_all = list(set(stopwords + add_stop))

In [4]:
# Contraction
c_dict = { "'s": " is", "'re": " are", "'ve": " have", "'ll": " will", 
          "ain't": "am not", "aren't": "are not", "can't": "cannot", 
          "couldn't": "could not", "didn't": "did not","doesn't": "does not",
          "don't": "do not", "hadn't": "had not", "hasn't": "has not","haven't": "have not","he'd": "he would",
          "how'd": "how did", "i'd": "I would", "i'm": "i am", "isn't": "is not", "it'd": "it had",
          "let's": "let us", "mayn't": "may not", "mightn't": "might not",
          "must've": "must have", "mustn't": "must not", "needn't": "need not", "oughtn't": "ought not",
          "she'd": "she would", "shouldn't": "should not", "that'd": "that would", "there'd": "there had", "they'd": "they would",  
          "wasn't": "was not", "weren't": "were not", "won't": "will not", "wouldn't": "would not", "y'all": "you all",
          "you'd": "you had",
}

# compile the contraction dictionary
c_re = re.compile('(%s)' % '|'.join(c_dict.keys()))
def expand_contractions(text, c_re = c_re):
    def replace(match):
        return c_dict[match.group(0)]
    return c_re.sub(replace, text)

In [5]:
# Text cleaning
def process_text(text):
    # remove some new line characters
    text = text.replace('\xa0', ' ').replace('\n', ' ')
    # convert to lower case
    text = text.lower()
    # expand contractions
    text = expand_contractions(text, c_re)
    # remove other characters and punctuations
    text = re.sub(r'[^A-Za-z]+', ' ', text)
    # remove stopwords, only remain words more than 2 characters
    text = ' '.join(word for word in text.split() if word not in rmv_all and len(word) > 2)
    return text

In [6]:
# Tokenization and Lemmatization
lemmatizer = WordNetLemmatizer()
def token_lemma(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    words = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(words)

### Read CSV

In [7]:
# 这部分读进去的文件，把路径修改成你们的file path
df = pd.read_csv('/Users/liqingran/Desktop/part1.csv')
df.head()

Unnamed: 0,name,review
0,Thursday Kitchen,I'll start with saying their outdoor seating i...
1,Thursday Kitchen,This spot did not disappoint! These tapas were...
2,Thursday Kitchen,This NYC gem was just a magical birthday celeb...
3,Thursday Kitchen,Did my extensive research on yelp and have alw...
4,Thursday Kitchen,"I've been wanting to try this place, but every..."


In [8]:
df.tail()

Unnamed: 0,name,review
26532,Aunt Jake's,The best Italian food I had outside of Italy!\...
26533,Aunt Jake's,This place was absolutely amazing. My friends ...
26534,Aunt Jake's,My daughter and stopped in for lunch while in ...
26535,Aunt Jake's,Enjoyed lasagna and chicken parm. Sauces was g...
26536,Aunt Jake's,I came here for the first time because I heard...


### Text Cleaning and Manipulation

In [9]:
df['cleaned_review'] = df['review'].apply(process_text)
df['cleaned_review'] = df['cleaned_review'].apply(token_lemma)
df.head()

Unnamed: 0,name,review,cleaned_review
0,Thursday Kitchen,I'll start with saying their outdoor seating i...,start saying outdoor seating true accomplishme...
1,Thursday Kitchen,This spot did not disappoint! These tapas were...,spot disappoint tapa right friend dish packed ...
2,Thursday Kitchen,This NYC gem was just a magical birthday celeb...,gem magical birthday celebration evening opene...
3,Thursday Kitchen,Did my extensive research on yelp and have alw...,extensive research yelp wanted place review gl...
4,Thursday Kitchen,"I've been wanting to try this place, but every...",wanting place time wait crazy long finally col...


In [10]:
def find_length(text):
    length = len(text.split())
    return length

合并：把同一个餐厅的所有review汇总起来，一条餐厅对应一条record。

In [11]:
grouping = pd.DataFrame(df.groupby('name')['cleaned_review'].sum()).reset_index()
grouping['length'] = grouping['cleaned_review'].apply(find_length)
grouping

Unnamed: 0,name,cleaned_review,length
0,12 Chairs,beautiful place sun start shine macdougal outd...,3633
1,12 Chairs Cafe,stopped late mid week brunch super busy got se...,4014
2,1803,friend booked reservation spot dinner got hung...,4653
3,3 Times 茅庐,time time excellent dumpling pre order frozen ...,8276
4,5ive Spice,place super amazing customer service notch foo...,8263
...,...,...,...
308,Yin Ji Chang Fen Rice Roll,coming tried cheung fun place past day yin com...,6327
309,Zyara NY,zyara hidden gem walking le min monday morning...,3312
310,hanon,japan born friend wife thought place favorite ...,5898
311,nonono,nonono cute trendy date night gem opened indoo...,5714


保存文件。

In [13]:
grouping.to_csv('cleaned_file_part1.csv', index=False)