In [211]:
import pandas as pd   
from bs4 import BeautifulSoup
import nltk
#nltk.download()
from nltk.corpus import stopwords
import re
import json
from collections import Counter


In [212]:
def balance_classes(xs, ys):

    freqs = Counter(ys)

    # the least common class is the maximum number we want for all classes
    max_allowable = freqs.most_common()[-1][1]
    num_added = {clss: 0 for clss in freqs.keys()}
    new_ys = []
    new_xs = []
    for i, y in enumerate(ys):
        if (num_added[y] < max_allowable):
            new_ys.append(y)
            new_xs.append(xs[i])
            num_added[y] += 1
    return new_xs, new_ys

In [213]:
def review_to_words( raw_review ):
    # Function to convert a raw review to a string of words
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review,"lxml").get_text() 
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                             
    #
    # 4. In Python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    stops = set(stopwords.words("english"))                  
    # 
    # 5. Remove stop words
    meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join( meaningful_words ))

In [214]:
def checkEnglishDict(clean_review,english_vocab):
    #english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    meaningful_words = [w for w in clean_review.split() if w in english_vocab]  
    return( " ".join( meaningful_words ))

In [215]:
def unusual_words(text):
    text_vocab = set(w.lower() for w in text.split() if w.isalpha())
    english_vocab = set(w.lower() for w in nltk.corpus.words.words())
    unusual = text_vocab - english_vocab
    return sorted(unusual)

In [216]:
# read the data from disk and split into lines
# we use .strip() to remove the final (empty) line
with open("xaa.json",encoding="utf8") as f:
    reviews = f.read().strip().split("\n")
reviews = [json.loads(review) for review in reviews] 
texts = [review['text'] for review in reviews]
stars = [review['stars'] for review in reviews]

In [217]:
#removing stopwords, special characters
print ("Cleaning and parsing the texts set ...\n")
clean_reviews = []
num_reviews=len(texts)
for i in range( 0, num_reviews ):
    # If the index is evenly divisible by 1000, print a message
    if( (i+1)%5000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews ))                                                                    
    clean_reviews.append( review_to_words( texts[i]))

Cleaning and parsing the texts set ...

Review 5000 of 20000

Review 10000 of 20000

Review 15000 of 20000

Review 20000 of 20000



In [218]:
#Pre-processing for english words
meaningful_clean_reviews=[]
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
for i in range(0,num_reviews):
    if( (i+1)%5000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews ))                                                                    
    meaningful_clean_reviews.append(checkEnglishDict(clean_reviews[i],english_vocab))

Review 5000 of 20000

Review 10000 of 20000

Review 15000 of 20000

Review 20000 of 20000



In [219]:
i=3
print(texts[i])
print("\n***")
print(clean_reviews[i])
print("\n***")
print(meaningful_clean_reviews[i])

Location is everything and this hotel has it! The reception is inviting and open 24 hours. They are very helpful and have a lot of patience answering all my questions about where to go etc. there is also a lounge open 24 hours with snack-type food. Breakfast is continental-style so if you want heartier fare look elsewhere though you don't have to go far. The bus and train stations are right across the street so it's easy access to the airport or anywhere else you may want to go. Turn uphill to old town or cross the bridge to new town. The room with a view i got was spacious and comfortable though it's a bit of a maze to find it-just follow the signs. The windows are double paned so the room is quiet plus i was on the 5th floor which helps. It's a bit pricey but still one of the best values i found!

***
location everything hotel reception inviting open hours helpful lot patience answering questions go etc also lounge open hours snack type food breakfast continental style want heartier 

In [220]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 20000) 

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
data_features = vectorizer.fit_transform(meaningful_clean_reviews)

data_features = data_features.toarray()
print("Created")

# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()

# Sum up the counts of each vocabulary word
dist = np.sum(data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
threshold_count=2
low_freq_words=[]
#for tag, count in zip(vocab, dist):
low_freq_words=[tag  for tag, count in zip(vocab, dist) if(count<threshold_count)]
    #print (count, tag)

Creating the bag of words...

Created


In [226]:
#removes low frequency words from the reviews
num_reviews=len(meaningful_clean_reviews)
for i in range(0,num_reviews):
    if( (i+1)%5000 == 0 ):
        print ("Review %d of %d\n" % ( i+1, num_reviews ))                                                                    
    words = meaningful_clean_reviews[i].split()
    new_words = [w for w in words if not w in low_freq_words]
    meaningful_clean_reviews[i]=" ".join( meaningful_words )

Review 5000 of 20001

Review 10000 of 20001

Review 15000 of 20001

Review 20000 of 20001



In [223]:
print(Counter(stars))
balanced_x, balanced_y = balance_classes(texts, stars)
print(Counter(balanced_y))

from sklearn.feature_extraction.text import TfidfVectorizer
 
# This vectorizer breaks text into single words and bi-grams
# and then calculates the TF-IDF representation
vectorizer = TfidfVectorizer(ngram_range=(1,2))

 
# the 'fit' builds up the vocabulary from all the reviews
# while the 'transform' step turns each indivdual text into
# a matrix of numbers.
vectors = vectorizer.fit_transform(balanced_x)


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vectors, balanced_y, test_size=0.33, random_state=42)

from sklearn.svm import LinearSVC
 
# initialise the SVM classifier
classifier = LinearSVC()
 
# train the classifier
classifier.fit(X_train, y_train)

preds = classifier.predict(X_test)
print(list(preds[:10]))
print(y_test[:10])

from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, preds))

#import pip
#pip.main(['install','pyenchant'])

import enchant


d = enchant.Dict("en_US")

from enchant.tokenize import get_tokenizer, EmailFilter

tknzr = get_tokenizer("en_US")
[w for w in tknzr("send an email to fake@example.com please")] #[('send', 0), ('an', 5), ('email', 8), ('to', 14), ('fake@example.com', 17), ('please', 34)]
tknzr = get_tokenizer("en_US",[EmailFilter])
[w for w in tknzr("send an email to fake@example.com please")]
#[('send', 0), ('an', 5), ('email', 8), ('to', 14), ('please', 34)]

tknzr

import enchant
import enchant.checker
from enchant.checker.CmdLineChecker import CmdLineChecker
chkr = enchant.checker.SpellChecker("en_US")
chkr.set_text("this is sme example txt")
cmdln = CmdLineChecker()
cmdln.set_checker(chkr)
cmdln.run()

text_file = open(“SlangSD.text”, “r”) 
#print file.read() 

text_file= open("SlangSD.txt","r", encoding= "utf8")
lines = text_file.read()#.split(',')

type(lines)

data = pd.read_csv('SlangSD.txt', sep="\t", header=None,error_bad_lines=False, encoding= "utf8")
#data.columns = ["a", "b", "c", "etc."]

data.iloc[1]

SyntaxError: invalid character in identifier (<ipython-input-223-2641a131b79d>, line 63)

In [None]:
#import pip
#pip.main(['install','stemming'])


from stemming.porter2 import stem

#documents = [[stem(word) for word in sentence.split(" ")] for sentence in documents]

#def stemming(meaningful_clean_review):
 #   ps = PorterStemmer()

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

print(meaningful_clean_reviews[3])
aa=[]
aa=[stem(w) for w in meaningful_clean_reviews[3].split()]
#    print(ps.stem(w))
bb=" ".join(aa)
bb