In [1]:
import pandas as pd
import numpy as np
from afinn import Afinn
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
# Load the Yelp training set; the path is relative to the notebook's working directory.
yelpdata = pd.read_csv("YelpData/Yelp_train.csv")

In [3]:
# Shared AFINN scorer instance; bestWords() calls afinn.score() to rank tokens.
afinn = Afinn()

def bestWords(list1, N):
    """Return the N tokens of ``list1`` with the largest absolute AFINN score.

    Tokens are ordered from the strongest to the weakest score magnitude;
    tokens of equal magnitude keep their original relative order. When ``N``
    is at least ``len(list1)`` the tokens are returned in their original
    order (no ranking is applied), which matches the historical behaviour of
    this helper.

    The input list is never mutated and each token is scored exactly once.

    Args:
        list1: list of token strings.
        N: maximum number of tokens to keep.

    Returns:
        A new list holding at most ``N`` tokens.
    """
    if N >= len(list1):
        return list(list1)
    if N <= 0:
        return []
    # sorted() is stable, so sorting on the negated magnitude keeps the
    # first-seen order among tokens whose scores tie.
    ranked = sorted(list1, key=lambda word: -abs(afinn.score(word)))
    return ranked[:N]

_ENGLISH_STOP_WORDS = None  # lazily-built cache, filled by _english_stop_words()


def _english_stop_words():
    """Return the English stop-word set, reading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def removeStopWords(sentence):
    """Tokenize ``sentence`` and drop every English stop word.

    The stop-word list is fetched once and kept as a set, rather than being
    re-read and linearly scanned for every token.

    Args:
        sentence: text to tokenize with ``word_tokenize``.

    Returns:
        List of the remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    return [word for word in word_tokenize(sentence) if word not in stop_words]

# Punctuation that is deleted outright (nothing is inserted in its place).
# Raw strings keep the regex escapes from being read as (invalid) string escapes.
REPLACE_NO_SPACE = re.compile(r"""[.;:!'?,"()\[\]]""")
# Sequences replaced by one space: doubled <br /> tags, hyphens and slashes.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def cleanData(review, k):
    """Normalise a raw review and reduce it to its ``k`` highest-scoring tokens.

    Steps, in order: trim surrounding whitespace, lowercase, delete
    punctuation, turn ``<br /><br />`` / ``-`` / ``/`` and newlines into
    spaces, remove stop words via ``removeStopWords``, then keep the tokens
    selected by ``bestWords``.

    Args:
        review: raw review text.
        k: number of tokens to keep, forwarded to ``bestWords``.

    Returns:
        The selected tokens joined by single spaces.
    """
    # trim surrounding whitespace
    review = review.strip()
    # lowercase, then delete punctuation
    review = REPLACE_NO_SPACE.sub("", review.lower())
    # separators become spaces so the words around them stay apart
    review = REPLACE_WITH_SPACE.sub(" ", review)
    review = review.replace("\n", " ")
    # tokenize and remove stop words
    tokens = removeStopWords(review)
    # keep the k tokens chosen by bestWords
    tokens = bestWords(tokens, k)
    return " ".join(tokens)

In [4]:
# Number of tokens cleanData() keeps per review.
n = 10
# Clean every review text, keeping at most n tokens each.
reviews = [cleanData(text, n) for text in yelpdata['text'].values.tolist()]

In [6]:
# Convert the cleaned review strings to a NumPy array and display its shape.
reviews = np.array(reviews)
reviews.shape

(36692,)

In [7]:
# Preview the first ten cleaned reviews.
reviews[:10]

array(['outstanding perfectly good better quality flawlessly fresh spectacular new years',
       'fabulous amazing glad delicious glad miss weve waiting 1847 stamm',
       'recommended mistake lousy visiting state place friday night fish fry',
       'great great good good good good better sampling menu beef',
       'charming delightfully kind odd hoping recommend lack true highlight sweet',
       'beautiful good loved best hoping problems worth improvement noisy unique',
       'winner good good good good good good awful best hopes',
       'excellent beautifully good best enjoyed creative tender improves food redone',
       'perfect worst delicious love liked hoping disappointing improved disappointed optimistic',
       'awesome awesome great good thank friendly big owners kept original'],
      dtype='<U108')

In [8]:
# Binary bag-of-words: each feature records whether a token occurs in a review.
cv = CountVectorizer(binary=True)
# fit_transform learns the vocabulary and encodes the corpus in one pass,
# instead of tokenising every review twice via fit() followed by transform().
# NOTE: .toarray() materialises the full dense matrix in memory.
X = cv.fit_transform(reviews).toarray()

In [10]:
# Show the dimensions of the feature matrix.
X.shape

(36692, 11950)

In [11]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back for older installations.
if hasattr(cv, "get_feature_names_out"):
    # get_feature_names_out() returns an array; keep new_cols a plain list
    new_cols = list(cv.get_feature_names_out())
else:
    new_cols = cv.get_feature_names()
print(new_cols[:10])
# One column per vocabulary token, labelled with the token itself.
new_train = pd.DataFrame(X, columns=new_cols)

['03', '04', '045', '05', '06', '07', '09', '10', '100', '1000']


In [12]:
# Display the bag-of-words feature frame.
new_train

Unnamed: 0,03,04,045,05,06,07,09,10,100,1000,...,zuzu,zuzus,zwang,日本に住んだ事ある方々でも満足すると思いますよ,日本人,日本人の方が経営していると思いますよ,普通にとっても美味しかったです,浅草,蔥油餅,黑店
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Print the shape of the token features, then of the original data, one per line.
print(new_train.shape, yelpdata.shape, sep="\n")

(36692, 11950)
(36692, 42)


In [20]:
# Drop the listed columns from the original data; every other column is kept.
x = yelpdata.drop(columns=["text", "name", "city", "categories", "date"])

In [21]:
# Place the remaining original columns and the token columns side by side.
# NOTE(review): token columns may share labels with original columns
# (e.g. 'useful', 'funny', 'cool'), yielding duplicate column names -- confirm.
train = pd.concat([x, new_train], axis="columns")

In [23]:
# Preview the first ten rows of the combined training frame.
train.head(10)

Unnamed: 0,y,useful,funny,cool,longitude,latitude,nchar,nword,sentiment_score,gem,...,zuzu,zuzus,zwang,日本に住んだ事ある方々でも満足すると思いますよ,日本人,日本人の方が経営していると思いますよ,普通にとっても美味しかったです,浅草,蔥油餅,黑店
0,5,1,0,0,-89.494781,43.104086,517,45,3.0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,1,0,0,-89.494781,43.104086,423,29,2.4,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,-89.494781,43.104086,441,32,0.0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,4,0,3,-89.494781,43.104086,815,48,,0,...,0,0,0,0,0,0,0,0,0,0
4,4,2,0,1,-89.494781,43.104086,1758,105,1.076923,0,...,0,0,0,0,0,0,0,0,0,0
5,3,0,1,0,-89.494781,43.104086,710,44,1.833333,0,...,0,0,0,0,0,0,0,0,0,0
6,3,0,0,0,-89.494781,43.104086,1486,81,-0.428571,0,...,0,0,0,0,0,0,0,0,0,0
7,5,0,0,0,-89.494781,43.104086,283,19,2.4,0,...,0,0,0,0,0,0,0,0,0,0
8,2,0,0,0,-89.494781,43.104086,1239,81,0.5,0,...,0,0,0,0,0,0,0,0,0,0
9,5,2,0,2,-89.494781,43.104086,800,43,3.333333,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Extract the 'y' column from the combined frame.
# NOTE(review): this cell's saved execution count (19) is lower than that of
# the cell that builds `train` (21) -- it was run out of order; confirm the
# notebook still works under Restart & Run All.
y = train['y']

In [24]:
import pickle

# Persist the assembled training frame. The context manager guarantees the
# file handle is flushed and closed, even if serialisation raises.
with open("train3.pkl", "wb") as output:
    pickle.dump(train, output)