In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer
import sklearn

In [2]:
yelpdata = pd.read_csv("YelpData/Yelp_train.csv")
print(yelpdata.columns)

Index(['y', 'name', 'text', 'date', 'useful', 'funny', 'cool', 'city',
       'longitude', 'latitude', 'categories', 'nchar', 'nword',
       'sentiment_score', 'gem', 'incredible', 'perfection', 'phenomenal',
       'divine', 'die', 'highly', 'superb', 'heaven', 'amazing', 'favorites',
       'sourced', 'perfect', 'knowledgeable', 'gross', 'poorly', 'response',
       'flavorless', 'waste', 'terrible', 'tasteless', 'rude', 'awful',
       'inedible', 'horrible', 'apology', 'disgusting', 'worst'],
      dtype='object')


In [3]:
yelpdata[['text', 'y']]

Unnamed: 0,text,y
0,Spectacular! Was here for New Year's Eve dinn...,5
1,We've been waiting for 1847 Stamm House to ope...,5
2,I was visiting from out of state and had this ...,1
3,We had a great sampling of the menu. The beef ...,5
4,We tried the relaunched Stamm House last night...,4
5,I too am hoping that this unique and beautiful...,3
6,I had high hopes that these guys would do righ...,3
7,Excellent food and beautifully redone. Service...,5
8,I've been to 1847 at the Stamm House five time...,2
9,A big thank you to the owners who kept the ori...,5


In [4]:
text = yelpdata['text']
labels = yelpdata['y']

In [5]:
def prat_lemmatize(token,tag):
	Noun_tags = ['NN','NNP','NNPS','NNS']
	Verb_tags = ['VB','VBD','VBG','VBN','VBP','VBZ']
	lemmatizer = WordNetLemmatizer()
	if tag in Noun_tags:
		return lemmatizer.lemmatize(token,'n')
	elif tag in Verb_tags:
		return lemmatizer.lemmatize(token,'v')
	else:
		return lemmatizer.lemmatize(token,'n')

In [6]:
def preprocessing(text):
	text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
	tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
	tokens = [word.lower() for word in tokens]
	stopwds = stopwords.words('english')
	tokens = [token for token in tokens if token not in stopwds]
	tokens = [word for word in tokens if len(word)>=3]
	stemmer = PorterStemmer()
	tokens = [stemmer.stem(word) for word in tokens]
	tagged_corpus = pos_tag(tokens)
	pre_proc_text =   " ".join([prat_lemmatize(token,tag) for token,tag in tagged_corpus])
	return pre_proc_text

In [7]:
x_preprocessed = []
for i in text.values:
	x_preprocessed.append(preprocessing(i))

In [8]:
x_preprocessed

['spectacular new year eve dinner back daughter birthday even better crab cervich incred fresh restrain prep crab sirloin steak dinner qualiti outstand flawlessli cook medium rare serv green salad lightli perfectli dress even salt bake potato roll melt mouth good dessert sublim rave review rum raisin brioch bread pud see soon stamm hous',
 'wait 1847 stamm hous open glad miss open kink work review mention go last night fabul fish fri new take wisconsin fav whole piec whitefish delici cole slaw pick side amaz wait return tri rest menu mayb night quiet one glad see place hop',
 'visit state place recommend friday night fish fri mistak tell phone cod fish fri person tell cod cost bluegil bill come inform cod one lousi piec cod outrag overpr also order moscow mule serv glass instead customari copper mug',
 'great sampl menu beef stew great valu realli good well season plenti take home nightli special chicken dumpl except stuf bring put fork stuf start order farmer veget order spaetzl veget

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', max_features = 10000, strip_accents='unicode', norm='l2')
x_2 = vectorizer.fit_transform(x_preprocessed).todense()

In [11]:
x_2.shape

(36692, 10000)

In [12]:
x_2

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
print(x_2.shape)
print(yelpdata.shape)

(36692, 10000)
(36692, 42)


In [16]:
cols = vectorizer.get_feature_names()

In [17]:
new_train = pd.DataFrame(x_2)
new_train.columns = cols

In [18]:
new_train

Unnamed: 0,00pm,100,10am,10pm,11am,11pm,150,1pm,1st,2011,...,yuck,yum,yum yum,yummi,yummi food,yup,zero,zero star,zone,zucchini
0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
3,0.0,0.000000,0.0,0.155983,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.145767
4,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
5,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
6,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
8,0.0,0.096276,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
9,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [19]:
print(new_train.shape)
print(yelpdata.shape)

(36692, 10000)
(36692, 42)


In [20]:
train = pd.concat([yelpdata, new_train], axis = 1)

In [22]:
train = train.drop(["text", "name", "city", "categories", "date"], axis = 1)

In [24]:
train = train.dropna()
train.head(10)

Unnamed: 0,y,useful,funny,cool,longitude,latitude,nchar,nword,sentiment_score,gem,...,yuck,yum,yum yum,yummi,yummi food,yup,zero,zero star,zone,zucchini
0,5,1,0,0,-89.494781,43.104086,517,45,3.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5,1,0,0,-89.494781,43.104086,423,29,2.4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,0,0,-89.494781,43.104086,441,32,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2,0,1,-89.494781,43.104086,1758,105,1.076923,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,0,1,0,-89.494781,43.104086,710,44,1.833333,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3,0,0,0,-89.494781,43.104086,1486,81,-0.428571,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,5,0,0,0,-89.494781,43.104086,283,19,2.4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2,0,0,0,-89.494781,43.104086,1239,81,0.5,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,5,2,0,2,-89.494781,43.104086,800,43,3.333333,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,3,0,0,0,-89.494781,43.104086,1593,98,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
import pickle
output = open("train4.pkl", "wb")
pickle.dump(train, output)