# OutdoorSeating

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.utils import np_utils
from tensorflow.keras.preprocessing.text import Tokenizer

import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

yelp_review = "./OutdoorSeating_>49.csv"
res_reviews= pd.read_csv(yelp_review)
res_reviews.head()

Unnamed: 0,text,OutdoorSeating
0,"['Very relax friendly environment, the sandwic...",1
1,['Fun visit. Pizza crust was firm and crunchy....,1
2,['Great new E Milton Square spot - love sittin...,1
3,"[""Customer service was very good; the employee...",1
4,"[""Delicious food and friendly service. We had...",1


In [2]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop = stopwords.words('english')
ps = PorterStemmer()

res_reviews['text'] = res_reviews['text'].apply(lambda x: ' '.join([ps.stem(word) for word in x.split() if word not in (stop)]))

[nltk_data] Downloading package stopwords to /Users/aj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Feature generation from BagofWords (CountVectorizer)

In [3]:
#Feature Generation using BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
import nltk
nltk.download ('stopwords')
nltk.download ('punkt')
#tokenizer to remove unwanted elements from out data like symbols and numbers
token = RegexpTokenizer(r'[a-zA-Z0-9]+')
cv = CountVectorizer(lowercase=True,stop_words='english',ngram_range = (1,1),tokenizer = token.tokenize)
text_counts= cv.fit_transform(res_reviews["text"])

[nltk_data] Downloading package stopwords to /Users/aj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/aj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_counts, res_reviews['OutdoorSeating'], test_size=0.3, random_state=123) # change attribute

In [16]:
# Modeling Using Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("BagOfWords MultinomialNB accuracy:",metrics.accuracy_score(y_test, predicted))

BagOfWords MultinomialNB accuracy: 0.6974647887323944


In [17]:
# Modeling using Logistic Regression
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
clf = LogisticRegression()
clf.fit(X_train, y_train)
predicted= clf.predict(X_test)

print("BagOfWords LogisticRegression accuracy:", metrics.accuracy_score(y_test, predicted))

BagOfWords LogisticRegression accuracy: 0.7194366197183099


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Feature generation using TF-IDF (TfidfVectorizer)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
text_tf= tf.fit_transform(res_reviews["text"])

#split and train TF-IDF
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    text_tf, res_reviews["OutdoorSeating"], test_size=0.2, random_state=123) # change attribute

In [19]:
# Modeling Using Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
clf = MultinomialNB().fit(X_train, y_train)
predicted= clf.predict(X_test)
print("TF-IDF MultinomialNB accuracy:",metrics.accuracy_score(y_test, predicted))

TF-IDF MultinomialNB accuracy: 0.6627218934911243


In [20]:
# Modeling using Logistic Regression
from sklearn.linear_model import LogisticRegression
from nltk.classify.scikitlearn import SklearnClassifier
clf = LogisticRegression()
clf.fit(X_train, y_train)
predicted= clf.predict(X_test)

print("TF-IDF LogisticRegression accuracy:", metrics.accuracy_score(y_test, predicted))

TF-IDF LogisticRegression accuracy: 0.7320371935756551
