## Reading Libraries 

In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn import metrics
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import gensim
import logging
from gensim.models.doc2vec import LabeledSentence
from gensim.models import word2vec
from tqdm import tqdm

In [10]:
# %pip install gensim
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\palc0001\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [89]:
lab_data = pd.read_csv('./Training Dataset-20191023/labeled_data.csv')
unlabeled_data = pd.read_csv('./Training Dataset-20191023/unlabeled_data.csv')

In [129]:
lab_data.head()

Unnamed: 0,text,label
0,the new rule is if you are waiting for a table...,4
1,flirted with giving this two star but that's a...,3
2,i wa staying at planet hollywood across the st...,5
3,food is good but price are super expensive buc...,2
4,worse company to deal with they do horrible wo...,1


In [65]:
lab_data['text'][1]

'Flirted with giving this two stars, but that\'s a pretty damning rating for what might have just been an off night...\r\n\r\nNew to the East side, and so we don\'t know many of these hidden gems, but me and the fiance met her friend for drinks here and ended up getting some things to nibble. \r\n\r\nFirst off, service was pretty slow, which was unusual because the restaurant is pretty small and galley style. You would think it would be easy for servers to routinely hit up tables as you pass by. \r\n\r\nThe fiance ordered the Quinoa Salad, and said it was pretty good, but dry. I wasn\'t too hungry and so I simply ordered the Bruchetta 3-way which came with burnt crostinis. And I ordered a side of fries, which were either hard or chewy.\r\n\r\nThe friend ordered the macaroni & cheese, and added chicken and bacon (her usual order) and liked it.  \r\n\r\nCan\'t remember the last time I thought to myself- "Huh... they failed at fries..." So, like I said- two stars. But, the decor was good,

In [67]:
def remove_extra_characters(raw_text):
    processed_text = re.sub('\\n','', raw_text)
    processed_text = re.sub('\\r','', processed_text)
    processed_text = re.sub("\\'", "\'",processed_text)
    processed_text = re.sub(r'\d+','', processed_text)
    return processed_text

In [68]:
lab_data['text'] = lab_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

## Case Normalisation, Tokenization and Stop words removal

In [None]:
stopwords = []
with open('./stopwords_en.txt') as f:
    stopwords = f.read().splitlines()
stopwords = set(stopwords)

In [69]:
def lemmatization(token_list):
    lemmatizer = WordNetLemmatizer()
    lem_token = []
    for each in token_list :
#         print(each ,":", lemmatizer.lemmatize(each)) 
        lem_token.append(lemmatizer.lemmatize(each))
    return lem_token

In [70]:
tokenizer = RegexpTokenizer("\w+(?:[']\w+)?")

In [71]:
def token(raw_data):
    raw_data1 = raw_data.lower()
    tokenised = tokenizer.tokenize(raw_data1)
#     tokenised = nltk.tokenize.word_tokenize(raw_data1)
    lem_token = lemmatization(tokenised)
#     stopwords_tokens = [w for w in tokenised if not w in stopwords]
    processed_data = ' '.join(lem_token)
        
    return(processed_data)

In [72]:
lab_data['text'] = lab_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [90]:
lab_data['text'][1]

"flirted with giving this two star but that's a pretty damning rating for what might have just been an off night new to the east side and so we don't know many of these hidden gem but me and the fiance met her friend for drink here and ended up getting some thing to nibble first off service wa pretty slow which wa unusual because the restaurant is pretty small and galley style you would think it would be easy for server to routinely hit up table a you pas by the fiance ordered the quinoa salad and said it wa pretty good but dry i wasn't too hungry and so i simply ordered the bruchetta way which came with burnt crostinis and i ordered a side of fry which were either hard or chewy the friend ordered the macaroni cheese and added chicken and bacon her usual order and liked it can't remember the last time i thought to myself huh they failed at fry so like i said two star but the decor wa good it wa a good place to have a conversation and i might be back to try more expensive fare but ah th

In [91]:
unlabeled_data['text'][0]

"Had a good experience when my wife and I sat at the bar. Great pizza and wings. \r\n\r\nHowever, we tried to go recently with a larger group (8 people) and it was 1.25 hr wait. At 5pm on a Wednesday... Riiiiight.  \r\n\r\nI tried to call ahead and they don't accept call aheads. They apparently only have 1 table capable of seating larger parties. Kinda missed the mark on that one Oreganos. Brand spankin new building and all. \r\n\r\nSo we went across the street to Native NewYorker and got seated immediately."

In [92]:
unlabeled_data['text'] = unlabeled_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

In [93]:
unlabeled_data['text'] = unlabeled_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [94]:
unlabeled_data['text'][0]

"had a good experience when my wife and i sat at the bar great pizza and wing however we tried to go recently with a larger group people and it wa hr wait at pm on a wednesday riiiiight i tried to call ahead and they don't accept call aheads they apparently only have table capable of seating larger party kinda missed the mark on that one oregano brand spankin new building and all so we went across the street to native newyorker and got seated immediately"

## TFIDF

In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer 


vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
    
train_review = vectorizer.fit_transform(lab_data['text'])

In [None]:
?TfidfVectorizer

In [79]:
X_train, X_test, y_train, y_test = train_test_split(train_review, lab_data['label'],test_size=0.20, random_state=1)

## Logistic Regression

In [None]:
def instantiate_cross_val(model):
    # perfroming 10 fold cross validation
    skf = StratifiedKFold(n_splits=10)
    params = {}
    nb = model
    gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=False)
    return gs

In [None]:
model = LogisticRegression()
gs = instantiate_cross_val(model)

clf=gs.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
print('parameters:', clf.best_estimator_.get_params())

In [80]:
log_model = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [81]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6123


In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# multi_class = ['multinomial','ovr']


# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
clf = GridSearchCV(model, hyperparameters, cv=10, verbose=0)

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
print('Best C:', best_model.best_estimator_.get_params())

In [None]:
y_best_pred = best_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_best_pred))

## SVM

In [82]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train, y_train)
prediction_linear = classifier_linear.predict(X_test)
# results
classification_report(y_test, prediction_linear, output_dict=True)

{'1': {'precision': 0.719626168224299,
  'recall': 0.765788165091994,
  'f1-score': 0.741989881956155,
  'support': 2011},
 '2': {'precision': 0.5308947108255067,
  'recall': 0.5327380952380952,
  'f1-score': 0.5318148056449616,
  'support': 2016},
 '3': {'precision': 0.5209549071618037,
  'recall': 0.49570923775870773,
  'f1-score': 0.5080186239006725,
  'support': 1981},
 '4': {'precision': 0.5334665334665335,
  'recall': 0.5415821501014199,
  'f1-score': 0.5374937091092099,
  'support': 1972},
 '5': {'precision': 0.7348717948717949,
  'recall': 0.7094059405940594,
  'f1-score': 0.7219143576826196,
  'support': 2020},
 'accuracy': 0.6097,
 'macro avg': {'precision': 0.6079628229099875,
  'recall': 0.6090447177568553,
  'f1-score': 0.6082462756587237,
  'support': 10000},
 'weighted avg': {'precision': 0.608590066204785,
  'recall': 0.6097,
  'f1-score': 0.6088869791623556,
  'support': 10000}}

In [84]:
print(metrics.accuracy_score( y_test,prediction_linear))

0.6097


## Neural networks

In [None]:
seed = 7
np.random.seed(seed)
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

In [None]:
def batch_generator(X_data, y_data, batch_size):
    dim = X_data.shape[1]
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch/batch_size
    counter=0
    index = np.arange(np.shape(y_data)[0])
    while 1:
        index_batch = index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X_data[index_batch,:].toarray()
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield X_batch,y_batch
        if (counter > number_of_batches):
            counter=0
            
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=dim))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit_generator(generator=batch_generator(X_train, y_train, 32),
                    epochs=5, validation_data=(X_test, y_test),
                    steps_per_epoch=X_train.shape[0]/32)

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
plt.figure()
plot_confusion_matrix(cm, classes=['1', '2', '3', '4', '5'],
                      title='Confusion matrix')

## Predict unlabeled data

In [95]:
unlabeled_data['text'][0]

"had a good experience when my wife and i sat at the bar great pizza and wing however we tried to go recently with a larger group people and it wa hr wait at pm on a wednesday riiiiight i tried to call ahead and they don't accept call aheads they apparently only have table capable of seating larger party kinda missed the mark on that one oregano brand spankin new building and all so we went across the street to native newyorker and got seated immediately"

In [97]:
unlabeled_test = vectorizer.transform(unlabeled_data['text'])

In [103]:
pred_class = log_model.predict(unlabeled_test)

In [113]:
pred_class[10000]

2

In [98]:
pred_probab = log_model.predict_proba(unlabeled_test)

In [112]:
pred_probab[10000]

array([0.23060546, 0.31986946, 0.19747045, 0.13878374, 0.11327089])

In [101]:
p_test = []

In [107]:
for i in range(len(pred_probab)):
    p_test.append(max(pred_probab[i]))

In [124]:
unlabeled_data['label'] = pred_class
unlabeled_data['probability'] = p_test

In [125]:
unlabeled_data.head()

Unnamed: 0,text,label,probability
0,had a good experience when my wife and i sat a...,3,0.404494
1,on my first to montreal with my gf we came her...,4,0.402237
2,one of our favorite place to go when it's cold...,5,0.711828
3,the doctor wa very nice got in in a good amoun...,1,0.605171
4,the nook is an immediate phoenix staple i came...,5,0.771637


In [126]:
new_train_data = unlabeled_data[unlabeled_data['probability'] > 0.8]

In [127]:
new_train_data

Unnamed: 0,text,label,probability
15,a hidden gem great cake love this place good s...,5,0.826098
22,the worst system ever the black box sock only ...,1,0.802013
27,not pleased with customer service we patiently...,1,0.856839
38,i don't write review often i only do it when i...,1,0.898055
88,do not get your policy with them worst custome...,1,0.963723
...,...,...,...
599962,bought a groupon and had a great time my wife ...,5,0.878433
599968,where else will a business answer their phone ...,1,0.844817
599970,worse dme company ever day before my due date ...,1,0.923103
599984,worst mcdonalds i've ever been to on multiple ...,1,0.926845


In [128]:
new_train_data.drop(['probability'], axis=1, inplace=True)
new_train_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,text,label
15,a hidden gem great cake love this place good s...,5
22,the worst system ever the black box sock only ...,1
27,not pleased with customer service we patiently...,1
38,i don't write review often i only do it when i...,1
88,do not get your policy with them worst custome...,1
...,...,...
599962,bought a groupon and had a great time my wife ...,5
599968,where else will a business answer their phone ...,1
599970,worse dme company ever day before my due date ...,1
599984,worst mcdonalds i've ever been to on multiple ...,1


In [130]:
train_data = pd.concat([lab_data, new_train_data])
len(train_data)

106436

In [145]:
train_data

Unnamed: 0,text,label
0,the new rule is if you are waiting for a table...,4
1,flirted with giving this two star but that's a...,3
2,i wa staying at planet hollywood across the st...,5
3,food is good but price are super expensive buc...,2
4,worse company to deal with they do horrible wo...,1
...,...,...
599962,bought a groupon and had a great time my wife ...,5
599968,where else will a business answer their phone ...,1
599970,worse dme company ever day before my due date ...,1
599984,worst mcdonalds i've ever been to on multiple ...,1


In [141]:
from sklearn.feature_extraction.text import TfidfVectorizer 


vectorizer_new = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2), min_df=3, max_df=.99)
    
train = vectorizer_new.fit_transform(train_data['text'])

In [142]:
X_train, X_test, y_train, y_test = train_test_split(train, train_data['label'],test_size=0.20, random_state=1)

In [143]:
log_model_new = LogisticRegression(random_state=1, C=1, solver='sag', multi_class = 'multinomial')
log_model_new.fit(X_train, y_train)
y_pred = log_model_new.predict(X_test)

In [144]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8129932356257046


## Word2vec + Logistic

In [34]:

train_data_1 = pd.DataFrame({'review':unlabeled_data['text']})
train_data_2 = pd.DataFrame({'review':lab_data['text']})

In [44]:
train_data = pd.concat([train_data_1, train_data_2])
len(train_data)

650000

In [45]:
train_data.head()

Unnamed: 0,review
0,had a good experience when my wife and i sat a...
1,on my first to montreal with my gf we came her...
2,one of our favorite place to go when it's cold...
3,the doctor wa very nice got in in a good amoun...
4,the nook is an immediate phoenix staple i came...


In [47]:
# sentences = []
# for review in lab_data['text']:
#     sentences.append(review.split())
# for review in unlabeled_data['text']:
#     sentences.append(review.split)
sentences = train_data.apply(lambda row: row['review'].split(), axis=1).values

In [48]:
sentences

array([list(['had', 'a', 'good', 'experience', 'when', 'my', 'wife', 'and', 'i', 'sat', 'at', 'the', 'bar', 'great', 'pizza', 'and', 'wing', 'however', 'we', 'tried', 'to', 'go', 'recently', 'with', 'a', 'larger', 'group', '8', 'people', 'and', 'it', 'wa', '1', '25', 'hr', 'wait', 'at', '5pm', 'on', 'a', 'wednesday', 'riiiiight', 'i', 'tried', 'to', 'call', 'ahead', 'and', 'they', "don't", 'accept', 'call', 'aheads', 'they', 'apparently', 'only', 'have', '1', 'table', 'capable', 'of', 'seating', 'larger', 'party', 'kinda', 'missed', 'the', 'mark', 'on', 'that', 'one', 'oregano', 'brand', 'spankin', 'new', 'building', 'and', 'all', 'so', 'we', 'went', 'across', 'the', 'street', 'to', 'native', 'newyorker', 'and', 'got', 'seated', 'immediately']),
       list(['on', 'my', 'first', 'to', 'montreal', 'with', 'my', 'gf', 'we', 'came', 'here', 'to', 'eat', 'a', 'nice', 'mid', 'day', 'lunch', 'before', 'walking', 'around', 'more', 'and', 'departing', 'from', 'montreal', 'now', 'obviously', 'i

In [49]:
from gensim.models import phrases
bigrams = phrases.Phrases(sentences)


In [50]:
print(bigrams["this is the new york".split()])

['this', 'is', 'the', 'new_york']


In [None]:
sentences[0]

In [51]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 5   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size                                                                                    

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(bigrams[sentences], workers=num_workers, \
            size=num_features, min_count=3)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
# model.init_sims(replace=True)

Training model...


In [None]:
?word2vec.Word2Vec

In [52]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.
  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'kitchen'

In [53]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(109094, 300)

In [None]:
from itertools import islice
list(islice(model.wv.vocab, 11030, 13050))

In [54]:
train, test = train_test_split(lab_data, test_size=0.3, random_state = 42)

In [55]:
def w2v_tokenize_text(text):
    tokens = text.split(' ')
    return tokens

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

In [56]:
# test_tokenized = test['text'].values
train_tokenized

array([list(['i', 'found', 'this', 'place', 'to', 'be', 'overpriced', 'especially', 'the', 'food', 'the', 'appetizer', 'we', 'got', 'were', 'not', 'worth', 'the', 'money', 'especially', 'when', 'it', 'came', 'to', 'the', 'portion', 'if', 'i', 'do', 'come', 'back', "i'll", 'make', 'sure', 'to', 'eat', 'before', 'i', 'go', 'but', 'i', 'think', 'this', 'would', 'be', 'a', 'good', 'place', 'to', 'go', 'when', 'the', 'weather', 'is', 'nice', 'since', 'the', 'back', 'patio', 'look', 'relaxing', 'if', 'it', "weren't", 'for', 'that', "i'm", 'not', 'sure', 'i', 'would', 'return']),
       list(['delicious', 'food', 'good', 'coffee', 'very', 'friendly', 'food', 'portion', 'were', 'average', 'for', 'la', 'vega', 'my', 'youngest', 'had', 'pancake', 'with', 'egg', 'and', 'bacon', 'pancake', 'were', 'large', 'fluffy', 'and', 'moist', 'hubby', 'had', 'corn', 'beef', 'hash', 'with', 'over', 'easy', 'egg', 'and', 'it', 'wa', 'hash', 'a', 'i', 'have', 'never', 'seen', 'it', 'like', 'cut', 'up', 'slice',

In [57]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, review) for review in text_list ])

In [58]:
X_train_word_average = word_averaging_list(model.wv,train_tokenized)
X_test_word_average = word_averaging_list(model.wv,test_tokenized)

  


In [62]:
logreg = LogisticRegression(random_state=1, C=2, solver='sag', multi_class = 'multinomial')
logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % metrics.accuracy_score(y_pred, test.label))
# print(classification_report(test.label, y_pred,target_names=my_tags))

accuracy 0.5972666666666666


In [63]:
from sklearn import svm
from sklearn.metrics import classification_report
# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
classifier_linear.fit(X_train_word_average, train['label'])
prediction_linear = classifier_linear.predict(X_test_word_average)
# results
classification_report(test.label, prediction_linear, output_dict=True)

{'1': {'precision': 0.7084106369820655,
  'recall': 0.7551087673038892,
  'f1-score': 0.731014677728143,
  'support': 3034},
 '2': {'precision': 0.5207357859531773,
  'recall': 0.519,
  'f1-score': 0.5198664440734557,
  'support': 3000},
 '3': {'precision': 0.48853132488873674,
  'recall': 0.4932595921189077,
  'f1-score': 0.4908840729274166,
  'support': 2893},
 '4': {'precision': 0.5197498354180382,
  'recall': 0.5295103957075789,
  'f1-score': 0.5245847176079734,
  'support': 2982},
 '5': {'precision': 0.7387291444799432,
  'recall': 0.6732449045616306,
  'f1-score': 0.7044685172647258,
  'support': 3091},
 'accuracy': 0.5956666666666667,
 'macro avg': {'precision': 0.5952313455443923,
  'recall': 0.5940247319384013,
  'f1-score': 0.594163685920343,
  'support': 15000},
 'weighted avg': {'precision': 0.5972101432113552,
  'recall': 0.5956666666666667,
  'f1-score': 0.5959632868132543,
  'support': 15000}}