## Reading Libraries 

In [33]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
import nltk
from nltk.tokenize import MWETokenizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn import metrics
import numpy as np
from nltk.stem import WordNetLemmatizer
import re
from nltk.tokenize import RegexpTokenizer
import gensim
import logging
from gensim.models.doc2vec import LabeledSentence
from gensim.models import word2vec
from tqdm import tqdm

In [None]:
nltk.download('wordnet')

In [2]:
lab_data = pd.read_csv('./Training Dataset-20191010/labeled_data.csv')
# unlabeled_data = pd.read_csv('./Training Dataset-20191010/unlabeled_data.csv')

In [None]:
lab_data.head()['text']

In [None]:
lab_data['text'][1]

In [3]:
def remove_extra_characters(raw_text):
    processed_text = re.sub('\\n','', raw_text)
    processed_text = re.sub('\\r','', processed_text)
    processed_text = re.sub("\\'", "\'",processed_text)
    return processed_text

In [4]:
lab_data['text'] = lab_data.apply(lambda row: remove_extra_characters(row['text'].strip()), axis=1)

## Case Normalisation, Tokenization and Stop words removal

In [None]:
stopwords = []
with open('./stopwords_en.txt') as f:
    stopwords = f.read().splitlines()
stopwords = set(stopwords)

In [5]:
def lemmatization(token_list):
    lemmatizer = WordNetLemmatizer()
    lem_token = []
    for each in token_list :
#         print(each ,":", lemmatizer.lemmatize(each)) 
        lem_token.append(lemmatizer.lemmatize(each))
    return lem_token

In [6]:
tokenizer = RegexpTokenizer("\w+(?:[']\w+)?")

In [7]:
def token(raw_data):
    raw_data1 = raw_data.lower()
    tokenised = tokenizer.tokenize(raw_data1)
#     tokenised = nltk.tokenize.word_tokenize(raw_data1)
    lem_token = lemmatization(tokenised)
#     stopwords_tokens = [w for w in tokenised if not w in stopwords]
    processed_data = ' '.join(lem_token)
        
    return(processed_data)

In [8]:
lab_data['text'] = lab_data.apply(lambda row: token(row['text'].strip()), axis=1)

In [None]:
lab_data['text'][1]

## TFIDF + Logistic Regression

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer 


vectorizer = TfidfVectorizer(lowercase = True,analyzer = 'word',ngram_range = (1,2))
    
train_review = vectorizer.fit_transform(lab_data['text'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(train_review, lab_data['label'],test_size=0.20, random_state=1)

In [None]:
def instantiate_cross_val(model):
    # perfroming 10 fold cross validation
    skf = StratifiedKFold(n_splits=10)
    params = {}
    nb = model
    gs = GridSearchCV(nb, cv=skf, param_grid=params, return_train_score=False)
    return gs

In [None]:
model = LogisticRegression()
gs = instantiate_cross_val(model)

clf=gs.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

In [None]:
print('parameters:', clf.best_estimator_.get_params())

In [None]:
# Create regularization penalty space
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# multi_class = ['multinomial','ovr']


# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [None]:
clf = GridSearchCV(model, hyperparameters, cv=10, verbose=0)

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [None]:
print('Best C:', best_model.best_estimator_.get_params())

In [None]:
y_best_pred = best_model.predict(X_test)

In [None]:
print("Accuracy:", metrics.accuracy_score(y_test, y_best_pred))

In [17]:
log_model = LogisticRegression(random_state=1, C=1.5, solver='sag', multi_class = 'multinomial')
log_model.fit(X_train, y_train)
y_pred = log_model.predict(X_test)

In [18]:
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.6111


In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [None]:
plt.figure()
plot_confusion_matrix(cm, classes=['1', '2', '3', '4', '5'],
                      title='Confusion matrix')

In [None]:
pred_probab = clf.predict_proba(test_review)

In [None]:
pred_probab

In [None]:
p_test = []

In [None]:
for i in range(len(pred_probab)):
    p_test.append(max(pred_probab[i]))

In [None]:
train_data = pd.DataFrame({'text':X_test, 'label':y_test, 'p_test':p_test, 'y_pred':y_pred})
train_data.head()

In [None]:
len(train_data[(train_data['p_test'] > 0.9) & (train_data['label']==train_data['y_pred'])])

## Word2vec + Logistic

In [10]:
lab_data['text'][1]

"flirted with giving this two star but that's a pretty damning rating for what might have just been an off night new to the east side and so we don't know many of these hidden gem but me and the fiance met her friend for drink here and ended up getting some thing to nibble first off service wa pretty slow which wa unusual because the restaurant is pretty small and galley style you would think it would be easy for server to routinely hit up table a you pas by the fiance ordered the quinoa salad and said it wa pretty good but dry i wasn't too hungry and so i simply ordered the bruchetta 3 way which came with burnt crostinis and i ordered a side of fry which were either hard or chewy the friend ordered the macaroni cheese and added chicken and bacon her usual order and liked it can't remember the last time i thought to myself huh they failed at fry so like i said two star but the decor wa good it wa a good place to have a conversation and i might be back to try more expensive fare but ah 

In [11]:
sentences = []
for review in lab_data['text']:
    sentences.append(review.split(' '))

In [12]:
sentences[1]

['flirted',
 'with',
 'giving',
 'this',
 'two',
 'star',
 'but',
 "that's",
 'a',
 'pretty',
 'damning',
 'rating',
 'for',
 'what',
 'might',
 'have',
 'just',
 'been',
 'an',
 'off',
 'night',
 'new',
 'to',
 'the',
 'east',
 'side',
 'and',
 'so',
 'we',
 "don't",
 'know',
 'many',
 'of',
 'these',
 'hidden',
 'gem',
 'but',
 'me',
 'and',
 'the',
 'fiance',
 'met',
 'her',
 'friend',
 'for',
 'drink',
 'here',
 'and',
 'ended',
 'up',
 'getting',
 'some',
 'thing',
 'to',
 'nibble',
 'first',
 'off',
 'service',
 'wa',
 'pretty',
 'slow',
 'which',
 'wa',
 'unusual',
 'because',
 'the',
 'restaurant',
 'is',
 'pretty',
 'small',
 'and',
 'galley',
 'style',
 'you',
 'would',
 'think',
 'it',
 'would',
 'be',
 'easy',
 'for',
 'server',
 'to',
 'routinely',
 'hit',
 'up',
 'table',
 'a',
 'you',
 'pas',
 'by',
 'the',
 'fiance',
 'ordered',
 'the',
 'quinoa',
 'salad',
 'and',
 'said',
 'it',
 'wa',
 'pretty',
 'good',
 'but',
 'dry',
 'i',
 "wasn't",
 'too',
 'hungry',
 'and',
 's

In [13]:
# Set values for various parameters
num_features = 300    # Word vector dimensionality                      
min_word_count = 10   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
# context = 10          # Context window size                                                                                    

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
# model.init_sims(replace=True)

Training model...


In [14]:
model.doesnt_match("man woman child kitchen".split())

  """Entry point for launching an IPython kernel.


'kitchen'

In [15]:
model.wv.syn0.shape

  """Entry point for launching an IPython kernel.


(12109, 200)

In [None]:
from itertools import islice
list(islice(model.wv.vocab, 11030, 13050))

In [19]:
train, test = train_test_split(lab_data, test_size=0.3, random_state = 42)

In [20]:
def w2v_tokenize_text(text):
    tokens = text.split(' ')
    return tokens

test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values
train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['text']), axis=1).values

In [21]:
# test_tokenized = test['text'].values
train_tokenized

array([list(['i', 'found', 'this', 'place', 'to', 'be', 'overpriced', 'especially', 'the', 'food', 'the', 'appetizer', 'we', 'got', 'were', 'not', 'worth', 'the', 'money', 'especially', 'when', 'it', 'came', 'to', 'the', 'portion', 'if', 'i', 'do', 'come', 'back', "i'll", 'make', 'sure', 'to', 'eat', 'before', 'i', 'go', 'but', 'i', 'think', 'this', 'would', 'be', 'a', 'good', 'place', 'to', 'go', 'when', 'the', 'weather', 'is', 'nice', 'since', 'the', 'back', 'patio', 'look', 'relaxing', 'if', 'it', "weren't", 'for', 'that', "i'm", 'not', 'sure', 'i', 'would', 'return']),
       list(['delicious', 'food', 'good', 'coffee', 'very', 'friendly', 'food', 'portion', 'were', 'average', 'for', 'la', 'vega', 'my', 'youngest', 'had', 'pancake', 'with', 'egg', 'and', 'bacon', 'pancake', 'were', 'large', 'fluffy', 'and', 'moist', 'hubby', 'had', 'corn', 'beef', 'hash', 'with', 'over', 'easy', 'egg', 'and', 'it', 'wa', 'hash', 'a', 'i', 'have', 'never', 'seen', 'it', 'like', 'cut', 'up', 'slice',

In [22]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for word in words:
        if isinstance(word, np.ndarray):
            mean.append(word)
        elif word in wv.vocab:
            mean.append(wv.syn0norm[wv.vocab[word].index])
            all_words.add(wv.vocab[word].index)

    if not mean:
        logging.warning("cannot compute similarity with no input %s", words)
        # FIXME: remove these examples in pre-processing
        return np.zeros(wv.vector_size,)

    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def  word_averaging_list(wv, text_list):
    return np.vstack([word_averaging(wv, review) for review in text_list ])

In [23]:
X_train_word_average = word_averaging_list(model.wv,train_tokenized)
X_test_word_average = word_averaging_list(model.wv,test_tokenized)

  


In [26]:
logreg = LogisticRegression(C=1.5, solver='sag', multi_class = 'multinomial')
logreg.fit(X_train_word_average, train['label'])
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % metrics.accuracy_score(y_pred, test.label))
# print(classification_report(test.label, y_pred,target_names=my_tags))

accuracy 0.5736
