In [1]:
import os
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
from numba import jit, cuda 
import pandas as pd
import tqdm
import numpy as np
import nltk
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from nltk import word_tokenize
from nltk.corpus import stopwords
import pickle
import os.path
nltk.download('stopwords')
nltk.download('punkt')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
train_df = pd.read_csv('../content/gdrive/My Drive/Colab Notebooks/datasets/amazon_dataset/amazon_review_polarity_csv/train.csv')
train_df.columns = ['rating','headline','review']
train_df['merged_text'] = train_df.headline +' ' + train_df.review
train_df.head(10)

Unnamed: 0,rating,headline,review,merged_text
0,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...,The best soundtrack ever to anything. I'm read...
1,2,Amazing!,This soundtrack is my favorite music of all ti...,Amazing! This soundtrack is my favorite music ...
2,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...,Excellent Soundtrack I truly like this soundtr...
3,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine...","Remember, Pull Your Jaw Off The Floor After He..."
4,2,an absolute masterpiece,I am quite sure any of you actually taking the...,an absolute masterpiece I am quite sure any of...
5,1,Buyer beware,"This is a self-published book, and if you want...","Buyer beware This is a self-published book, an..."
6,2,Glorious story,I loved Whisper of the wicked saints. The stor...,Glorious story I loved Whisper of the wicked s...
7,2,A FIVE STAR BOOK,I just finished reading Whisper of the Wicked ...,A FIVE STAR BOOK I just finished reading Whisp...
8,2,Whispers of the Wicked Saints,This was a easy to read book that made me want...,Whispers of the Wicked Saints This was a easy ...
9,1,The Worst!,A complete waste of time. Typographical errors...,The Worst! A complete waste of time. Typograph...


In [4]:
test_df = pd.read_csv('../content/gdrive/My Drive/Colab Notebooks/datasets/amazon_dataset/amazon_review_polarity_csv/test.csv')
test_df.columns = ['rating','headline','review']
test_df['merged_text'] = test_df.headline +' ' + test_df.review
test_df.head(10)

Unnamed: 0,rating,headline,review,merged_text
0,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...,One of the best game music soundtracks - for a...
1,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...,Batteries died within a year ... I bought this...
2,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...,"works fine, but Maha Energy is better Check ou..."
3,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...,Great for the non-audiophile Reviewed quite a ...
4,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...,DVD Player crapped out after one year I also b...
5,1,Incorrect Disc,"I love the style of this, but after a couple y...","Incorrect Disc I love the style of this, but a..."
6,1,DVD menu select problems,I cannot scroll through a DVD menu that is set...,DVD menu select problems I cannot scroll throu...
7,2,Unique Weird Orientalia from the 1930's,"Exotic tales of the Orient from the 1930's. ""D...",Unique Weird Orientalia from the 1930's Exotic...
8,1,"Not an ""ultimate guide""","Firstly,I enjoyed the format and tone of the b...","Not an ""ultimate guide"" Firstly,I enjoyed the ..."
9,2,Great book for travelling Europe,"I currently live in Europe, and this is the bo...",Great book for travelling Europe I currently l...


In [5]:
##listing the unique values
train_df.rating.unique()

array([2, 1])

In [0]:
X_train = train_df.merged_text
y_train = train_df.rating

X_test = test_df.merged_text
y_test = test_df.rating

In [0]:
del train_df
del test_df

## 1. Glove pretrained embeddings


```
# download from here
!wget http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
```



In [0]:
#!wget http://www-nlp.stanford.edu/data/glove.840B.300d.zip
#!unzip glove.840B.300d.zip

In [0]:
###load embeddings
embeddings_index = {}

In [0]:
def load_embeddings():
  f = open('glove.840B.300d.txt')
  for line in tqdm.tqdm(f):
    try:
      values = line.split()
      word = values[0]
      vec = np.asarray(values[1:], dtype='float32')
      embeddings_index[word] = vec
    except:
      continue
  f.close()

if(os.path.isfile('embedding_index.pickle')):
  pickle_in = open("embedding_index.pickle","rb")
  embeddings_index = pickle.load(pickle_in)
else:
  load_embeddings()
  pickle_out = open("embedding_index.pickle","wb")
  pickle.dump(embeddings_index, pickle_out)
  pickle_out.close()

In [0]:
def sent2vec(s):
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [12]:
# create sentence vectors using the above function for training and test set
xtrain_glove = np.array([sent2vec(x) for x in tqdm.tqdm(X_train[:932390])]) 
xtest_glove = np.array([sent2vec(x) for x in tqdm.tqdm(X_test[:200000])])

100%|██████████| 932390/932390 [15:43<00:00, 988.52it/s] 
100%|██████████| 200000/200000 [03:19<00:00, 1004.56it/s]


In [0]:
#free up memory
del X_train
del embeddings_index
del X_test

In [21]:
# Fitting a simple xgboost on glove features
clf = xgb.XGBClassifier(nthread=-1, max_depth=7, n_estimators=200, colsample_bytree=0.8, 
                        subsample=0.8, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, y_train[:932390].values)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=-1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=False, subsample=0.8, verbosity=1)

In [23]:
predictions = clf.predict_proba(xtest_glove)
print ("roc score: %0.3f " % roc_auc_score(y_test[:200000].values, predictions[:,1]))

roc score: 0.927 
