### Exploratory Data Analysis 
This notebook contains code that explores the [Amazon Reviews Dataset](https://jmcauley.ucsd.edu/data/amazon_v2/index.html).


Planned uses of the data:

- **Pinecone:** product descriptions + reviews
- **Fine-tuning:** question answering, with reviews and descriptions as the context

In [None]:
!pip install -U sentence-transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import pandas as pd 
import gzip
import json
import requests
from io import BytesIO, StringIO
import urllib.request
import numpy as np
import ssl
# SECURITY NOTE: this swaps the default HTTPS context factory for the unverified one, so
# certificate verification is turned off for HTTPS connections that rely on the default
# context (e.g. the urllib downloads in this notebook).
# NOTE(review): presumably a workaround for a certificate error on one of the data hosts —
# confirm it is still needed before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
# Download the train and validation JSONL files from the amazon-qa S3 bucket.
# The validation file is stored locally under the name "var-qar.jsonl".
for source_url, local_path in (
    ("https://amazon-qa.s3-us-west-2.amazonaws.com/train-qar.jsonl", "train-qar.jsonl"),
    ("https://amazon-qa.s3-us-west-2.amazonaws.com/val-qar.jsonl", "var-qar.jsonl"),
):
    _ = urllib.request.urlretrieve(source_url, local_path)

In [2]:
def get_data(url,N,downloaded):
  """Load the first ``N`` records of a gzipped JSON-lines file into a DataFrame.

  Local file names are derived from the last path component of ``url``: the
  archive is saved under that name and decompressed to the same name with
  ``.gz`` removed.

  Args:
    url: Address of the ``.json.gz`` file.
    N: Maximum number of rows to return.
    downloaded: True if the decompressed file is expected to be in the working
      directory already. If it is missing it is fetched anyway, so a fresh
      kernel does not fail with a missing-file error.

  Returns:
    DataFrame with at most ``N`` rows in file order (an empty DataFrame if the
    file contains no records).
  """
  import os
  import shutil

  im_path = url.split('/')[-1]
  final_path = im_path.replace('.gz','')
  if not downloaded or not os.path.exists(final_path):
    urllib.request.urlretrieve(url, im_path)
    with gzip.open(im_path, 'rb') as infile, open(final_path, 'wb') as outfile:
      shutil.copyfileobj(infile, outfile)

  chunks = []
  remaining = N
  # The reader is used as a context manager so the file handle is closed even
  # though the loop normally stops before the end of the file.
  with pd.read_json(final_path, chunksize=1000, lines=True) as reader:
    for chunk in reader:
      # Take only what is still missing, so the total never exceeds N.
      part = chunk.head(remaining)
      chunks.append(part)
      remaining -= len(part)
      if remaining <= 0:
        break
  if not chunks:
    return pd.DataFrame()
  # Single concat instead of growing a frame chunk by chunk
  # (DataFrame.append no longer exists in pandas 2.x).
  return pd.concat(chunks)

In [4]:
# Toys & Games product metadata: only the first 500000 rows are loaded.
# downloaded=True: the decompressed file is expected to be in the working directory already.
desc = get_data(
    url='https://jmcauley.ucsd.edu/data/amazon_v2/metaFiles2/meta_Toys_and_Games.json.gz',
    N=500000,
    downloaded=True,
)

In [5]:
import re

def _clean_description(item):
  """Flatten one raw description entry into plain text without HTML tags.

  Args:
    item: A list/tuple of text fragments, an already-flattened string, or a
      missing value.

  Returns:
    The fragments joined by single spaces with every ``<...>`` tag removed and
    whitespace collapsed; an empty string for missing values.
  """
  if isinstance(item, str):
    text = item
  elif isinstance(item, (list, tuple)):
    text = ' '.join(str(part) for part in item)
  else:
    return ''  # missing value (e.g. NaN)
  # Remove only the tags themselves; the text that follows a tag is kept.
  text = re.sub(r'<[^>]*>', ' ', text)
  return ' '.join(text.split())

desc['description'] = desc['description'].apply(_clean_description)

In [6]:
# One row per product. `asin` is the key of the left-merge performed later in this
# notebook, so it has to be unique here; otherwise every question belonging to a
# repeated asin would be duplicated by that merge.
df_desc = desc[['title','asin', 'description']].drop_duplicates(subset='asin', keep='first')

In [7]:
# Distinct product ids present in the metadata, in order of first appearance.
df_ids = desc['asin'].drop_duplicates().tolist()

In [10]:
def parse_jsonl(path, df_ids, N=2000):
  """Collect the records of a JSON-lines file whose ``asin`` is in ``df_ids``.

  Args:
    path: Path of the JSON-lines file (one JSON object per line).
    df_ids: Iterable of asin values to keep.
    N: Maximum number of matching records to collect; reading stops as soon as
      that many have been found.

  Returns:
    List of single-row DataFrames (``pd.json_normalize`` of each kept record),
    in file order. Lines that are not valid JSON objects are skipped.
  """
  wanted = set(df_ids)  # constant-time membership instead of scanning a list per line
  dfs = []
  with open(path) as f:
    for line in f:
      if len(dfs) >= N:
        break
      try:
        record = json.loads(line)
      except json.JSONDecodeError:
        continue  # blank or malformed line
      if not isinstance(record, dict):
        continue
      try:
        keep = record.get('asin') in wanted
      except TypeError:
        keep = False  # an unhashable asin value can never match
      if keep:
        dfs.append(pd.json_normalize(record))
  return dfs


In [None]:
# Keep, per split, only the records whose product appears in the metadata.
# NOTE(review): 'test-qar_all.jsonl' is not fetched by the download cell of this notebook —
# make sure it is present in the working directory.
train, var, test = (
    parse_jsonl(split_path, df_ids)
    for split_path in ('train-qar.jsonl', 'var-qar.jsonl', 'test-qar_all.jsonl')
)

In [None]:
# Stack the per-record frames of all three splits, then attach title/description by asin.
master = pd.merge(pd.concat([*train, *var, *test]), df_desc, on='asin', how='left')

In [None]:
# Answerable questions only, one row per answer; keep just the answer text.
df = master.loc[master['is_answerable'].eq(1)].explode('answers')
df['answers'] = df['answers'].map(lambda answer: answer.get('answerText'))

In [None]:
# Classifier inputs (review text + question) and target (is_answerable).
model_data = master.loc[:, ['review_snippets','questionText','is_answerable']].copy()
# Collapse each list of snippets into one string; the separator is 4 spaces.
model_data['review_snippets'] = model_data['review_snippets'].map('    '.join)

In [None]:
# Class balance of the target: number of rows per `is_answerable` value.
model_data['is_answerable'].value_counts()

In [None]:
# Target as a NumPy array; everything else stays in the feature frame.
y = model_data['is_answerable'].to_numpy()
X = model_data.drop(columns='is_answerable')

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 shuffled split, stratified on the target so both parts keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from nltk import tokenize
import pickle

In [None]:
# Objects used by the feature-engineering step:
# sentence embedding model, TF-IDF vectorizer (English stop words removed), BERT tokenizer.
sentence_t = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Fit the TF-IDF vocabulary on the training questions followed by the training reviews.
tfidf_vectorizer.fit(pd.concat([X_train['questionText'], X_train['review_snippets']]).tolist())

In [None]:
def add_features(df):
  """Return a copy of ``df`` extended with question/review similarity features.

  Uses the notebook-level objects ``tokenizer``, ``tfidf_vectorizer``,
  ``sentence_t`` and the ``tokenize`` module imported from ``nltk``.

  Args:
    df: DataFrame with text columns ``questionText`` and ``review_snippets``.

  Returns:
    New DataFrame with the input columns plus, in this order:
      * ``question_ntokens`` / ``review_ntokens``: number of tokenizer ids per
        text (texts are truncated to 512 ids),
      * ``intersec`` / ``intersec_pct``: number, and share relative to
        ``question_ntokens``, of question token ids that also occur in the review,
      * ``cosine_sim_*`` / ``dot_prod_*`` / ``euclid_dist_*``: whole-text
        similarity for the sentence-embedding (``encoded``) and TF-IDF
        (``tfidf``) representations,
      * ``sent_max_*`` / ``sent_mean_*``: max and mean dot product between the
        question vector and each sentence of the review (NaN when the review
        has no sentence).
  """
  #HELPERS
  def intersec(q,r):
    # Count question tokens (with repetition) that occur anywhere in the review.
    review_vocab = set(r)
    return sum(1 for x in q if x in review_vocab)

  def vectorize(text):
    return tfidf_vectorizer.transform([text]).toarray()[0]

  # Deliberately NOT called `tokenize`: that name must keep referring to the
  # nltk.tokenize module used by split_sentences below.
  # NOTE(review): the ids presumably include the tokenizer's special tokens — confirm.
  def token_ids(text):
    return tokenizer(text,max_length=512,truncation=True).get('input_ids')

  def sentenceTransform(text):
    return sentence_t.encode(text)

  def split_sentences(text):
    try:
      return tokenize.sent_tokenize(text)
    except LookupError:
      # Sentence-splitter data not installed yet: fetch it once and retry.
      import nltk
      nltk.download('punkt', quiet=True)
      nltk.download('punkt_tab', quiet=True)
      return tokenize.sent_tokenize(text)

  def cosine_similarity(a,b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
      # An all-zero vector (e.g. a text made only of stop words) has no direction.
      return np.nan
    return np.dot(a,b)/denom

  def dotproduct(a,b):
    return np.dot(a,b)

  def euclid_dist(a,b):
    return np.linalg.norm(a-b)

  def sentence_max_mean(model,q,r):
    # (max, mean) of the dot product between q and every sentence of r.
    vals = [dotproduct(q,model(sentence)) for sentence in split_sentences(r)]
    if not vals:
      return (np.nan, np.nan)
    return (np.max(vals), np.mean(vals))

  df = df.copy()
  questions = df['questionText'].tolist()
  reviews = df['review_snippets'].tolist()

  # Tokenize each text once and reuse the ids for the counts and the overlap.
  question_tokens = [token_ids(text) for text in questions]
  review_tokens = [token_ids(text) for text in reviews]
  df['question_ntokens'] = [len(ids) for ids in question_tokens]
  df['review_ntokens'] = [len(ids) for ids in review_tokens]
  # The overlap is computed on token ids so that it is in the same unit as question_ntokens.
  df['intersec'] = [intersec(q, r) for q, r in zip(question_tokens, review_tokens)]
  df['intersec_pct'] = df['intersec'] / df['question_ntokens']

  # Intermediate vectors are kept in plain lists, not DataFrame columns,
  # so nothing has to be dropped at the end.
  representations = {'encoded': sentenceTransform, 'tfidf': vectorize}
  question_vecs = {m: [model(text) for text in questions] for m, model in representations.items()}
  review_vecs = {m: [model(text) for text in reviews] for m, model in representations.items()}

  for m in representations:
    pairs = list(zip(question_vecs[m], review_vecs[m]))
    df[f'cosine_sim_{m}'] = [cosine_similarity(a, b) for a, b in pairs]
    df[f'dot_prod_{m}'] = [dotproduct(a, b) for a, b in pairs]
    df[f'euclid_dist_{m}'] = [euclid_dist(a, b) for a, b in pairs]

  for m, model in representations.items():
    stats = [sentence_max_mean(model, q, r) for q, r in zip(question_vecs[m], reviews)]
    df[f'sent_max_{m}'] = [s[0] for s in stats]
    df[f'sent_mean_{m}'] = [s[1] for s in stats]

  return df


In [None]:
# Replace the raw training text frame with its engineered-feature version.
# NOTE: X_train is rebound here and its text columns are dropped in the next cell, so this
# cell cannot be re-run afterwards without re-creating the train/test split first.
X_train = add_features(X_train)

In [None]:
# The classifier only takes the numeric features, so the raw text columns are removed.
X_train = X_train.drop(columns=['review_snippets','questionText'])

In [None]:
# Same feature pipeline for the held-out part: engineer the features, then remove the raw text.
X_test = add_features(X_test).drop(columns=['review_snippets','questionText'])

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss, roc_auc_score, precision_score

In [None]:
# scale_pos_weight: ratio of (total - sum of labels) to the sum of labels over the full target array.
weight = (y.size - y.sum()) / y.sum()
clf = XGBClassifier(
    booster='dart',
    scale_pos_weight=weight,
    eval_metric='logloss',
    n_jobs=-1,
    random_state=42,
)
# The held-out part is passed as eval_set, so its log-loss is tracked during fitting.
clf = clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=1000)

In [None]:
# Hard class labels for precision; positive-class probabilities for the
# ranking (ROC-AUC) and probabilistic (log-loss) metrics.
preds = clf.predict(X_test)
probs = clf.predict_proba(X_test)[:, 1]
# scikit-learn metrics take the ground truth first and the prediction second.
print('precision:', precision_score(y_test, preds))
print('roc-auc:', roc_auc_score(y_test, probs))
print('loss:', log_loss(y_test, probs))

In [None]:
# Serialize the fitted classifier to disk ...
with open('classifer.pkl', 'wb') as model_file:
    model_file.write(pickle.dumps(clf))

# ... then hand the file to the browser (Colab only).
from google.colab import files
files.download('classifer.pkl')

In [None]:
# master['pct'] = master.apply(lambda row: len(list(set(row['description'].split(' ')) & set(row['answers'].split(' ')))) / len(set(row['answers'].split(' '))), axis = 1)
# master[master['pct'] > 0.2] #Get all records where the at least 20% of the answer is inside the product description

In [None]:
# Send 'data.csv' from the Colab working directory to the browser.
# NOTE(review): no visible cell of this notebook writes 'data.csv' — confirm which frame
# is meant to be exported and save it before this cell runs.
from google.colab import files
files.download('data.csv') 