In [None]:
# Mount Google Drive into the Colab filesystem so the dataset archive under
# /content/drive can be read by the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Extract the dataset archive from the mounted Drive into the current working
# directory; it contains train.csv, test.csv and val.csv.
!unzip /content/drive/MyDrive/data.zip

Archive:  /content/drive/MyDrive/data.zip
  inflating: train.csv               
  inflating: __MACOSX/._train.csv    
  inflating: test.csv                
  inflating: __MACOSX/._test.csv     
  inflating: val.csv                 
  inflating: __MACOSX/._val.csv      


# Data Processing 
We convert each answer into the exact matching span of text from the document (extractive QA) and record the start and end indices of that answer span.

In [None]:
!pip install -q transformers 
!pip install -q sentence-transformers

In [None]:
from ast import literal_eval

In [None]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer
from nltk import tokenize
import numpy as np
import pickle
import transformers
import torch
# Shared NLP objects used by the feature-engineering helpers below.
BERT_CHECKPOINT = "bert-base-uncased"
SENTENCE_ENCODER_CHECKPOINT = 'multi-qa-MiniLM-L6-cos-v1'

# Sub-word tokenizer, used for token counts.
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
# Sparse lexical vectorizer; it is fitted in a later cell before being used.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# Dense sentence-embedding model.
sentence_t = SentenceTransformer(SENTENCE_ENCODER_CHECKPOINT)

In [None]:
# Download the Punkt sentence-tokenizer data; nltk.tokenize.sent_tokenize
# (used when building the sentence-level features) depends on it.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def add_features(df):
  """Append question/review similarity features to a copy of ``df``.

  Relies on the notebook-level ``tokenizer``, ``tfidf_vectorizer`` (which must
  already be fitted), ``sentence_t`` and ``nltk`` objects.

  Args:
    df: DataFrame with text columns ``questionText`` and ``review_snippets``.

  Returns:
    A new DataFrame holding the input columns plus, in order:
    ``question_ntokens``, ``review_ntokens``, ``intersec``, ``intersec_pct``,
    ``cosine_sim_*``, ``dot_prod_*`` and ``euclid_dist_*`` for both the
    ``encoded`` and ``tfidf`` representations, then ``sent_max_encoded``,
    ``sent_mean_encoded``, ``sent_max_tfidf`` and ``sent_mean_tfidf``.
    The input frame is not modified.
  """
  #HELPERS
  def token_ids(text):
    # Token ids exactly as the tokenizer returns them, truncated to 512 ids.
    return tokenizer(text,max_length=512,truncation=True).get('input_ids')

  def intersec(q,r):
    # Count the entries of q (duplicates included) that also occur in r.
    # A set makes each membership test O(1).
    r_lookup = set(r)
    return sum(1 for x in q if x in r_lookup)

  def vectorize(text):
    # Dense TF-IDF vector of a single text.
    return tfidf_vectorizer.transform([text]).toarray()[0]

  def sentenceTransform(text):
    # Dense sentence embedding of a single text.
    return sentence_t.encode(text)

  def cosine_similarity(a,b):
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    # An all-zero vector (e.g. a text with no in-vocabulary TF-IDF terms) has
    # no direction: report 0 similarity instead of dividing by zero (NaN).
    if denom == 0:
      return 0.0
    return np.dot(a,b)/denom

  def dotproduct(a,b):
    return np.dot(a,b)

  def euclid_dist(a,b):
    return np.linalg.norm(a-b)

  def sentence_max_mean(model,q,r):
    # Score every sentence of r against the question vector q and return
    # (max, mean) -- the same order as the destination columns below.
    vals = [dotproduct(q,model(sentence)) for sentence in nltk.tokenize.sent_tokenize(r)]
    if not vals:
      # No sentences (e.g. empty review text): there is nothing to aggregate.
      return pd.Series([np.nan, np.nan])
    return pd.Series([np.max(vals), np.mean(vals)])

  df = df.copy()

  # Tokenize each text once and derive the counts from the stored token lists.
  df['question_tokens'] = df['questionText'].apply(token_ids)
  df['review_tokens'] = df['review_snippets'].apply(token_ids)
  df['question_ntokens'] = df['question_tokens'].apply(len)
  df['review_ntokens'] = df['review_tokens'].apply(len)

  # Overlap is measured on token ids, not on the characters of the raw strings,
  # so that intersec_pct is a fraction of the question's tokens. Ids are used
  # as returned by the tokenizer, so any special tokens it adds are counted in
  # both the numerator and the denominator.
  df['intersec'] = df.apply(lambda row: intersec(row['question_tokens'], row['review_tokens']),axis=1)
  df['intersec_pct'] = df['intersec'] / df['question_ntokens']

  df['question_encoded'] = df['questionText'].apply(sentenceTransform)
  df['review_encoded'] = df['review_snippets'].apply(sentenceTransform)
  df['question_tfidf'] = df['questionText'].apply(vectorize)
  df['review_tfidf'] = df['review_snippets'].apply(vectorize)

  # Whole-text similarity between question and review for each representation.
  for m in ['encoded','tfidf']:
    df[f'cosine_sim_{m}'] = df.apply(lambda row: cosine_similarity(row[f'question_{m}'], row[f'review_{m}']),axis =1 )
    df[f'dot_prod_{m}'] = df.apply(lambda row: dotproduct(row[f'question_{m}'], row[f'review_{m}']),axis =1 )
    df[f'euclid_dist_{m}'] = df.apply(lambda row: euclid_dist(row[f'question_{m}'], row[f'review_{m}']),axis =1 )

  # Sentence-level similarity: best and average sentence score per review.
  df[['sent_max_encoded','sent_mean_encoded']] = df.apply(lambda row: sentence_max_mean(sentenceTransform, row['question_encoded'], row['review_snippets']),axis =1 )
  df[['sent_max_tfidf','sent_mean_tfidf']] = df.apply(lambda row: sentence_max_mean(vectorize, row['question_tfidf'], row['review_snippets']),axis =1 )

  # Intermediate token/vector columns are not features; drop them from the result.
  return df.drop(columns=['question_tokens','review_tokens','question_encoded','review_encoded','question_tfidf','review_tfidf'])


In [None]:
# Load the training split, shuffle it and drop rows with missing values.
# A fixed seed keeps the shuffle -- and therefore every subset taken from it --
# identical across "Restart & Run All".
train_df = pd.read_csv('train.csv').sample(frac = 1, random_state = 42).dropna()

In [None]:
# Keep at most 4000 rows per class: first the answerable rows, then the rest.
answerable_mask = train_df['is_answerable'] == 1
answerable_rows = train_df[answerable_mask].iloc[:4000]
unanswerable_rows = train_df[~answerable_mask].iloc[:4000]
train_df = pd.concat([answerable_rows, unanswerable_rows])

In [None]:
# Re-shuffle so the two classes are interleaved instead of stacked one after
# the other; the fixed seed makes the resulting row order reproducible.
train_df = train_df.sample(frac =1, random_state = 42)

In [None]:
# Separate the label column (as an array) from the remaining columns.
label_column = 'is_answerable'
y = train_df[label_column].values
X = train_df.drop(columns=label_column)

In [None]:
# Fit the TF-IDF vectorizer on the questions and the review snippets together.
# NOTE(review): at this point `review_snippets` still holds the raw list-literal
# strings (they are only joined inside feature_generations) -- confirm that
# fitting on the un-joined text is intended.
tfidf_corpus = X['questionText'].tolist() + X['review_snippets'].tolist()
tfidf_vectorizer.fit(tfidf_corpus)

TfidfVectorizer(stop_words='english')

In [None]:
def feature_generations(df):
  """Turn a raw split into a feature frame and a label array.

  Only the ``review_snippets``, ``questionText`` and ``is_answerable`` columns
  of ``df`` are used. Each ``review_snippets`` cell is parsed as a Python list
  literal and its items are joined into one string before the similarity
  features are computed by ``add_features``.

  Returns:
    ``(X, y)``: the feature DataFrame from ``add_features`` and the
    ``is_answerable`` values as an array.
  """
  snippet_separator = '    ' #4 spaces
  selected = df[['review_snippets','questionText','is_answerable']].copy()
  selected['review_snippets'] = [
    snippet_separator.join(literal_eval(raw)) for raw in selected['review_snippets']
  ]
  labels = selected['is_answerable'].values
  features = add_features(selected.drop(columns='is_answerable'))
  return features, labels

In [None]:
# Build the engineered feature frame and the label array for the balanced training subset.
X_train,y_train = feature_generations(train_df)

  return np.dot(a,b)/(np.linalg.norm(a) * np.linalg.norm(b))


In [None]:
# Attach the labels to a *new* frame before exporting. A plain `df = X_train`
# only aliases the feature frame, so adding 'Y' to it would silently put the
# target column into X_train as well.
df = X_train.assign(Y=y_train)
df.to_csv('processed1.csv')

In [None]:
# Trigger a browser download of the exported feature file from the Colab VM.
from google.colab import files
files.download('processed1.csv') 