# Create the features DF
* using by_postal_code dataset

# Import

In [None]:
import pandas as pd
import json
import numpy as np
from collections import Counter
import pickle
import math 
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Fixed seed constant, presumably intended for reproducible splits/models.
# NOTE(review): it is not referenced in the cells visible here — confirm it is used downstream.
RANDOM_STATE = 24

In [None]:
# Load data
# Train/test frames; the index is reset so both start from a clean 0..n-1 range.
train_df = pd.read_pickle('../data/Train_by_postoal_code_without_review_pointwise_v3_3.pkl').reset_index(drop=True)
test_df = pd.read_pickle('../data/Test_by_postoal_code_without_review_pointwise_v3_3.pkl').reset_index(drop=True)

# Read the raw pickles through context managers so the file handles are closed
# deterministically instead of being left open until garbage collection.
with open("../data/postal_dict.pkl", "rb") as postal_file:
    postal_code_feature_dict = pickle.load(postal_file)
with open("../data/LDA_training_docs.pkl", "rb") as docs_file:
    docs = pickle.load(docs_file)

# NOTE(review): this path uses '../Data' (capital D) while the others use
# '../data' — confirm both resolve on a case-sensitive filesystem.
all_df = pd.read_pickle('../Data/restaurant_only.pkl')

In [None]:
# Sanity check: print the shape of train_df, test_df and all_df on one line.
print(*(frame.shape for frame in (train_df, test_df, all_df)))

# LDA Model

In [None]:
import spacy
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
from gensim import matutils
#python -m spacy download en_core_web_md 

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

In [None]:
# Part-of-speech tags whose tokens preprocess() keeps; all other tags are dropped.
keep_pos = ['NOUN', 'ADJ', 'ADV', 'VERB']
# removal= ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE', 'NUM', 'SYM']

# spaCy pipeline used by preprocess() for tagging and lemmatisation.
nlp = spacy.load("en_core_web_md")

def preprocess(text : str) -> list:
    """Turn raw text into a list of lower-cased lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept only if its part-of-speech tag is listed in ``keep_pos``, it is
    not a stop word, and it is purely alphabetic.
    """
    # One input text yields exactly one parsed document.
    (parsed,) = nlp.pipe([text])

    lemmas = []
    for tok in parsed:
        if tok.pos_ not in keep_pos:
            continue
        if tok.is_stop or not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower())

    return lemmas

In [None]:
# # Uncomment if necessary
# def get_training_docs(all_df):
    
#     docs = []
    
#     for name in Counter(all_df.name):
#         res = all_df[all_df.name == name]
        
#         docs.append(preprocess(''.join(res.text.values.tolist())[:5000]))

#     return docs

# docs = get_training_docs(all_df)
# with open("../data/LDA_training_docs.pkl","wb") as file:
#     # Use pickle.dump() to serialize and save the object to the file
#     pickle.dump(docs, file)

In [None]:
# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out tokens that occur in fewer than 50 documents or in more than 50%
# of the documents, then cap the vocabulary at 1000 tokens (keep_n).
dictionary.filter_extremes(no_below=50, no_above=0.5,keep_n=1000)

# Train LDA
# #### doc2Bow
# corpus = [dictionary.doc2bow(doc) for doc in docs]


In [None]:
# print('Number of unique tokens: %d' % len(dictionary))
# print('Number of documents: %d' % len(corpus))

In [None]:
# # Train LDA
# # # building models
# lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, \
#                          num_topics=50, workers = 4, passes=10)

# with open('../data/lda_model.pkl', 'wb') as file:
#     # Use pickle.dump() to serialize and save the object to the file
#     pickle.dump(lda_model, file)

In [None]:
# Load the previously trained LDA model; the context manager closes the file
# handle as soon as the model has been read.
with open('../data/lda_model.pkl', 'rb') as model_file:
    lda_model = pickle.load(model_file)
# Last expression of the cell, so the notebook displays the topics (-1 = all topics).
lda_model.print_topics(-1)

# Review Feature Engineering

In [None]:
import nltk
from nltk.corpus import opinion_lexicon

# Download NLTK's opinion lexicon and collect its words into a set for fast lookups.
# NOTE(review): despite the variable name, `opinion_lexicon` is the Hu & Liu
# opinion lexicon, not the General Inquirer lexicon.
nltk.download('opinion_lexicon')
general_inquirer_words = set(opinion_lexicon.words())

# Check whether a string shares at least one word with the lexicon set
def contains_general_inquirer(text):
    """Return True if any lower-cased, whitespace-separated token of *text*
    is a member of ``general_inquirer_words``; otherwise return False."""
    tokens = set(text.lower().split())
    return not tokens.isdisjoint(general_inquirer_words)

In [None]:
# # Uncomment if necessary
# cnt = 0
# for idx , row in all_df.iterrows():
#     # if cnt%10000 == 0:
#     #     print(f'Now progress .... {cnt}')
#     if contains_general_inquirer(row.text):
#         pass
#     else:
#         cnt+=1
#         row.text = ''
# all_df.to_pickle('../Data/restaurant_only_filtered.pkl')

## Get Review Features

In [None]:
def get_LDA_aspects(df , all_df, max_chars=3000):
    """Add LDA topic distributions for each row's restaurant and postal code.

    For every row of ``df``, the reviews in ``all_df`` that share the row's
    ``business_id`` (restaurant) and, separately, the row's ``postal_code``
    (location) are sorted newest first, concatenated, truncated to
    ``max_chars`` characters, passed through ``preprocess`` and
    ``dictionary.doc2bow``, and scored with ``lda_model``.

    Args:
        df: Frame with ``business_id`` and ``postal_code`` columns. It is
            modified in place: the columns ``LDA_res`` and ``LDA_loc`` are
            added (or overwritten).
        all_df: Review frame with ``business_id``, ``postal_code``, ``date``
            and ``text`` columns.
        max_chars: Number of leading characters of the concatenated review
            text that is fed to the model.

    Returns:
        ``df`` itself. Each new cell holds whatever ``lda_model[...][0]``
        returns for the document, or ``[]`` when scoring that row failed.
    """
    # Results are collected per row and written as whole columns at the end.
    # This avoids chained assignment (``df[col][idx] = ...``), which pandas
    # warns about and which does not update ``df`` under copy-on-write.
    res_topics = []
    loc_topics = []

    for idx, row in df.iterrows():
        if idx % 1000 == 0:
            print(f'Now progress... {idx}')
        res = all_df[all_df.business_id == row.business_id].sort_values(by='date', ascending=False)
        loc = all_df[all_df.postal_code == row.postal_code].sort_values(by='date', ascending=False)

        try:
            res_str = ''.join(list(res.text))[:max_chars]
            # The location document must be built from the postal-code
            # reviews; it was previously built from the restaurant reviews,
            # which made LDA_loc a copy of LDA_res.
            loc_str = ''.join(list(loc.text))[:max_chars]

            corpus_res = [dictionary.doc2bow(preprocess(res_str))]
            corpus_loc = [dictionary.doc2bow(preprocess(loc_str))]

            res_topic = lda_model[corpus_res][0]
            loc_topic = lda_model[corpus_loc][0]
        except Exception as exc:
            # Best effort: keep going, but report the failure instead of
            # hiding it. Both cells fall back to an empty topic list.
            print(f'LDA scoring failed for row {idx}: {exc!r}')
            res_topic, loc_topic = [], []

        res_topics.append(res_topic)
        loc_topics.append(loc_topic)

    df['LDA_res'] = res_topics
    df['LDA_loc'] = loc_topics

    return df


In [None]:
# Add the LDA_res / LDA_loc columns to both splits. get_LDA_aspects mutates the
# frame it receives and returns it; progress is printed every 1000 rows.
train_df = get_LDA_aspects(train_df , all_df)
test_df = get_LDA_aspects(test_df , all_df)

In [None]:
# Checkpoint: write both frames to pickle files (different names from the input files).
train_df.to_pickle('../data/Train_by_postoal_code_pointwise_v3_3.pkl')
test_df.to_pickle('../data/Test_by_postoal_code_pointwise_v3_3.pkl')

In [None]:
# Reload the checkpoint files; presumably this lets the notebook resume here
# without recomputing the LDA columns.
train_df = pd.read_pickle('../data/Train_by_postoal_code_pointwise_v3_3.pkl')
test_df = pd.read_pickle('../data/Test_by_postoal_code_pointwise_v3_3.pkl')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer and model.
# output_hidden_states=True makes the forward pass expose the hidden states
# that get_bert_embedding reads via outputs.hidden_states.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased',output_hidden_states = True)
# Put the model in evaluation mode before running inference.
model.eval()


def _mean_cls_embedding(reviews):
    """Return the mean first-token vector over all review texts in ``reviews``.

    Each text is tokenized (truncated to 128 tokens) and run through the
    module-level ``model``; the first-token vector of the last hidden layer
    is collected and the vectors are averaged. A float zero vector of length
    ``model.config.hidden_size`` is returned when there are no reviews or
    when encoding any review fails.
    """
    # Same shape and dtype as a real embedding so downstream code can stack
    # every cell; the previous fallback was int32 with shape [1, 768].
    fallback = torch.zeros(model.config.hidden_size)

    cls_vectors = []
    try:
        for text in reviews.text:
            encoded_input = tokenizer(text, max_length=128, padding=True,
                                      truncation=True, return_tensors='pt')
            with torch.no_grad():
                outputs = model(**encoded_input)
            # Last hidden layer, first (only) sequence in the batch, first token.
            cls_vectors.append(outputs.hidden_states[-1][0, 0, :])
    except Exception as exc:
        # Best effort: report the failure and fall back to zeros.
        print(f'BERT embedding failed: {exc!r}')
        return fallback

    if not cls_vectors:
        return fallback
    return torch.mean(torch.stack(cls_vectors), dim=0)


def get_bert_embedding(df , all_df):
    """Add averaged BERT embeddings for each row's restaurant and postal code.

    For every row of ``df``, the reviews in ``all_df`` with the same
    ``business_id`` (restaurant) and, separately, the same ``postal_code``
    (location) are embedded and averaged.

    Args:
        df: Frame with ``business_id`` and ``postal_code`` columns. It is
            modified in place: the columns ``res_emb`` and ``loc_emb`` are
            added (or overwritten).
        all_df: Review frame with ``business_id``, ``postal_code``, ``date``
            and ``text`` columns.

    Returns:
        ``df`` itself. Each new cell holds a 1-D float tensor; it is all
        zeros when no embedding could be computed for that row.
    """
    # Collect per-row results and assign whole columns at the end instead of
    # using chained assignment (``df[col][idx] = ...``).
    res_embeddings = []
    loc_embeddings = []

    for idx, row in df.iterrows():
        if idx % 1000 == 0:
            print(f'Now progress... {idx}')
        res = all_df[all_df.business_id == row.business_id].sort_values(by='date', ascending=False)
        loc = all_df[all_df.postal_code == row.postal_code].sort_values(by='date', ascending=False)

        res_embeddings.append(_mean_cls_embedding(res))
        loc_embeddings.append(_mean_cls_embedding(loc))

    df['res_emb'] = res_embeddings
    df['loc_emb'] = loc_embeddings

    return df

In [None]:
# Add the res_emb / loc_emb columns to both splits. get_bert_embedding mutates
# the frame it receives and returns it; progress is printed every 1000 rows.
train_df = get_bert_embedding(train_df , all_df)
test_df = get_bert_embedding(test_df , all_df)

In [None]:
# Last expression of the cell: the notebook displays both shapes as a tuple.
train_df.shape , test_df.shape

In [None]:
# Write both frames to the pointwise_v3_3 pickle files (same paths as the earlier checkpoint).
train_df.to_pickle('../data/Train_by_postoal_code_pointwise_v3_3.pkl')
test_df.to_pickle('../data/Test_by_postoal_code_pointwise_v3_3.pkl')