In [None]:
import pickle
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import itertools
from collections import Counter
import numpy as np
from nltk import word_tokenize
from nltk.corpus import stopwords
from gensim.models import word2vec
from sklearn.linear_model import LogisticRegression
import os
import string

In [None]:
# Display option: with max_columns=None pandas prints every column of a
# DataFrame instead of eliding the middle of wide frames.
pd.set_option('display.max_columns', None)

In [None]:
def build_vocab(sentences):
    """Build a vocabulary ordered by descending token frequency.

    Parameters
    ----------
    sentences : iterable of iterables
        Tokenized sentences; every token must be hashable.

    Returns
    -------
    word_counts : collections.Counter
        Token -> number of occurrences across all sentences.
    vocabulary : dict
        Token -> index, where index 0 is the most frequent token.
    vocabulary_inv : list
        Index -> token (the inverse of ``vocabulary``).
    """
    # Flatten all sentences into one token stream and count it.
    word_counts = Counter(itertools.chain.from_iterable(sentences))
    # most_common() yields (token, count) pairs, most frequent first.
    vocabulary_inv = [token for token, _ in word_counts.most_common()]
    # Pair each token with its rank to obtain the forward mapping.
    vocabulary = dict(zip(vocabulary_inv, itertools.count()))
    return word_counts, vocabulary, vocabulary_inv

In [None]:
def get_embeddings(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=5,
                   num_workers=15,
                   downsampling=1e-3):
    """Learn word embeddings with gensim's Word2Vec and return them as a matrix.

    Parameters
    ----------
    inp_data : iterable of iterables of int
        Sentences encoded as indices into ``vocabulary_inv``.
    vocabulary_inv : list
        Index -> word mapping (as returned by ``build_vocab``).
    size_features : int
        Dimensionality of the embedding vectors.
    mode : {'skipgram', 'cbow'}
        Training algorithm.
    min_word_count : int
        Passed to Word2Vec as ``min_count``.
    context : int
        Passed to Word2Vec as ``window``.
    num_workers : int
        Number of training threads.
    downsampling : float
        Passed to Word2Vec as ``sample`` (downsampling of frequent words).

    Returns
    -------
    numpy.ndarray of shape ``(len(vocabulary_inv), size_features)``
        Row ``i`` is the vector for ``vocabulary_inv[i]``. Words the trained
        model has no vector for get a random vector drawn uniformly from
        [-0.25, 0.25); this draw is unseeded, so those rows differ per run.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'skipgram' nor 'cbow'.
    """
    # Validate the mode before doing any work; previously an unknown mode left
    # `sg` unassigned and failed later with an unhelpful UnboundLocalError.
    sg_by_mode = {'skipgram': (1, 'skip-gram'), 'cbow': (0, 'CBOW')}
    if mode not in sg_by_mode:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, sorted(sg_by_mode)))
    sg, mode_label = sg_by_mode[mode]

    print('Training Word2Vec model...')
    # use inp_data and vocabulary_inv to reconstruct sentences
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    print('Model: {}'.format(mode_label))
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    # NOTE: the model is intentionally not written to disk here; the earlier
    # "Saving Word2Vec model" message was printed without any save happening.
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i, word in enumerate(vocabulary_inv):
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            # No trained vector for this word: fall back to a small random one.
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def preprocess_df(df, stemming=False):
    """Clean the ``text`` column of ``df`` in place and return ``df``.

    For every row the text has its ASCII punctuation replaced by spaces, is
    split on whitespace, optionally stemmed with the module-level Porter
    stemmer, and then stripped of stop words and single-character tokens.
    The remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column. The column is overwritten.
    stemming : bool
        When truthy, each token is passed through ``ps.stem`` before the
        stop-word filter is applied.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``text`` replaced by the cleaned strings.

    Notes
    -----
    Stop-word matching is case-sensitive: tokens are compared as-is against
    the NLTK English list plus 'would' and 'The'.
    """
    # get English stopwords
    stop_words = set(stopwords.words('english'))
    stop_words.update(('would', 'The'))
    # prepare translation table to translate punctuation to space
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

    def _clean(sent):
        # Missing reviews can arrive as NaN or as a numeric fill value (e.g. 0
        # after a fillna); they have no text, so map them to an empty string
        # instead of failing on `.translate`.
        if not isinstance(sent, str):
            return ""
        words_list = sent.translate(translator).split()
        if stemming:
            words_list = [ps.stem(word) for word in words_list]
        # Drop stop words and one-character leftovers of the punctuation removal.
        return " ".join(word for word in words_list
                        if word not in stop_words and len(word) != 1)

    # Single pass over the column (avoids the per-row overhead of iterrows).
    df["text"] = [_clean(sent) for sent in df["text"]]
    return df

In [26]:
# Load the raw train/test splits; the paths are relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Number of missing values per column of the training frame.
nan_counts = train.isna().sum()
# Names of the columns with more than 10000 missing training values.
# preprocess() drops these columns from whichever frame it is given, so the
# list is derived from train only and then reused for test.
cols_drop = nan_counts[nan_counts > 10000].index.tolist()

In [27]:
import numpy as np
import re
import ast

def parse_dict_string(x):
    """Parse a stringified Python literal (normally a dict) from a raw cell.

    The raw values are expected to carry a two-character prefix and a
    one-character suffix around the literal (e.g. ``b"{'casual': True}"``),
    which are sliced off before evaluation.

    Parameters
    ----------
    x : str, dict or missing value
        Raw cell content.

    Returns
    -------
    object
        ``{}`` for a missing value; ``x`` itself if it is already a dict;
        otherwise the result of ``ast.literal_eval`` on the unwrapped text
        (this may be ``None`` when the text is the literal ``None``).

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` for malformed text.
    """
    # Already parsed (e.g. the cell was re-run on a converted column).
    if isinstance(x, dict):
        return x
    if pd.isna(x):
        return {}
    stripped = x.strip()
    # A bare literal has no wrapper to remove; slicing it blindly would corrupt it.
    if (stripped.startswith('{') and stripped.endswith('}')) or stripped == 'None':
        return ast.literal_eval(stripped)
    # Wrapped form: drop the 2-char prefix (b' or b") and the closing quote.
    return ast.literal_eval(x[2:-1])

def expand_ambience_features(x, key):
    """Turn one entry of a parsed attribute dict into a binary flag.

    Parameters
    ----------
    x : dict or missing value
        Parsed attribute mapping for one row.
    key : hashable
        Entry to look up.

    Returns
    -------
    int or float
        ``np.nan`` when ``x`` is missing; otherwise 1 if ``key`` is present
        in ``x`` with a truthy value, else 0.
    """
    # Guard clause: a missing mapping stays missing.
    if pd.isna(x):
        return np.nan
    flag_is_set = key in x and x[key]
    return int(bool(flag_is_set))
    

def encode_dict(df, col):
    """Expand a column of stringified dicts into one binary column per key.

    The column ``col`` is parsed with ``parse_dict_string``; every key found
    in any row becomes a new column (named after the key) holding the value
    produced by ``expand_ambience_features`` for that row. ``col`` itself is
    then removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is changed in place and also returned.
    col : str
        Name of the column holding the stringified dicts.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``col`` replaced by the per-key columns.
    """
    df[col] = df[col].apply(parse_dict_string)

    # Extract keys from dictionaries
    keys = set()
    for parsed in df[col]:
        # Parsed values that are missing (e.g. a literal None) carry no keys.
        if not pd.isna(parsed):
            keys.update(parsed.keys())

    # Expand dictionary features into binary features.
    # Iterate in sorted order: set iteration order for strings is not stable
    # across interpreter runs, which made the resulting column order (and
    # anything derived from it) non-reproducible and potentially different
    # between separately processed frames.
    for key in sorted(keys, key=str):
        df[key] = df[col].apply(lambda x: expand_ambience_features(x, key))
    df.drop(columns=[col], inplace=True)
    return df

In [28]:
# Columns whose cells hold stringified dicts; preprocess() passes each one to
# encode_dict() to expand it into per-key binary columns.
dict_cols = ['attributes.Ambience', 'attributes.BusinessParking', 'attributes.GoodForMeal']

# Attributes with regex
# NOTE(review): `attr` is not referenced in the visible cells. Presumably it
# lists the scalar columns cleaned by the b'...' regex in preprocess(), but
# preprocess() loops over all columns instead — TODO confirm or remove.
attr = ['hours.Tuesday', 'postal_code', 'attributes.OutdoorSeating', 'hours.Saturday', 'name', 'attributes.BusinessAcceptsCreditCards',
        'attributes.RestaurantsReservations', 'hours.Friday', 'attributes.RestaurantsPriceRange2', 'attributes.WiFi', 'attributes.NoiseLevel',
        'state', 'attributes.Alcohol', 'attributes.HasTV', 'hours.Wednesday', 'hours.Sunday', 'attributes.RestaurantsGoodForGroups',
        'attributes.Caters', 'attributes.BikeParking', 'hours.Monday', 'city', 'attributes.RestaurantsTakeOut', 'hours.Thursday', 'attributes.RestaurantsAttire',
        'attributes.RestaurantsDelivery', 'attributes.GoodForKids']

In [29]:
def preprocess(df):
    """Clean a raw business/review frame into a mostly numeric feature frame.

    Steps, in order:

    1. Drop the sparse columns listed in the module-level ``cols_drop``.
    2. Expand every column in the module-level ``dict_cols`` with
       ``encode_dict``.
    3. Drop ``business_id`` and strip the ``attributes.`` prefix from names.
    4. For every column other than the pass-through ones, unwrap the
       ``b'...'`` text and map ``'True'``/``'False'`` to 1/0. Columns without
       string values are left untouched.
    5. Unwrap ``name`` and ``postal_code``; replace ``postal_code`` by 1 when
       the code starts with digits and 0 otherwise (including missing).
    6. Replace all remaining missing values with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Notes
    -----
    NOTE(review): in step 4 any unwrapped value other than 'True'/'False'
    maps to NaN and therefore ends up as 0 after step 6, so free-text or
    categorical columns handled there lose their content — confirm this is
    intended.
    """
    # Non-mutating drop: the caller's frame stays intact, and the in-place
    # edits made by encode_dict below only touch this new frame.
    df = df.drop(columns=cols_drop)
    for col in dict_cols:
        df = encode_dict(df, col)
    df = df.drop(columns=['business_id'])

    df = df.rename(columns={column: column.replace('attributes.', '')
                            for column in df.columns})

    bool_map = {'False': 0, 'True': 1}
    # Matches codes that begin with one or more whitespace-separated digit groups.
    digits_pattern = r'\b\d+\b(?:\s+\b\d+\b)*\s*'
    # Captures the text between b' and the next single quote.
    wrapped_pattern = r"b'(.*?)'"
    passthrough = {'text', 'label', 'name', 'latitude', 'longitude',
                   'postal_code', 'is_open', 'review'}

    #clean b'
    for col in df:
        if col not in passthrough:
            try:
                unwrapped = df[col].str.extract(wrapped_pattern, expand=False)
            except AttributeError:
                # Column holds no string values (.str is unavailable): leave it
                # as is. Only this expected failure is ignored; anything else
                # surfaces instead of being silently swallowed.
                continue
            df[col] = unwrapped.map(bool_map)
        elif col in ('name', 'postal_code'):
            df[col] = df[col].str.extract(wrapped_pattern, expand=False)
            ## 0 if Canada postal code 1 if US
            if col == 'postal_code':
                # na=False keeps the column strictly boolean when a code is
                # missing, and the cast yields an explicit 0/1 integer flag.
                df[col] = df[col].str.match(digits_pattern, na=False).astype(int)
    df = df.fillna(0)
    return df
    
# Clean both splits with the same pipeline and rebind the names to the results.
# NOTE(review): because `train`/`test` are overwritten, re-running this cell
# without reloading the CSVs calls preprocess() on already-cleaned frames,
# whose dropped columns no longer exist.
train = preprocess(train)
test = preprocess(test)

#### Train-test split

In [30]:
from sklearn.model_selection import train_test_split
from math import log
from gensim.models import TfidfModel
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Target vector: the 'label' column of the cleaned training frame.
y = train['label']

# Feature frame: remove the target and the two excluded columns, then expose
# the raw review under 'text', the column name preprocess_df() operates on.
X = (
    train
    .drop(columns=['label', 'name', 'RestaurantsPriceRange2'])
    .assign(text=lambda frame: frame["review"])
)

# Clean the text, then discard the now-redundant raw review column.
X = preprocess_df(X, stemming=False).drop(columns=['review'])

In [31]:
from sklearn.preprocessing import LabelEncoder

# Map the target classes to consecutive integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 20% of the rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Tabular-only views for XGBoost: same rows, without the free-text column.
X_train_xgb = X_train.drop(columns=['text'])
X_val_xgb = X_val.drop(columns=['text'])

In [36]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Baseline: fit an XGBoost classifier (library defaults) on all tabular
# features and report its accuracy on the validation split.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_train_xgb, y_train)
y_pred = xgb_classifier.predict(X_val_xgb)
accuracy = accuracy_score(y_val, y_pred)
print("Accuracy without selected features:", accuracy)

# Per-feature importance scores from the fitted baseline model
feature_importances = xgb_classifier.feature_importances_

# Pair every feature name with its importance score
importance_df = pd.DataFrame({'Feature': X_train_xgb.columns, 'Importance': feature_importances})

# Rank features from most to least important
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Maps "number of top features kept" -> validation ROC AUC (one-vs-rest).
n_feature = {}

# Sweep the size of the feature subset: 20, 22, ..., 48.
for num in range(20,50,2):
    top_features = importance_df.head(num)['Feature'].tolist()

    # Restrict both splits to the top-ranked features.
    # Despite its name, X_test_selected is a slice of the validation split.
    X_train_xgb_selected = X_train_xgb[top_features]
    X_test_selected = X_val_xgb[top_features]

    # Train a fresh XGBoost classifier on the reduced feature set
    xgb_classifier_selected = xgb.XGBClassifier()
    xgb_classifier_selected.fit(X_train_xgb_selected, y_train)

    # Evaluate on the validation split.
    # NOTE(review): y_pred/accuracy are recomputed here but never stored; only
    # the ROC AUC goes into n_feature. After the loop they hold the values of
    # the last iteration, not of the baseline model.
    y_pred = xgb_classifier_selected.predict(X_test_selected)
    accuracy = accuracy_score(y_val, y_pred)
    n_feature[num] = roc_auc_score(y_val, xgb_classifier_selected.predict_proba(X_test_selected), multi_class='ovr')

# Subset size with the highest validation ROC AUC (first one wins on ties).
# NOTE(review): the size is selected on the same validation split that is used
# for reporting, so the resulting score is optimistically biased.
best_num = max(n_feature, key=lambda k: n_feature[k]) 
best_num

Accuracy without selected features: 0.3921643210346139


32

Best number of features: 32

### NLP Review

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer for the cleaned review text.
tfidf = TfidfVectorizer(
    # --- text normalisation and tokenisation ---
    preprocessor=None,  # applied preprocessor in Data Cleaning
    strip_accents=None,
    lowercase=True,
    tokenizer=word_tokenize,
    # --- vocabulary pruning ---
    stop_words='english',
    max_df=0.4,
    # --- term weighting ---
    sublinear_tf=True,
    use_idf=True,
    smooth_idf=True,
    norm='l2',
)