In [1]:
import itertools
import os
import pickle
import string
from collections import Counter

import numpy as np
import pandas as pd
from gensim.models import word2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [2]:
import warnings

warnings.filterwarnings("ignore")

In [3]:
def build_vocab(sentences):
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [4]:
def get_embeddings(
    inp_data,
    vocabulary_inv,
    size_features=100,
    mode="skipgram",
    min_word_count=4,
    context=5,
):
    model_name = "embedding"
    model_name = os.path.join(model_name)
    num_workers = 25  # Number of threads to run in parallel
    downsampling = 1e-3  # Downsample setting for frequent words
    print("Training Word2Vec model...")
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    if mode == "skipgram":
        sg = 1
        print("Model: skip-gram")
    elif mode == "cbow":
        sg = 0
        print("Model: CBOW")
    embedding_model = word2vec.Word2Vec(
        sentences,
        workers=num_workers,
        sg=sg,
        size=size_features,
        min_count=min_word_count,
        window=context,
        alpha=0.03,
        min_alpha=0.0007,
        sample=downsampling,
    )
    embedding_model.init_sims(replace=True)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights[i] = embedding_model[word]
        else:
            embedding_weights[i] = np.random.uniform(
                -0.25, 0.25, embedding_model.vector_size
            )
    return embedding_weights

In [5]:
def preprocess_df(df):
    stop_words = set(stopwords.words("english"))
    stop_words.add("would")
    translator = str.maketrans(string.punctuation, " " * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"]
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [
            word for word in words_list if word not in stop_words and len(word) != 1
        ]
        preprocessed_sentences.append(" ".join(filtered_words))
    df = df.assign(text=preprocessed_sentences)
    return df

In [6]:
data_path = "./"
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

In [7]:
df_train = df_train.fillna("")
df_test = df_test.fillna("")

In [8]:
df_train["attributes.ByAppointmentOnly"] = df_train[
    "attributes.ByAppointmentOnly"
].replace({"b'False'": " ", "b'True'": "yes_appointment", "b'None'": " "})

df_test["attributes.ByAppointmentOnly"] = df_test[
    "attributes.ByAppointmentOnly"
].replace({"b'False'": " ", "b'True'": "yes_appointment", "b'None'": " "})

In [9]:
df_train["attributes.RestaurantsDelivery"] = df_train[
    "attributes.RestaurantsDelivery"
].replace({"b'False'": " ", "b'True'": "yes_delivery", "b'None'": " "})

df_test["attributes.RestaurantsDelivery"] = df_test[
    "attributes.RestaurantsDelivery"
].replace({"b'False'": " ", "b'True'": "yes_delivery", "b'None'": " "})

In [10]:
df_train["attributes.HappyHour"] = df_train[
    "attributes.HappyHour"
].replace({"b'False'": " ", "b'True'": "happy_hour", "b'None'": " "})

df_test["attributes.HappyHour"] = df_test[
    "attributes.HappyHour"
].replace({"b'False'": " ", "b'True'": "happy_hour", "b'None'": " "})

In [11]:
df_train["attributes.GoodForDancing"] = df_train[
    "attributes.GoodForDancing"
].replace({"b'False'": " ", "b'True'": "dancing"})

df_test["attributes.GoodForDancing"] = df_test[
    "attributes.GoodForDancing"
].replace({"b'False'": " ", "b'True'": "dancing"})

In [12]:
df_train = df_train.assign(
    # text = df_train["review"] + df_train['city']
    text=df_train["name"]
    + df_train["city"]
    + df_train["attributes.ByAppointmentOnly"]
    + df_train["attributes.RestaurantsDelivery"] 
    + df_train["attributes.HappyHour"] + df_train["attributes.GoodForDancing"]
    + df_train["attributes.NoiseLevel"]
    + df_train["review"]
)

df_test = df_test.assign(
    # text = df_test['review'] + df_test['city']
    text=df_test["name"]
    + df_test["city"]
#     + df_test["attributes.ByAppointmentOnly"]
    + df_test["attributes.RestaurantsDelivery"] 
    + df_test["attributes.HappyHour"] + df_test["attributes.GoodForDancing"]
    + df_test["attributes.NoiseLevel"]
    + df_test["review"]
)

In [13]:
df_train = df_train[["label", "text"]]
df_test = df_test[["text"]]

In [14]:
# # W2V model
# df_train = preprocess_df(df_train)
# df_test = preprocess_df(df_test)

# tagged_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
# word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
# inp_data = [[vocabulary[word] for word in text] for text in tagged_data]
# embedding_weights = get_embeddings(inp_data, vocabulary_inv)


# tagged_train_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
# tagged_test_data = [word_tokenize(_d) for i, _d in enumerate(df_test["text"])]

# train_vec = []
# for doc in tagged_train_data:
#     vec = 0
#     for w in doc:
#         vec += embedding_weights[vocabulary[w]]
#     vec = vec / len(doc)
#     train_vec.append(vec)

# test_vec = []
# for doc in tagged_test_data:
#     vec = 0
#     length = 0
#     for w in doc:
#         try:
#             vec += embedding_weights[vocabulary[w]]
#             length += 1
#         except:
#             continue
#     vec = vec / length
#     test_vec.append(vec)

# clf = LogisticRegression(max_iter=100000000).fit(train_vec, df_train["label"])
# preds = clf.predict(test_vec)

In [15]:
# # This def takes in the column with the processed text from the original headlines for the training and testing sets
# # returns two bag of words-- one for the training set and one for the testing set
# def create_bag(df_col, test_df):
#     # create a corpus and a generalized countvectorizer
#     corpus = " ".join(list(df_col))
#     count = CountVectorizer()

#     # treat the training and testing sets separately since they are 2 different dataframes
#     docs = df_col.values.tolist()
#     docs_test = test_df.values.tolist()

#     # get two bags, but use the overall docs count
#     bag = count.fit_transform(docs)
#     test_bag = count.transform(docs_test)

#     return bag, test_bag

# # create the training and testing bags
# train_bag, test_bag = create_bag(df_train["text"], df_test["text"])

In [16]:
# import nltk
# nltk.download('punkt')

In [17]:
from nltk.stem import PorterStemmer 
ps = PorterStemmer()
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

# return a list of tokens
def pre_processing_by_nltk(doc, stemming = True, need_sent = False):
    # step 1: get sentences
    sentences = sent_tokenize(doc)
    # step 2: get tokens
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        # step 3 (optional): stemming
        if stemming:
            words = [ps.stem(word) for word in words]
        if need_sent:
            tokens.append(words)
        else:
            tokens += words
    return [w.lower() for w in tokens if w not in stop]

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

#TF-IDF approach
# takes in 2 parameters as inputs and returns their respective tf-idf arrays
def tf_idf(train_bag, test_bag):
    # use the 3rd party library for tf-idf
    tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        use_idf=True,
                        tokenizer=pre_processing_by_nltk,
                        min_df = 2,
                        max_features = 60000,
                        norm='l2',
                        smooth_idf=True)

    # fit transform the tf-idf to training and testing bags input above 
    train_idf = (tfidf.fit_transform(train_bag)).toarray()
    test_idf = (tfidf.transform(test_bag)).toarray()

    return train_idf, test_idf

# training and testing arrays
train_idf, test_idf = tf_idf(df_train['text'], df_test['text'])

#small_df = df.head(10000)
#y = small_df.sentiment.values
#X = tfidf.fit_transform(small_df.review)

#print('bag-of-words features ready!')


In [19]:
def to_df():
    df_train["TF-IDF"] = [train_idf[x] for x in range(len(df_train))]

    df_test["TF-IDF"] = [test_idf[x] for x in range(len(df_test))]
    
to_df()#add it back to df as a new column

In [20]:
train_type = df_train['label']
train_idfs = df_train["TF-IDF"]
test_idfs = df_test["TF-IDF"]

In [21]:
len(df_train['TF-IDF'][6])

33113

In [22]:
len(df_test['TF-IDF'][6])

33113

In [23]:
train_idfs = np.matrix(train_idfs.tolist())

In [24]:
test_idfs = np.matrix(test_idfs.tolist())

In [25]:
# c) model for tf-idf
idf_reg = LogisticRegressionCV(random_state = 0, cv = 2, max_iter = 500).fit(
    train_idfs, train_type
)

In [26]:
idf_pred = idf_reg.predict(test_idfs)

In [27]:
# create new predicted file
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(idf_pred):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)

dic_df = pd.DataFrame.from_dict(dic)
dic_df.to_csv(data_path + "predicted_idf.csv", index=False)

In [28]:
# # create new predicted file
# dic = {"Id": [], "Predicted": []}
# for i, pred in enumerate(preds):
#     dic["Id"].append(i)
#     dic["Predicted"].append(pred)

# dic_df = pd.DataFrame.from_dict(dic)
# dic_df.to_csv(data_path + "predicted.csv", index=False)

In [29]:
# # create new predicted file
# dic = {"Id": [], "Predicted": []}
# for i, pred in enumerate(preds):
#     dic["Id"].append(i)
#     dic["Predicted"].append(pred)

# dic_df = pd.DataFrame.from_dict(dic)
# dic_df.to_csv(data_path + "predicted_2.csv", index=False)

In [30]:
# pred_1 = pd.read_csv('predicted.csv')
# pred_2 = pd.read_csv('predicted_2.csv')

# comp_pred = pred_1.merge(pred_2, on = 'Id', suffixes=('_old', '_new'), how = 'left')
# comp_pred.to_csv(data_path + 'comp.csv', index = False)

In [31]:
# sum(comp_pred['Predicted_old'] == comp_pred['Predicted_new'])

In [32]:
# # create new predicted file
# dic = {"Id": [], "Predicted": []}
# for i, pred in enumerate(preds):
#     dic["Id"].append(i)
#     dic["Predicted"].append(pred)

# dic_df_3 = pd.DataFrame.from_dict(dic)
# comp_pred_2 = comp_pred.merge(dic_df_3, on = 'Id', how = 'left')
# comp_pred_2.to_csv(data_path + 'comp_2.csv', index = False)
# #dic_df.to_csv(data_path + "predicted_2.csv", index=False)

In [33]:
# dic_df_3.to_csv(data_path + "predicted_3.csv", index=False)

In [34]:
#sum(comp_pred['Predicted_old'] == comp_pred_2['Predicted'])

In [35]:
#sum(comp_pred['Predicted_new'] == comp_pred_2['Predicted'])

In [36]:
#predicted_old: 0.73860 (with hyperparameter changes)
# predicted_new (min_word increased by 1): 0.72510 -> 0.73420
# preditect_3 (with happy_hour): 0.73420 -> 0.73770
#predicted (added variables of noise and dancing): 0.73770 -> 0.73860