In [13]:
# general libraries
import pandas as pd
import numpy as np
import string
from math import log
from collections import Counter, defaultdict
import itertools
import ast
import os
import pickle

# feature engineering libraries
import en_core_web_sm
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# model building & evaluating libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score, f1_score, classification_report
from gensim.models import word2vec
from sklearn.neural_network import MLPClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam, Adadelta, RMSprop
import keras.backend as K

from gensim.models import Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

In [14]:
def preprocess_df(df):
    # get English stopwords
    en = spacy.load('en_core_web_sm')
    sw_spacy = en.Defaults.stop_words
    stop_words = set(sw_spacy)
    # stop_words = set(stopwords.words('english'))
    # stop_words.add('would')
    translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    preprocessed_sentences = []
    for i, row in df.iterrows():
        sent = row["text"].lower()
        sent = sent.replace('\\n',' ')
        sent_nopuncts = sent.translate(translator)
        words_list = sent_nopuncts.strip().split()
        filtered_words = [word for word in words_list if word not in stop_words and len(word) != 1] # also skip space from above translation
        preprocessed_sentences.append(" ".join(filtered_words))
    df["text"] = preprocessed_sentences
    return df

In [15]:
def build_vocab(sentences):
    word_counts = Counter(itertools.chain(*sentences))
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return word_counts, vocabulary, vocabulary_inv

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_train["text"] = df_train['name'] + ' ' + df_train["review"]
df_test["text"] = df_test['name'] + ' ' + df_test["review"]

df_train = preprocess_df(df_train)
df_test = preprocess_df(df_test)

In [16]:
df_all = df_train[['text']]

In [17]:
tagged_train_data = [word_tokenize(_d) for i, _d in enumerate(df_train["text"])]
tagged_test_data = [word_tokenize(_d) for i, _d in enumerate(df_test["text"])]

In [18]:
tagged_data = [word_tokenize(_d) for i, _d in enumerate(df_all["text"])]
word_counts, vocabulary, vocabulary_inv = build_vocab(tagged_data)
inp_data = [[vocabulary[word] for word in text] for text in tagged_data]

In [8]:
# GloVE
# A function used to learn word embeddings through Word2vec module
def get_embeddings_g(inp_data, vocabulary_inv):
    glove_file = 'glove.6B.300d.txt'
    tmp_file = get_tmpfile("test_word2vec.txt")
    _ = glove2word2vec(glove_file, tmp_file)
    embedding_model = KeyedVectors.load_word2vec_format(tmp_file)
    embedding_weights = np.zeros((len(vocabulary_inv), 300))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model:
            embedding_weights[i] = embedding_model[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
    return embedding_weights

embedding_weights_g = get_embeddings_g(inp_data, vocabulary_inv)

  _ = glove2word2vec(glove_file, tmp_file)


In [9]:
# GloVe
train_vec = []
for doc in tagged_train_data:
    vec = 0
    for w in doc:
        vec += embedding_weights_g[vocabulary[w]]
    vec = vec / len(doc)
    train_vec.append(vec)
    
test_vec = []
for doc in tagged_test_data:
    vec = 0
    length = 0
    for w in doc:
        try:
            vec += embedding_weights_g[vocabulary[w]]
            length += 1
        except:
            continue
    vec = vec / length
    test_vec.append(vec)
    
param_grid = [
    {
        'C' : np.linspace(1, 101, 10, endpoint=False)
    }
]

# clf_g = GridSearchCV(LogisticRegression(max_iter=100000000), param_grid = param_grid, cv = 5, n_jobs=-1, scoring = 'f1_weighted').fit(train_vec, df_train["label"].values.ravel())
# print(clf_g.best_params_)
# print(clf_g.best_score_)

clf_g = LogisticRegression(max_iter=100000000, C = 21).fit(train_vec, df_train["label"].values.ravel())

In [10]:
pred_g = clf_g.predict(test_vec)
prob_g = clf_g.predict_proba(test_vec)

In [11]:
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(pred_g):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)
df_g = pd.DataFrame.from_dict(dic)
df_g.to_csv('predicted_g.csv', index = False)

In [19]:
# Word2Vec

def get_embeddings_w(inp_data, vocabulary_inv, size_features=100,
                   mode='skipgram',
                   min_word_count=2,
                   context=10):
    model_name = "embedding"
    model_name = os.path.join(model_name)
    num_workers = 15
    downsampling = 1e-3
    print('Training Word2Vec model...')
    sentences = [[vocabulary_inv[w] for w in s] for s in inp_data]
    if mode == 'skipgram':
        sg = 1
        print('Model: skip-gram')
    elif mode == 'cbow':
        sg = 0
        print('Model: CBOW')
    embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                        sg=sg,
                                        vector_size=size_features,
                                        min_count=min_word_count,
                                        window=context,
                                        sample=downsampling)
    print("Saving Word2Vec model {}".format(model_name))
    embedding_weights = np.zeros((len(vocabulary_inv), size_features))
    for i in range(len(vocabulary_inv)):
        word = vocabulary_inv[i]
        if word in embedding_model.wv:
            embedding_weights[i] = embedding_model.wv[word]
        else:
            embedding_weights[i] = np.random.uniform(-0.25, 0.25,
                                                     embedding_model.vector_size)
    return embedding_weights

In [18]:
# Word2Vec - sg

for i in range(15,21): # context window
    for j in range(1,4): # min word count
        
        embedding_weights_w = get_embeddings_w(inp_data, vocabulary_inv, min_word_count = j, context = i)

        train_vec = []
        for doc in tagged_train_data:
            vec = 0
            for w in doc:
                vec += embedding_weights_w[vocabulary[w]]
            vec = vec / len(doc)
            train_vec.append(vec)

        test_vec = []
        for doc in tagged_test_data:
            vec = 0
            length = 0
            for w in doc:
                try:
                    vec += embedding_weights_w[vocabulary[w]]
                    length += 1
                except:
                    continue
            vec = vec / length
            test_vec.append(vec)

        param_grid = [
            {
                'C' : np.linspace(1, 101, 10, endpoint=False)
            }
        ]

        clf_w = GridSearchCV(LogisticRegression(max_iter=100000000), param_grid = param_grid, cv = 5, n_jobs=-1, scoring = 'f1_weighted').fit(train_vec, df_train["label"].values.ravel())
        print(i, j, clf_w.best_params_, clf_w.best_score_)

# clf_w = LogisticRegression(max_iter=100000000, C = 21).fit(train_vec, df_train["label"].values.ravel())

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
15 1 {'C': 61.0} 0.809245233368571
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
15 2 {'C': 51.0} 0.809737415872515
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
15 3 {'C': 21.0} 0.8096562457063345
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
16 1 {'C': 51.0} 0.8106870319515561
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
16 2 {'C': 31.0} 0.8112296457142367
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
16 3 {'C': 31.0} 0.812320346728808
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
17 1 {'C': 31.0} 0.8121444701548975
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
17 2 {'C': 41.0} 0.8113741901675651
Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding
17 3 {'C': 41.0} 0.812071867834



20 3 {'C': 31.0} 0.8134123597494236


In [20]:
# embedding_weights_w = get_embeddings_w(inp_data, vocabulary_inv, min_word_count = 2, context = 15) # 1st GSCV best
embedding_weights_w = get_embeddings_w(inp_data, vocabulary_inv, min_word_count = 3, context = 20) # 2nd GSCV best

train_vec = []
for doc in tagged_train_data:
    vec = 0
    for w in doc:
        vec += embedding_weights_w[vocabulary[w]]
    vec = vec / len(doc)
    train_vec.append(vec)

test_vec = []
for doc in tagged_test_data:
    vec = 0
    length = 0
    for w in doc:
        try:
            vec += embedding_weights_w[vocabulary[w]]
            length += 1
        except:
            continue
    vec = vec / length
    test_vec.append(vec)

clf_w = LogisticRegression(max_iter=100000000, C = 31).fit(train_vec, df_train["label"].values.ravel())

pred_w = clf_w.predict(test_vec)
# prob_w = clf_w.predict_proba(test_vec)

Training Word2Vec model...
Model: skip-gram
Saving Word2Vec model embedding


In [21]:
dic = {"Id": [], "Predicted": []}
for i, pred in enumerate(pred_w):
    dic["Id"].append(i)
    dic["Predicted"].append(pred)
df_w = pd.DataFrame.from_dict(dic)
df_w.to_csv('predicted_w1.csv', index = False)

In [28]:
pd.DataFrame(prob_w).to_csv('best_w2v_probs.csv', index = False)

In [29]:
pd.DataFrame(prob_g).to_csv('best_glv_probs.csv', index = False)