In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import string
import unicodedata

from scipy import sparse
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.util import ngrams
from nltk import pos_tag
from nltk import RegexpParser
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

import pickle

In [7]:
def filter_tokens(sent):
    stopwords_ = set(stopwords.words('english'))
    punctuation_ = set(string.punctuation)

    return([w for w in sent if not w in stopwords_ and not w in punctuation_])

In [8]:
def validate_string(s):
    letter_flag = False
    number_flag = False
    for i in s:
        if i.isalpha():
            letter_flag = True
        if i.isdigit():
            number_flag = True
    return letter_flag and number_flag

In [9]:
def extract_bow_from_column(column):
    numeric = '0123456789.'
    stopwords_ = set(stopwords.words('english'))
    punctuation_ = set(string.punctuation)
    special_char = ['%', '-', '/']
    filtered_column = []
    for input_string in column:
        tokens_list = [sent for sent in map(word_tokenize, sent_tokenize(str(input_string)))]
        tokens_filtered = list(map(filter_tokens, tokens_list))
        # filter it more
        for tokens in tokens_filtered:
            for idx, token in enumerate(tokens):
                if token.isdigit():
                    tokens.pop(idx)
                for char in special_char:
                    if char in token:
                        tokens.pop(idx)
                        tokens.extend(token.split(char))
        for tokens in tokens_filtered:   
            for idx, token in enumerate(tokens):
                if validate_string(token):
        #             both number and letter in it
                    for i, c in enumerate(token):
                        if c not in numeric:
                            break
                    number = token[:i]
                    unit = token[i:]
                    tokens.pop(idx)
                    tokens.append(unit)
        # either porter or snowball work for stemming 
        stemmer_porter = PorterStemmer()
        tokens_stemporter = [list(map(stemmer_porter.stem, sent)) for sent in tokens_filtered]
        if len(tokens_stemporter) == 0:
            filtered_column.append('')
        else:
            filtered_column.append(' '.join(tokens_stemporter[0]))
    return filtered_column

In [14]:
amazon_df = pd.read_csv('../data/clean_amazon_reviews.csv')

In [15]:
amazon_df['pros'] = amazon_df['pros'].str.lower()
amazon_df['filtered_pros'] = extract_bow_from_column(amazon_df['pros'])

In [19]:
corpus_pros = [row for row in amazon_df['filtered_pros']]

In [27]:
corpus_pros

['work hard fun make histori',
 'realli smart peopl lot opportun growth alway encourag innov think big creat someth new',
 "jeff bezo `` '' brilliant continu make great decis growth s team long term",
 "amazon lot 's cool stuff ... lot bore stuff",
 'compani get list perform c list employe',
 'great start pay abil get rais quickli',
 'super smart peopl best best school get hire hirabl anywher recruit process tough pay averag probabl 1.5 time elsewher expect result elsewher lot opportun work new innov project cool slu campu lot option food drink work',
 'smartest peopl ’ ever work',
 'weekli pay great pay flexibl shift',
 'compani work like well oil machin manag work hard good job pay great job',
 'decent pay',
 'great peopl room show skill high level',
 'pay work life balanc employe moral employe attitud good manag',
 'competit pay great benefit resourc',
 'pay respons ratio good',
 'whole peopl smart ton flexibl move around tri new thing earn trust team manag promot possibl far guaran

In [55]:
cv = CountVectorizer(max_features=2000)
cv_array = cv.fit_transform(corpus_pros).toarray()
cv_dict = {}
i = 0
for key in cv.vocabulary_:
    cv_dict["word_" + key] = cv_array[:,i]
    i += 1
cv_df = pd.DataFrame(cv_dict)

In [68]:
non_nlp_df = amazon_df[['culture-values-stars', 'career-opportunities-stars', 
                       'comp-benefit-stars', 'senior-management-stars', 'helpful-count',
                       'is_current_employee', 'year', 'quarter', 'amazon_earnings_this_quarter']]

X = pd.concat([non_nlp_df, cv_df], axis=1)
y = amazon_df['work-balance-stars'].values

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [None]:
gbr = GradientBoostingRegressor(learning_rate=0.005,
                                   n_estimators=1000,
                                   min_samples_leaf=5,
                                   max_depth=3,
                                   subsample=0.5)
gbr.fit(X_train, y_train)


In [47]:
with open('gradient_boosting_regressor.pkl', 'wb') as f:
    # Write the model to a file.
    pickle.dump(gbr, f)


In [39]:
y_pred = gbr.predict(X_test)

In [42]:
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(rmse)

1.2021331443447323


In [None]:
X.to_csv("../data/df_with_nlp.csv")

In [None]:

# with open('model.pkl', 'rb') as f:
#     model = pickle.load(f)
