In [15]:
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
import pandas as pd
import re
import nltk
import json
import numpy as np
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
# conda install textblob -c conda-forge
from textblob import TextBlob
import contractions 
# import sys  
# !{sys.executable} -m pip install contractions
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF
from sklearn.pipeline import make_pipeline

In [4]:
# Fetch the NLTK resources used further down: the tokenizer model for
# word_tokenize, the English stopword list, and WordNet for the lemmatizer.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# pos_tag() (called from lemmatize_sent) needs the perceptron tagger model;
# without it a fresh environment raises LookupError.
# NOTE(review): recent NLTK releases may use different resource names
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/aliceye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aliceye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/aliceye/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Loading data and creating columns

In [5]:
# Read the raw JSON files and flatten them into DataFrames.
# `with` closes each file handle as soon as it has been parsed
# (the bare open() calls previously left both handles open).
with open('train.json') as train_file:
    train_json_data = json.load(train_file)
train_df = pd.json_normalize(train_json_data)

with open('test.json') as test_file:
    test_json_data = json.load(test_file)
test_df = pd.json_normalize(test_json_data)

In [6]:
# List the columns present in the test set.
test_df.columns

Index(['giver_username_if_known', 'request_id', 'request_text_edit_aware',
       'request_title', 'requester_account_age_in_days_at_request',
       'requester_days_since_first_post_on_raop_at_request',
       'requester_number_of_comments_at_request',
       'requester_number_of_comments_in_raop_at_request',
       'requester_number_of_posts_at_request',
       'requester_number_of_posts_on_raop_at_request',
       'requester_number_of_subreddits_at_request',
       'requester_subreddits_at_request',
       'requester_upvotes_minus_downvotes_at_request',
       'requester_upvotes_plus_downvotes_at_request', 'requester_username',
       'unix_timestamp_of_request', 'unix_timestamp_of_request_utc'],
      dtype='object')

In [7]:
# Columns kept from the training data: the fields listed for test_df above,
# plus the 'requester_received_pizza' label.
available_cols = [
    'giver_username_if_known',
    'request_id',
    'request_text_edit_aware',
    'request_title',
    'requester_account_age_in_days_at_request',
    'requester_days_since_first_post_on_raop_at_request',
    'requester_number_of_comments_at_request',
    'requester_number_of_comments_in_raop_at_request',
    'requester_number_of_posts_at_request',
    'requester_number_of_posts_on_raop_at_request',
    'requester_number_of_subreddits_at_request',
    'requester_subreddits_at_request',
    'requester_upvotes_minus_downvotes_at_request',
    'requester_upvotes_plus_downvotes_at_request',
    'requester_username',
    'unix_timestamp_of_request',
    'unix_timestamp_of_request_utc',
    'requester_received_pizza',
]
short_train_df = train_df[available_cols]

# create new df for if someone did not get a pizza
no_pizza_df = short_train_df[short_train_df.requester_received_pizza == False]
# create new df for if someone did get a pizza
yes_pizza_df = short_train_df[short_train_df.requester_received_pizza == True]

# Creating stopword list

Added a couple of custom stopwords that are frequently used in RAOP requests (e.g. pizza, request, hey, hi)

In [8]:
# stopwords from stopwords-json
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into",
"inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","t
rying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
# English entries of the stopwords-json list, as a set for fast membership tests
stopwords_json_en = set(stopwords_json['en'])
# NLTK's English stopwords, extended with a few words of our own
custom_stopwords = ['though', 'pizza', 'request', 'hey', 'hi']
stopwords_nltk_en = set(stopwords.words('english') + custom_stopwords)
# every punctuation character counts as a stopword too
stopwords_punct = set(punctuation)
# final stoplist: union of the three sets above
stoplist_combined = stopwords_json_en | stopwords_nltk_en | stopwords_punct

# Create text preprocessor

Added the ability to replace contractions with their proper form and remove URLs to the text preprocessor

In [9]:
# Module-level WordNet lemmatizer instance, used by lemmatize_sent below.
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS letter.

    Only the first two characters of the tag are used, so e.g. 'NNS' and
    'NNP' both map like 'NN'.

    Args:
        penntag: Penn Treebank tag string (e.g. 'NN', 'VBD', 'JJR').

    Returns:
        'n', 'a', 'v' or 'r'. Unknown tags, and inputs that cannot be
        sliced (e.g. None), fall back to 'n'.
    """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag prefix not in the table; TypeError: penntag is not
        # sliceable. Anything else (e.g. KeyboardInterrupt) now propagates
        # instead of being swallowed by a bare except.
        return 'n'
    
def lemmatize_sent(text):
    """Tokenize and POS-tag `text`, then return the lowercased lemma of each token.

    Args:
        text: input string.

    Returns:
        list of str, one lemma per token, in token order.
    """
    lemmas = []
    for token, penn_tag in pos_tag(word_tokenize(text)):
        # The Penn tag is mapped to a WordNet POS so the lemmatizer picks the right form.
        wordnet_pos = penn2morphy(penn_tag)
        lemmas.append(wnl.lemmatize(token.lower(), pos=wordnet_pos))
    return lemmas

def preprocess_text(text):
    """Clean one document and return it as a single space-joined string of lemmas.

    Steps, in order: strip URLs, expand contractions, lemmatize, then drop
    stopwords and purely numeric tokens.

    Args:
        text: str, a document/sentence.

    Returns:
        str, the surviving lemmas joined by single spaces.
    """
    # remove anything that starts with "http" up to the next whitespace
    without_urls = re.sub(r'http\S+', '', text)
    # replace contractions with their proper form
    expanded = contractions.fix(without_urls)

    kept_lemmas = []
    for lemma in lemmatize_sent(expanded):
        if lemma in stoplist_combined or lemma.isdigit():
            continue
        kept_lemmas.append(lemma)
    return ' '.join(kept_lemmas)

# Using TFIDFVectorizer for Bigrams

In [19]:
# Initialize bigram TF-IDF vectorizer; preprocess_text cleans each request
# before it is tokenized into word bigrams.
tf_bigram = TfidfVectorizer(analyzer = 'word', preprocessor = preprocess_text, ngram_range = (2, 2))
short_train_df_tf_bigram = tf_bigram.fit_transform(short_train_df['request_text_edit_aware'])

# Summed TF-IDF weight of each bigram over all requests (a sum, not an average;
# the variable name is kept for compatibility with later cells).
# Summing the sparse matrix directly avoids materialising a dense
# documents x bigrams array with .toarray().
avg_values = np.asarray(short_train_df_tf_bigram.sum(axis=0)).ravel()
# mapping bigram -> column index
vocab = tf_bigram.vocabulary_
# one row per bigram, highest summed weight first
df_bigram = pd.DataFrame(sorted([(avg_values[i],k) for k,i in vocab.items()], reverse=True)
            ).rename(columns={0: 'sum frequency', 1:'bigram'})

In [20]:
# Show the 20 bigrams with the highest summed TF-IDF weight.
df_bigram.head(20)

Unnamed: 0,sum frequency,bigram
0,43.409068,pay forward
1,20.213445,college student
2,17.103988,return favor
3,16.499367,sob story
4,15.310112,bank account
5,14.238537,lose job
6,11.831305,papa john
7,11.485988,pay back
8,10.047579,couple day
9,10.012277,pay week


# Topic Modeling using LDA (Latent Dirichlet Allocation)

After using the TF-IDF vectorizer to create bigrams, we looked into whether similar bigrams can be grouped into topics. One method we used for topic modeling is LDA.

LDA is an iterative probabilistic model that creates topics using two probability values: P(Word|Topic) and P(Topic|Document). We set the LDA model to create 10 topics. First, LDA randomly assigns bigrams to the 10 topics. Then it uses the two probability values to reassign bigrams to topics. The reassignment step is repeated until the model converges. The result is the final 10 topics.

In [13]:
# topics for TF-IDF bigrams using LDA
# random_state pins LDA's random initialisation so the topics are the same on every run
lda = LatentDirichletAllocation(n_components=10, random_state=0)
# NOTE: a Pipeline does not clone its steps, so fitting it refits tf_bigram in place
pipe_lda = make_pipeline(tf_bigram, lda)
pipe_lda.fit(short_train_df['request_text_edit_aware'])
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted features of every topic in a fitted model.

    Args:
        model: fitted decomposition model exposing `components_`
            (one row of feature weights per topic).
        feature_names: sequence mapping a column index to its feature name.
        n_top_words: how many features to list per topic.

    Prints one "Topic #i: ..." line per topic, followed by a blank line.
    """
    for topic_idx, weights in enumerate(model.components_):
        # indices sorted by ascending weight, reversed, truncated to the top n
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_names = [feature_names[idx] for idx in top_indices]
        print("Topic #%d: " % topic_idx + ", ".join(top_names))
    print()
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
bigram_names = (tf_bigram.get_feature_names_out()
                if hasattr(tf_bigram, 'get_feature_names_out')
                else tf_bigram.get_feature_names())
print_top_words(lda, bigram_names, n_top_words=5)

Topic #0: pay forward, lose job, return favor, college student, sob story
Topic #1: pay forward, money friday, lose job, papa john, sob story
Topic #2: pay forward, college student, sob story, hungry college, ramen noodle
Topic #3: pay forward, college student, find job, job pay, return favor
Topic #4: pay forward, college student, return favor, sob story, bank account
Topic #5: pay forward, college student, bank account, love forever, return favor
Topic #6: pay forward, papa john, sob story, pay rent, story random
Topic #7: pay forward, pay week, college student, sob story, bank account
Topic #8: pay forward, bank account, sob story, return favor, money food
Topic #9: pay forward, return favor, college student, lose job, bank account



# Topic Modeling using NMF (Non-negative Matrix Factorization)

Another topic modeling method we used was NMF. NMF is a linear algebraic model that moves from higher dimensionality to lower dimensionality. It takes a given document-word matrix (our TF-IDF matrix) and factors it into two lower-dimensional forms: one is a matrix of topics and the other is a matrix of topic weights for each document. The two lower-dimensional forms are calculated by iterating over them until they are optimized and converge.

Below we used NMF to create 10 topics like we did above with LDA modeling. From NMF we saw topics that seemed to make more sense and have less repetition.

In [16]:
## topics for TF-IDF bigrams using NMF
# random_state makes the factorisation reproducible across runs
nmf = NMF(n_components=10, random_state=0)
# NOTE: a Pipeline does not clone its steps, so fitting it refits tf_bigram in place
pipe = make_pipeline(tf_bigram, nmf)
pipe.fit(short_train_df['request_text_edit_aware'])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
bigram_names = (tf_bigram.get_feature_names_out()
                if hasattr(tf_bigram, 'get_feature_names_out')
                else tf_bigram.get_feature_names())
print_top_words(nmf, bigram_names, n_top_words=5)

Topic #0: pay forward, promise pay, forward money, forward pay, forward feel
Topic #1: college student, broke college, hungry college, poor college, student pay
Topic #2: return favor, gladly return, favor paycheck, love return, title return
Topic #3: sob story, story hungry, title pretty, pretty sob, hungry bore
Topic #4: lose job, job week, week ago, job month, job couple
Topic #5: pay week, forward pay, week pay, week buy, reciprocate pay
Topic #6: bank account, account information, cent bank, past week, dollar bank
Topic #7: pay back, pay friday, money food, eat today, day pay
Topic #8: papa john, gift card, domino hut, full time, domino papa
Topic #9: couple day, make day, past couple, live ramen, ramen past



In [17]:
## topics for TF-IDF bigrams using NMF for those who did get pizza
# random_state makes the factorisation reproducible across runs
nmf = NMF(n_components=10, random_state=0)
# NOTE: a Pipeline does not clone its steps, so this refits tf_bigram in place
# on the successful requests only (its vocabulary changes accordingly).
pipe = make_pipeline(tf_bigram, nmf)
# yes_pizza_df holds the rows of short_train_df with requester_received_pizza == True
pipe.fit(yes_pizza_df['request_text_edit_aware'])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
bigram_names = (tf_bigram.get_feature_names_out()
                if hasattr(tf_bigram, 'get_feature_names_out')
                else tf_bigram.get_feature_names())
print_top_words(nmf, bigram_names, n_top_words=5)

Topic #0: pay forward, forward money, forward feel, promise pay, grateful pay
Topic #1: sob story, money food, title pretty, find heart, hungry nice
Topic #2: college student, couple day, poor college, live ramen, past couple
Topic #3: pay week, reciprocate pay, leave house, cereal egg, house mood
Topic #4: return favor, pay friday, love return, day pay, fee day
Topic #5: lose job, make day, job week, week ago, food low
Topic #6: bank account, empty bank, pay bank, dollar bank, account pay
Topic #7: day love, eat day, meal day, love nice, love eat
Topic #8: papa john, john town, code papa, gift card, hut papa
Topic #9: food fridge, news bad, landlord power, complex careful, fridge back



In [18]:
## topics for TF-IDF bigrams using NMF for those who did not get pizza
# random_state makes the factorisation reproducible across runs
nmf = NMF(n_components=10, random_state=0)
# NOTE: a Pipeline does not clone its steps, so this refits tf_bigram in place
# on the unsuccessful requests only (its vocabulary changes accordingly).
pipe = make_pipeline(tf_bigram, nmf)
# no_pizza_df holds the rows of short_train_df with requester_received_pizza == False
pipe.fit(no_pizza_df['request_text_edit_aware'])
# get_feature_names() was removed in scikit-learn 1.2; use
# get_feature_names_out() when the installed version provides it.
bigram_names = (tf_bigram.get_feature_names_out()
                if hasattr(tf_bigram, 'get_feature_names_out')
                else tf_bigram.get_feature_names())
print_top_words(nmf, bigram_names, n_top_words=5)

Topic #0: pay forward, promise pay, money pay, forward pay, hungry money
Topic #1: college student, broke college, hungry college, student pay, break college
Topic #2: return favor, gladly return, favor paycheck, title return, favor pas
Topic #3: sob story, story hungry, hungry bore, random act, give sob
Topic #4: lose job, job month, job couple, job week, month ago
Topic #5: pay back, pay friday, love pay, pay day, hungry pay
Topic #6: bank account, account information, cent bank, past week, make day
Topic #7: money food, eat today, food eat, couple day, day ago
Topic #8: pay week, forward pay, week buy, week pay, buy pay
Topic #9: papa john, gift card, domino hut, full time, alfys papa

