In [1]:
%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns ## Beautiful Plots :)

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
import re
import operator
import datetime
from collections import Counter
from nltk import ngrams
from utils import *

In [24]:
class Random_Acts_Of_Pizza(object):
    """
    Feature extractor for the "Random Acts Of Pizza" request data.

    The class wraps a dataframe of requests and exposes one method per
    feature. Every method returns one value per row of the dataframe:
    a plain list for the text-derived features, a pandas Series for
    the features that are computed directly from dataframe columns.
    """

    # ASCII punctuation that is replaced by spaces before tokenising
    # text in get_complexity and spell_check_score.
    _PUNCTUATION_RE = re.compile("["+'!"#$%&\'()*+.,-/:;<=>?@[\\]^_`{|}~'+"]")

    # Matches http:// and https:// links (used by get_evidentiality).
    _URL_RE = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

    # Vowels used as a cheap stand-in for syllables in get_complexity.
    _VOWELS = frozenset('aeiou')

    def __init__(self,dataframe):
        """
        The input is a dataframe. We have explicitly used
        data provided from the Kaggle competition named as
        Random Acts Of Pizza.
        Link : https://www.kaggle.com/c/random-acts-of-pizza/data

        The dataframe is stored by reference (no copy is made).
        """
        self.df = dataframe

    def __str__(self):
        """
        Returns the text of ``DataFrame.info()`` for the wrapped frame.

        ``info()`` writes to a buffer and returns None, so the summary
        is captured in an in-memory buffer instead of being converted
        with ``str()`` (which would yield the string 'None').
        """
        try:
            from StringIO import StringIO  # Python 2
        except ImportError:
            from io import StringIO  # Python 3
        buf = StringIO()
        self.df.info(buf=buf)
        return buf.getvalue()

    def get_narrative(self,col,narrative):
        """
        Assigns each request the narrative topic whose lexicon it
        matches most often.

        col       : name of the column holding the request text.
        narrative : dictionary mapping a topic name to its lexicon.
                    A lexicon may be a whitespace separated string or
                    any iterable of words; both are matched word by
                    word (never as substrings).

        Returns a list with one topic name per request. Every
        occurrence of a lexicon word in the request counts. Ties
        (including requests without any match) go to the topic that
        comes first when iterating over ``narrative``.
        Raises ValueError if ``narrative`` is empty.
        """
        topics = list(narrative)
        if not topics:
            raise ValueError('narrative lexicon must contain at least one topic')
        # Normalise every lexicon to a set of words: a raw string would
        # otherwise be searched for substrings, and set lookups are O(1).
        lexicons = {}
        for topic in topics:
            words = narrative[topic]
            if hasattr(words, 'split'):
                words = words.split()
            lexicons[topic] = set(words)

        narration = []
        for request in self.df[col]:
            word_count = dict.fromkeys(topics, 0)
            for word in request.split():
                for topic in topics:
                    if word in lexicons[topic]:
                        word_count[topic] += 1
            # max() keeps the first maximal topic, see tie rule above.
            narration.append(max(topics, key=word_count.__getitem__))
        print('Use get_dummies to encode the features as binary')
        return narration

    def get_politeness(self,col,polite_words):
        """
        Computes a politeness score for each request.

        col          : name of the column holding the request text.
        polite_words : iterable of polite words and phrases; phrases
                       of two or three words are matched against the
                       request's bigrams and trigrams.

        Returns a list of floats: the number of distinct polite
        words/phrases found in the request divided by the number of
        distinct words in the request (0.0 for an empty request).
        """
        polite = set(polite_words)  # built once, not once per request
        politeness = []
        for request in self.df[col]:
            tokens = request.split()
            request_words = set(tokens)
            request_ngrams = set(' '.join(grams)
                                 for size in (3, 2)
                                 for grams in ngrams(tokens, size))
            num = len(request_words & polite) + len(request_ngrams & polite)
            if request_words:
                politeness.append(float(num)/len(request_words))
            else:
                # Empty request: avoid dividing by zero.
                politeness.append(0.0)
        print('Total Number of request parsed:  {0}'.format(len(politeness)))
        return politeness

    def get_length(self,col):
        """
        Takes input as the column name (for the request).
        Returns a list with the number of whitespace separated
        words of each request.
        """
        return [len(x.split()) for x in self.df[col]]

    def get_karma(self):
        """
        Calculate the karma score for each user of the RAOP as the
        sum of the 'requester_upvotes_plus_downvotes' columns at
        request time and at retrieval time.
        Returns a pandas Series with one karma score per requester.
        """
        karma = self.df['requester_upvotes_plus_downvotes_at_request']+\
        self.df['requester_upvotes_plus_downvotes_at_retrieval']
        return karma

    def get_score(self):
        """
        Calculates the score of each user of RAOP as the sum of the
        'requester_upvotes_minus_downvotes' columns at request time
        and at retrieval time.
        Returns a pandas Series with one score per user.
        """
        score = self.df['requester_upvotes_minus_downvotes_at_request']+\
        self.df['requester_upvotes_minus_downvotes_at_retrieval']
        return score

    def get_evidentiality(self,col):
        """
        Counts the http/https links in each request.
        Only links are counted; occurrences of "proof"/"prove" are
        not part of this score.
        Returns a list with the number of urls for each request.
        """
        return [len(self._URL_RE.findall(text)) for text in self.df[col]]

    def get_complexity(self,col):
        """
        Takes input as the column of request and calculates a
        readability score for each request by calling
        Flesch_reading_ease (expected to be provided by the notebook's
        utils import) with:
          * total_sentences : number of '.'-separated pieces of the text
            (a trailing '.' therefore adds one extra piece),
          * total_words     : number of words after punctuation removal,
          * total_sylabls   : number of vowels (a, e, i, o, u in either
            case), used as an approximation of the syllable count.
        Returns a list of complexity scores, one per request.
        """
        grade_level = []
        for text in self.df[col]:
            sent_cnt = len(text.split('.'))
            words = self._PUNCTUATION_RE.sub(" ", text).split()
            # Lower-case first so that capitalised vowels ("I", "A",
            # "Every") are counted as well.
            syl_count = sum(1 for word in words
                            for letter in word.lower()
                            if letter in self._VOWELS)
            grade_level.append(Flesch_reading_ease(total_sentences = sent_cnt,total_words = len(words),\
                                                   total_sylabls=syl_count))
        return grade_level

    def spell_check_score(self,col):
        """
        We further use a spell-checker to identify misspelled
        words in the request text [10]. In other contexts
        (e.g. Kickstarter), spelling errors have been found to
        have a negative impact on funding success.

        Input is the column of request.
        Returns a list of floats: for each request the fraction of
        words that the spell-checker (``correction``) would change,
        or 0.0 when the request contains no words.
        """
        spell_errors = []
        for text in self.df[col]:
            words = self._PUNCTUATION_RE.sub(" ", text).split()
            if words:
                spl_err = sum(1 for word in words if correction(word) != word)
                spell_errors.append(float(spl_err)/len(words))
            else:
                spell_errors.append(0.0)
        return spell_errors

    def first_half_of_month(self,col):
        """
        Input is the column containing the unix timestamp.
        Returns a list with the day of the month of each request.
        The timestamp is converted with datetime.fromtimestamp, i.e.
        in the local timezone of the machine running the notebook.
        Despite its name the method does not return a first-half
        flag; callers can threshold the returned day themselves.
        """
        return [datetime.datetime.fromtimestamp(int(timestamp)).day \
                for timestamp in self.df[col]]

    def get_popularity(self,col):
        """
        Input is the column containing the total upvotes
        the request got at time of retrieval.
        Returns that column of the dataframe unchanged.
        """
        return self.df[col]

    

In [4]:
# Kaggle RAOP training set; the path is relative to the notebook's
# working directory.
train_json_path = '../Desktop/train.json'
df = pd.read_json(train_json_path)

In [25]:
raop = Random_Acts_Of_Pizza(df)
# Show the summary of the wrapped frame. DataFrame.info() writes the
# summary to stdout itself and returns None, so it is called directly
# rather than being wrapped in a print.
raop.df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4040 entries, 0 to 4039
Data columns (total 43 columns):
giver_username_if_known                                 4040 non-null object
number_of_downvotes_of_request_at_retrieval             4040 non-null int64
number_of_upvotes_of_request_at_retrieval               4040 non-null int64
post_was_edited                                         4040 non-null int64
request_id                                              4040 non-null object
request_number_of_comments_at_retrieval                 4040 non-null int64
request_text                                            4040 non-null object
request_text_edit_aware                                 4040 non-null object
request_title                                           4040 non-null object
requester_account_age_in_days_at_request                4040 non-null float64
requester_account_age_in_days_at_retrieval              4040 non-null float64
requester_days_since_first_post_on_raop_at_reque

In [26]:
# Narrative lexicons: topic name -> list of words signalling that topic.
# The word lists are written as adjacent string literals that each end
# in a space, so no two words can fuse across a line break (a bare
# backslash continuation inside a string joins the lines without a
# separator).
narrative = {
    'Money': (
        'money now broke week until time last '
        'day when today tonight paid next first night after tomorrow '
        'month while account before long Friday rent buy bank still '
        'bills bills ago cash due due soon past never paycheck check '
        'spent years poor till yesterday morning dollars financial '
        'hour bill evening credit budget loan bucks deposit dollar '
        'current payed'
    ).split(),
    'Job': (
        'work job paycheck unemployment '
        'interview fired employment hired hire'
    ).split(),
    'Student': (
        'college '
        'student school roommate studying university finals semester class '
        'study project dorm tuition'
    ).split(),
    'Family': (
        'family mom wife parents '
        'mother husband dad son daughter father parent mum'
    ).split(),
    'Craving': (
        'friend '
        'girlfriend craving birthday boyfriend celebrate party game games movie '
        'date drunk beer celebrating invited drinks crave wasted invite'
    ).split(),
}

In [27]:
# Name of the column that holds the request text.
requests = 'request_text_edit_aware'
# NOTE: this rebinds `narrative` from the lexicon dictionary to the list
# of per-request topic labels, so the lexicon cell has to be re-run
# before this cell can be executed a second time.
narrative = raop.get_narrative(requests, narrative)

Use get_dummies to encode the features as binary


In [28]:
# Attach the topic label of every request, then one-hot encode it:
# get_dummies replaces 'narrative_topics' by one indicator column per topic.
df['narrative_topics'] = narrative
df = pd.get_dummies(data=df, columns=['narrative_topics'])

In [29]:
# Words and phrases counted by get_politeness. Single words are matched
# against the words of a request, multi-word entries against its
# bigrams and trigrams (no entry is longer than three words).
# NOTE(review): apart from "please"/"thanks"/"thank you" the entries read
# like hedging terms rather than politeness markers - confirm the source.
polite_words = [
    "please","thanks","thank you","think", "thought", "thinking", "almost",
    "apparent", "apparently", "appear", "appeared", "appears", "approximately", "around",
    "assume", "assumed", "certain amount", "certain extent", "certain level", "claim",
    "claimed", "doubt", "doubtful", "essentially", "estimate",
    "estimated", "feel", "felt", "frequently", "from our perspective", "generally", "guess",
    "in general", "in most cases", "in most instances", "in our view", "indicate", "indicated",
    "largely", "likely", "mainly", "may", "maybe", "might", "mostly", "often", "on the whole",
    "ought", "perhaps", "plausible", "plausibly", "possible", "possibly", "postulate",
    "postulated", "presumable", "probable", "probably", "relatively", "roughly", "seems",
    "should", "sometimes", "somewhat", "suggest", "suggested", "suppose", "suspect", "tend to",
    "tends to", "typical", "typically", "uncertain", "uncertainly", "unclear", "unclearly",
    "unlikely", "usually", "broadly", "tended to", "presumably", "suggests",
    "from this perspective", "from my perspective", "in my view", "in this view", "in our opinion",
    "in my opinion", "to my knowledge", "fairly", "quite", "rather", "argue", "argues", "argued",
    "claims", "feels", "indicates", "supposed", "supposes", "suspects", "postulates"
]

In [30]:
# One politeness ratio per request, stored as a new feature column.
politeness = raop.get_politeness(requests, polite_words)
df['politeness'] = politeness

Total Number of request parsed:  4040


In [31]:
# Word count of every request; the printed length is a sanity check
# that one value per row was produced.
df['request_length'] = raop.get_length(col=requests)
print(len(df['request_length']))

4040


In [32]:
# Requester karma feature; the printed length is a sanity check.
karma_column = 'karma'
df[karma_column] = raop.get_karma()
print(len(df[karma_column]))

4040


In [33]:
# Requester score feature; the printed length is a sanity check.
scores_column = 'scores'
df[scores_column] = raop.get_score()
print(len(df[scores_column]))

4040


In [34]:
# Number of links found in every request, used as the "trust" feature.
df['trust'] = raop.get_evidentiality(col=requests)
print(len(df['trust']))

4040


In [35]:
# Readability score of every request text.
df['complexity'] = raop.get_complexity(col=requests)
print(len(df['complexity']))

4040


In [37]:
# Upvotes each request had collected at retrieval time.
upvotes_column = 'number_of_upvotes_of_request_at_retrieval'
df['popularity'] = raop.get_popularity(col=upvotes_column)
print(len(df['popularity']))

4040


In [None]:
# Spell checking every request is slow, so the scores are cached on disk.
spell_check_cache = 'spellCheck.csv'
if not os.path.isfile(spell_check_cache):
    df['spell_check_score'] = raop.spell_check_score(requests)
    # Written as a one-column frame so the file always carries a header row.
    df[['spell_check_score']].to_csv(spell_check_cache)
    print(df['spell_check_score'].head(2))
else:
    print('File exists...')
    # Load the cached scores back into the frame; without this the
    # 'spell_check_score' column is missing whenever the cache exists.
    cached_scores = pd.read_csv(spell_check_cache, index_col=0)
    if 'spell_check_score' not in cached_scores.columns:
        # Cache written without a header row (plain Series.to_csv in
        # older pandas): read it again as headerless index/value pairs.
        cached_scores = pd.read_csv(spell_check_cache, header=None,
                                    names=['index', 'spell_check_score'],
                                    index_col=0)
    # Assignment aligns the cached values on the dataframe index.
    df['spell_check_score'] = cached_scores['spell_check_score']