In [2]:
import pandas as pd
from collections import Counter
import math

In [3]:
# Read the labelled training tweets; the file is tab-separated despite its .csv name.
traintweets = pd.read_csv('traintweets.csv', sep='\t')

# Split the tweets by class label and drop any rows that contain missing values.
# The trailing .copy() makes each subset an independent DataFrame: the 'Tweet'
# column of both subsets is overwritten further down, and assigning into a
# filtered slice of traintweets can trigger pandas' SettingWithCopyWarning.
apptweets = traintweets[traintweets['Class'] == 'APP'].dropna().copy()
othertweets = traintweets[traintweets['Class'] == 'OTHER'].dropna().copy()

In [4]:
# tidy a tweet a bit: lowercase it and swap common punctuation for spaces
def cleantext(string):
    """Return a lowercased copy of ``string`` with common punctuation blanked out.

    A period or colon is replaced only when it is followed by a space; every
    '?', '!', ';' and ',' is replaced wherever it occurs. Each replacement
    inserts a single space, so runs of spaces may appear in the result.
    """
    cleaned = string.lower()
    # The marks are processed strictly in this order; each pass operates on the
    # output of the previous one.
    for mark in (". ", ": ", "?", "!", ";", ","):
        cleaned = cleaned.replace(mark, " ")
    return cleaned

In [17]:
# clean every training tweet by mapping cleantext over the 'Tweet' column of each frame
apptweets['Tweet'] = apptweets['Tweet'].map(cleantext)
othertweets['Tweet'] = othertweets['Tweet'].map(cleantext)

In [19]:
# display the non-app tweets to spot-check the result of the clean-up
othertweets

Unnamed: 0,Class,Tweet
0,OTHER,¿en donde esta su remontada mandrill
1,OTHER,.@katie_phd alternate 'reproachful mandrill' ...
2,OTHER,".@theophani can i get ""drill"" in there it wou..."
3,OTHER,“@chrisjboyland baby mandrill paignton zoo 29t...
4,OTHER,“@missmya #nameanamazingband mandrill ” mint c...
5,OTHER,“fat city strut” by mandrill is my new jam htt...
6,OTHER,soul train #22 1973 mandrill ...
7,OTHER,@alicegreennn_ but how come you didn't have ma...
8,OTHER,@as_tomasroncero a la mierda el mandrill tocat...
9,OTHER,@burnziey @sjsharkfinatic i have zach mandrill...


In [6]:
# one token-frequency table per class
appcnt = Counter()
othercnt = Counter()

# split each tweet on single spaces and tally every resulting token; consecutive
# spaces therefore produce empty-string tokens, which are counted like any other
for tweet in apptweets['Tweet']:
    appcnt.update(tweet.split(' '))

for tweet in othertweets['Tweet']:
    othercnt.update(tweet.split(' '))

In [7]:
# duplicate both frequency tables; the duplicates get modified later while the
# originals are iterated over and stay intact
appcntclean = Counter(appcnt)
othercntclean = Counter(othercnt)

In [20]:
# spot check: how often the token 'remontada' occurs in the non-app tweets
othercnt['remontada']

1

In [8]:
# remove every token shorter than 4 characters from the working copies; the keys
# are read from the untouched originals, so the Counter being modified is never
# the one being iterated over
for rawcounts, cleancounts in ((appcnt, appcntclean), (othercnt, othercntclean)):
    for token in rawcounts:
        if len(token) < 4:
            del cleancounts[token]

In [9]:
# Additive (Laplace) smoothing: a word never seen in training is later treated as
# if it had been seen once, so every token that was seen gets its raw count plus
# one to keep it ahead of the unseen ones. The raw count is read from the original
# Counter, so running this cell again does not add a second increment.
for rawcounts, smoothed in ((appcnt, appcntclean), (othercnt, othercntclean)):
    for token in smoothed:
        smoothed[token] = rawcounts[token] + 1

In [21]:
# spot check: raw (unsmoothed) count of the token 'simple' in the app tweets
appcnt['simple']

1

In [23]:
# spot check: raw count of 'api' in the app tweets; being only 3 characters long,
# this token is present in appcnt but has been removed from appcntclean
appcnt['api']

9

In [27]:
# display the filtered, smoothed app-tweet counts (raw count + 1 for each kept token)
appcntclean

Counter({'#atl': 2,
         '#atlanta': 2,
         '#bjcbranding': 2,
         '#buddypress': 2,
         '#career': 2,
         '#design': 2,
         '#dev': 2,
         '#drupal': 4,
         '#edocr': 2,
         '#eecms': 3,
         '#enginehosting': 2,
         '#freelance': 8,
         '#freelancer': 2,
         '#howto': 2,
         '#integration': 2,
         '#internetmarketing': 2,
         '#interspire': 3,
         '#javascript': 4,
         '#job': 8,
         '#jobs': 5,
         '#jobs4u': 2,
         '#jquery': 3,
         '#lightweight': 2,
         '#linux': 2,
         '#logo': 2,
         '#mandrill': 4,
         '#mysql': 2,
         '#newsletters': 2,
         '#photoshopdesign': 2,
         '#php': 4,
         '#plone': 3,
         '#plugin': 2,
         '#plugins': 2,
         '#project': 2,
         '#python': 2,
         '#redhen': 2,
         '#selfinducedcannibalization': 2,
         '#sendgrid': 2,
         '#templates': 2,
         '#timetomoveon': 2,


In [10]:
# Total smoothed token count per class. These are the denominators of every
# probability below, so they are computed once rather than re-summed for every
# token (which made the loops quadratic in the vocabulary size).
apptotal = float(sum(appcntclean.values()))
othertotal = float(sum(othercntclean.values()))

# Log-probability of each token given the class:
#     log( smoothed count of the token / smoothed count of all tokens )
# Logs are stored so that many small probabilities can later be combined by
# addition, since log(a*b) = log(a) + log(b), instead of by a multiplication
# that would underflow floating point.
probapp = {token: math.log(count / apptotal) for token, count in appcntclean.items()}
probother = {token: math.log(count / othertotal) for token, count in othercntclean.items()}

In [12]:
# load the tab-separated test set used for validation and clean its tweets with
# the same cleantext function that was applied to the training data
testtweets = pd.read_csv('testtweets.csv', sep='\t')
testtweets['Tweet'] = testtweets['Tweet'].map(cleantext)

# predicted labels for the test tweets are collected here
results = []

In [13]:
# Start from an empty list every time this cell runs; otherwise re-running the
# cell would append a second set of predictions and the list would no longer
# have one entry per test tweet.
results = []

# Log-probability used for a token that was never seen in the respective training
# set: treat it as if it had been seen exactly once. The value does not depend on
# the token, so it is computed once instead of re-summing the counts per word.
unseenapp = math.log(float(1) / sum(appcntclean.values()))
unseenother = math.log(float(1) / sum(othercntclean.values()))

# NOTE(review): the labels appended below are 'App'/'Other', whereas the Class
# values filtered on in the training data are 'APP'/'OTHER'; normalise the case
# before comparing the predictions against the Class column.
for tweet in testtweets['Tweet']:
    # running sums of the logged probabilities for each class
    appsum = 0
    othersum = 0
    # tokenize the tweet the same way as the training data
    for word in tweet.split(' '):
        # tokens shorter than 4 characters were dropped from the model, so skip them
        if len(word) <= 3:
            continue
        appsum += probapp.get(word, unseenapp)
        othersum += probother.get(word, unseenother)
    # predict App only when its score is strictly greater; ties go to Other
    results.append('App' if appsum > othersum else 'Other')

In [14]:
# attach the predicted labels as a new column; results must hold exactly one entry per test tweet
testtweets['Prediction'] = results

In [15]:
# display the test tweets with the true Class next to the Prediction column
testtweets

Unnamed: 0,Number,Class,Tweet,Prediction
0,1,APP,just love @mandrillapp transactional email ser...,App
1,2,APP,@rossdeane mind submitting a request at http:/...,App
2,3,APP,@veroapp any chance you'll be adding mandrill ...,App
3,4,APP,@elie__ @camj59 jparle de relai smtp 1 million...,App
4,5,APP,would like to send emails for welcome passwor...,App
5,6,APP,"from coworker about using mandrill ""i would en...",App
6,7,APP,@mandrill realised i did that about 5 seconds ...,App
7,8,APP,holy shit it’s here http://www.mandrill.com/,App
8,9,APP,our new subscriber profile page activity timel...,App
9,10,APP,@mandrillapp increases scalability ( http://bi...,App
