In [92]:
import pandas as pd
from nltk import tokenize, stem
from nltk.corpus import stopwords
from re import sub
from unidecode import unidecode
from multiprocessing import Pool
import math
import string

In [107]:
def tokenizer(tweets):
    
    """
    Arguments:
    
    tweets - List of strings, the strings being tweets.
    -----------------------------------------------------------------
    Return-value:
    
    tokens - 2-dimensional list containing tokens as strings.
    
    """
    
    tokens = list()
    tk = tokenize.TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    for tweet in tweets:
        try:
            element = tk.tokenize(tweet)
        except UnicodeDecodeError:
            element = []
        tokens.append(element)
    return tokens


def normalize(tokens):
    
    """
    Arguments:
    
    tokens - List of lists containing tokens of the tweets.
    -----------------------------------------------------------------
    Return-value:
    
    final_tokens - List of lists containing normalized tokens of the tweets.
    
    """
    
    stop = stopwords.words('english')
    exclude = set(string.punctuation)
    stemmer = stem.PorterStemmer()
    
    # Unicode to string
    tokens_str = [[unidecode(token) for token in tweet] for tweet in tokens]
    
    #Replacing URLs with empty string
    tokens_str = [[sub(r"http\S+", "", token) for token in tweet] for tweet in tokens_str]
    
    # Removing punctuation    
    tokens_str = [[''.join(ch for ch in token if ch not in exclude) for token in tweet] for tweet in tokens_str]
    
    # Removing numbers
    tokens_str = [[sub(r'\d+', '', token) for token in tweet] for tweet in tokens_str]
    
    # Stemming
    tokens_str = [[stemmer.stem(token) for token in tweet] for tweet in tokens_str]
    
    # Unicode to string
    tokens_str = [[unidecode(token) for token in tweet] for tweet in tokens_str]
    
    # Removing stopwords
    tokens_str = [[word for word in tweet if word not in stop] for tweet in tokens_str]
    
    # Removing empty tokens (strings)
    final_tokens = list()
    for str_list in tokens_str:
        x = filter(None, str_list)
        final_tokens.append(x)
    
    return final_tokens


def createInvertedIndex(tokens):
    
    """
    Arguments:
    
    tokens - List of lists containing normalized tokens.
    ------------------------------------------------------------------
    Return-value:
    
    inverted_index - Dictionary containing the index words.
    
    """
    
    inverted_index = {}
    for i in range(len(tokens)):
        for j in range(len(tokens[i])):
            if tokens[i][j]:
                if inverted_index.has_key(tokens[i][j]):
                    if i not in inverted_index[tokens[i][j]]:
                        inverted_index[tokens[i][j]].append(i)
                else:
                    inverted_index[tokens[i][j]] = [i]
#         print(i)
    return inverted_index


def bm25(query_tokens, index, tokens):
    
    map_id_score = {}
    k1 = 1.2
    b = 0.75 
    N = len(tokens) # Number of documents in the collection
    adder = 0
    
    for doc_tokens in tokens:
        adder += len(doc_tokens)
    
    avg_doclen = float(adder)/N
    
    for j in range(len(tokens)):
        
        lend = float(len(tokens[j]))
        score = 0
        
        for i in range(len(query_tokens)):
            
            if query_tokens[i] in tokens[j]:
                
                n = float(len(index[query_tokens[i]]))
                f = float(tokens[j].count(query_tokens[i]))
                T1 = math.log(float(N-n+0.5)/(n+0.5),2)
                x = k1 * ((1-b) + b*(lend/avg_doclen)) + f
                T2 = float((k1+1)*f)/x
                score += T1*T2
        
        map_id_score[j] = score
        
    return map_id_score



In [108]:
print("Reading training and testing dataset...")

df_full = pd.read_csv('stanford/training.1600000.processed.noemoticon.csv', header=None)
df_test = pd.read_csv('stanford/testdata.manual.2009.06.14.csv', header=None)

print("Done.")



Reading training and testing dataset...
Done.


In [109]:
df1 = df_full.iloc[:10000]
df2 = df_full.iloc[800000:810000]
df = pd.concat([df1,df2])
tweets = list(df[5])
df.reset_index(inplace=True, drop=True)

In [110]:
print("Mapping tweetIDs to tweet and sentiment...")

map_id_tweet = {}
map_id_sentiment = {}

for i in range(len(df)):
    map_id_tweet[i] = df.iloc[i][5]
    map_id_sentiment[i] = df.iloc[i][0]

# for i in range(len(df2)):
#     map_id_tweet[i] = df2.iloc[i][5]
#     map_id_sentiment[i] = df2.iloc[i][0]

print("Done.")
# pool = Pool()
# tokens = pool.map(tokenizer, tweets)



Mapping tweetIDs to tweet and sentiment...
Done.


In [111]:
# Tokenizing the tweeets
print("Tokenizing tweets...")
tokenized_tokens = tokenizer(tweets)
print("Done.")

print(tokenized_tokens[:3])

Tokenizing tweets...
Done.
[[u'http://twitpic.com/2y1zl', u'-', u'awww', u',', u"that's", u'a', u'bummer', u'.', u'you', u'shoulda', u'got', u'david', u'carr', u'of', u'third', u'day', u'to', u'do', u'it', u'.', u';D'], [u'is', u'upset', u'that', u'he', u"can't", u'update', u'his', u'facebook', u'by', u'texting', u'it', u'...', u'and', u'might', u'cry', u'as', u'a', u'result', u'school', u'today', u'also', u'.', u'blah', u'!'], [u'i', u'dived', u'many', u'times', u'for', u'the', u'ball', u'.', u'managed', u'to', u'save', u'50', u'%', u'the', u'rest', u'go', u'out', u'of', u'bounds']]


In [112]:
# Normalizing the tweets
print("Normalizing tweets...")
normalized_tokens = normalize(tokenized_tokens)
print("Done.")
print(normalized_tokens[:3])

Normalizing tweets...
Done.
[['awww', 'bummer', 'shoulda', 'got', 'david', 'carr', 'third', 'day', 'D'], ['upset', 'cant', 'updat', 'hi', 'facebook', 'text', 'might', 'cri', 'result', 'school', 'today', 'also', 'blah'], ['dive', 'mani', 'time', 'ball', 'manag', 'save', 'rest', 'go', 'bound']]


In [113]:
# Creating Inverted Index
print("Creating Inverted Index...")
index = createInvertedIndex(normalized_tokens)
print("Done.")


Creating Inverted Index...
Done.


In [114]:
print("Preprocessing test queries...")
queries = list(df_test[5])
tokens1 = tokenizer(queries)
query_tokens = normalize(tokens1)
# print(query_tokens)
print("Done.")



Preprocessing test queries...
Done.


In [115]:
print("\nNumber of tweets in the corpus: " + str(len(tweets)))
print("Number of index terms: " + str(len(index)))

print("\nPrinting the top 10 results for each query:\n\n")
for i in range(300,350):
    map_id_score = bm25(query_tokens[i], index, normalized_tokens)
    data = map_id_score.items()
    sortedlist = sorted(data, key=lambda x: x[1],reverse=True)
    print("\n\nQuery : " + str(queries[i]))
    print("\nMost Relevant Results : \n")
    for j in range(10):
        print("[" + str(j+1) + "] : (Score = " + str(round(sortedlist[j][1],4)) + ") " + df.iloc[sortedlist[j][0]][5])



Number of tweets in the corpus: 20000
Number of index terms: 15632

Printing the top 10 results for each query:




Query : How do you use the twitter API?... http://bit.ly/4VBhH

Most Relevant Results : 

[1] : (Score = 15.8005) @seoexpertbd that takes twitter password. so it makes everyone a follower of everyone. Doable via Twitter oAuth and other APIs 
[2] : (Score = 13.9393) Can't use Twitter with my phone 
[3] : (Score = 13.0495) finding it really hard to use twitter 
[4] : (Score = 13.0495) @sallyinnorfolk awww, you'll have to use Twitter for company 
[5] : (Score = 13.0495) is getting used to this twitter stuff 
[6] : (Score = 12.2664) I think I should not use twitter as I offend people unintentionally 
[7] : (Score = 12.2664) @blooms5887 Just use Tweetdeck and you can update facebook and twitter at the same time 
[8] : (Score = 12.2664) thinks &quot;stop using facebook and follow me on twitter  http://twitter.com/paulrjmellors
[9] : (Score = 12.2664) The twitter task.. It exis

In [88]:

# Removing periods in abbreviations. Ex: U.S.A. to USA

# for tweet in tokens:
#     tweet = [sub(r'(?<!\w)([A-Z])\.', r'\1', x.lower()) for x in tweet]
#     print (tweet)


In [89]:
from nltk.corpus import stopwords

In [90]:
english_stopwords = stopwords.words('english')

In [39]:
import nltk

In [40]:
nltk.download()

showing info http://www.nltk.org/nltk_data/


True

In [45]:
stri = 'This tweet is a sample tweet bro this is the best'

In [56]:
x = [word for word in stri.split() if word not in stop]

In [58]:
x.append('')

In [59]:
x

['This', 'tweet', 'sample', 'tweet', 'bro', 'best', '']

In [60]:
y = filter(None, x)

In [62]:
y = []

In [63]:
y

[]

In [64]:
x = filter(None, y)

In [65]:
x

[]

In [93]:
stemmer = stem.PorterStemmer()

u'Do'