In [30]:
import nltk
import numpy as np
import collections
import re
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import SnowballStemmer
from csv_helper import CSVHelper
# Make sure the NLTK resources used by the preprocessing functions below
# (English stop-word list, WordNet data for the lemmatizer) are available.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the tweets through the project-local CSVHelper.
# NOTE(review): each element of `data` is later handed to TweetTokenizer.tokenize,
# so this presumably yields one raw tweet string per element — confirm in csv_helper.
data = CSVHelper.load_csv("Tweets_2016London.csv")

[nltk_data] Downloading package stopwords to /home/jeroen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jeroen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
starting for loop!


In [31]:
def tokenize(tweets):
    """Tokenize every tweet with a single shared NLTK ``TweetTokenizer``.

    Args:
        tweets: iterable of tweets; each element is passed unchanged to
            ``TweetTokenizer.tokenize``.

    Returns:
        list: one tokenizer result per input tweet, in input order.
    """
    tweet_tokenizer = TweetTokenizer()
    return [tweet_tokenizer.tokenize(raw_tweet) for raw_tweet in tweets]

def remove_stopwords(tweets):
    """Lower-case every token and drop English stop words.

    Args:
        tweets: iterable of tokenized tweets (each an iterable of string tokens).

    Returns:
        list: one list per input tweet containing the lower-cased tokens that
        are not in NLTK's English stop-word list. Token order is preserved.
    """
    # Stored as a set for constant-time membership tests, and under a name that
    # does not shadow the `stopwords` module imported at the top of the notebook.
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    tweets_nostop = []
    for tweet in tweets:
        # Lower-case each token exactly once, then filter.
        lowered_tokens = (w.lower() for w in tweet)
        tweets_nostop.append([w for w in lowered_tokens if w not in english_stopwords])
    return tweets_nostop

def filter_noise(tweets):
    """Strip every character outside ``[a-zA-Z0-9]`` from each token.

    Tokens that become empty after stripping (pure punctuation, emoji, ...)
    are removed from the tweet. Note that non-ASCII letters are stripped too,
    because the character class only covers ASCII letters and digits.

    Args:
        tweets: iterable of tokenized tweets (each an iterable of string tokens).

    Returns:
        list: one list of cleaned, non-empty tokens per input tweet.
    """
    # Compile once and substitute once per token; the cleaned value is reused
    # for both the emptiness check and the result.
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    tweets_filtered = []
    for tweet in tweets:
        cleaned_tokens = (non_alphanumeric.sub('', w) for w in tweet)
        tweets_filtered.append([w for w in cleaned_tokens if w != ''])
    return tweets_filtered

def remove_url(tweets):
    """Blank out tokens that start with ``http://`` or ``https://``.

    A matching token is replaced by the empty string rather than removed, so
    every returned tweet has the same number of tokens as its input. Tokens
    that merely contain a URL later in the string are left unchanged because
    the pattern is anchored at the start of the token.

    Args:
        tweets: iterable of tokenized tweets (each an iterable of string tokens).

    Returns:
        list: one token list per input tweet with URL tokens replaced by ``''``.
    """
    url_pattern = re.compile(r'^https?:\/\/.*[\r\n]*')
    return [[url_pattern.sub('', token) for token in tweet] for tweet in tweets]

#we decided to use a lemmatizer instead of stemming 
def my_stem(tweets):
    """Lemmatize every token with NLTK's ``WordNetLemmatizer``.

    Despite the name, no stemming is performed: each token is passed to
    ``WordNetLemmatizer.lemmatize`` without a part-of-speech argument, so the
    lemmatizer's default part of speech applies.

    Args:
        tweets: iterable of tokenized tweets (each an iterable of string tokens).

    Returns:
        list: one list of lemmatized tokens per input tweet, order preserved.
    """
    # The PorterStemmer instance that used to be created here was never used.
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(w) for w in tweet] for tweet in tweets]

In [32]:
all_tweets = tokenize(data)
all_tweets = remove_url(all_tweets)
all_tweets = filter_noise(all_tweets)
all_tweets = remove_stopwords(all_tweets)
all_tweets = my_stem(all_tweets)

The function `tweet_tfidf_features` (input: an array of tokenized tweets; output: a TF-IDF feature matrix) builds a distinct vocabulary set, computes the IDF value for each word in the vocabulary and stores it in a dictionary (key-value pairs), and then, for each term in each tweet, calculates the TF-IDF score (which is just the term frequency * IDF value).


https://nlpforhackers.io/tf-idf/

https://www.youtube.com/watch?v=4vT4fzjkGCQ&t=1s

http://billchambers.me/tutorials/2014/12/21/tf-idf-explained-in-python.html

In [51]:
def termfreq(word, tweet):
    """Return the relative frequency of ``word`` within one tokenized tweet.

    Args:
        word: the token to count.
        tweet: list of tokens.

    Returns:
        float: occurrences of ``word`` divided by the number of tokens in
        ``tweet``; ``0.0`` for an empty tweet (instead of dividing by zero).
    """
    num_words = len(tweet)
    if num_words == 0:
        # Tweets can end up empty after URL/noise/stop-word filtering.
        return 0.0
    return tweet.count(word) / num_words

def count_doc_with_term(word, tweets):
    """Count how many tweets contain ``word`` at least once.

    Args:
        word: the token to look for.
        tweets: iterable of tokenized tweets.

    Returns:
        int: number of tweets for which ``word in tweet`` is true.
    """
    return sum(1 for tweet in tweets if word in tweet)

# idf = log10(amount_of_tweets / amount_of_tweets_with_word)
def idf(word, tweets):
    """Return the inverse document frequency of ``word`` over ``tweets``.

    Args:
        word: the token to score.
        tweets: sequence of tokenized tweets (the whole corpus).

    Returns:
        float: ``log10(len(tweets) / number_of_tweets_containing_word)``, or
        ``0.0`` when no tweet contains ``word`` (this also covers an empty
        corpus), so an unseen word contributes nothing instead of raising a
        division-by-zero error.
    """
    num_tweets_with_word = count_doc_with_term(word, tweets)
    if num_tweets_with_word == 0:
        return 0.0
    return np.log10(len(tweets) / num_tweets_with_word)

def create_vocabulary(tweets):
    """Collect the distinct tokens that occur anywhere in ``tweets``.

    Args:
        tweets: iterable of tokenized tweets (each an iterable of string tokens).

    Returns:
        list: every distinct token exactly once, in sorted order. Sorting
        makes the order (and therefore the column order of any feature matrix
        built from it) identical across kernel restarts; the iteration order
        of a plain set of strings is not.
    """
    vocabulary = set()
    for tweet in tweets:
        vocabulary.update(tweet)
    return sorted(vocabulary)

def compute_word_idf(tweets, vocabulary):
    """Build a lookup table from each vocabulary word to its IDF value.

    Args:
        tweets: the tokenized tweets forwarded to ``idf`` for every word.
        vocabulary: iterable of words to score.

    Returns:
        collections.defaultdict: ``word -> idf(word, tweets)`` for every word
        in ``vocabulary``; looking up any other key yields ``0``.
    """
    idf_by_word = collections.defaultdict(lambda: 0)
    idf_by_word.update((term, idf(term, tweets)) for term in vocabulary)
    return idf_by_word
    
def tf_idf(word, tweet, tweets, word_idf=None):
    """Return the TF-IDF score of ``word`` within ``tweet``.

    Args:
        word: the token to score.
        tweet: list of tokens of the tweet being scored.
        tweets: the whole corpus of tokenized tweets; only used to compute the
            IDF value when ``word_idf`` is not supplied.
        word_idf: optional mapping ``word -> IDF value`` (for example the
            result of ``compute_word_idf``). Passing it avoids recomputing the
            IDF on every call.

    Returns:
        ``termfreq(word, tweet)`` multiplied by the word's IDF value, or ``0``
        for an empty tweet.
    """
    if len(tweet) == 0:
        return 0
    # This function used to read a module-level `word_idf` that is never
    # defined at module level (it is a local of tweet_tfidf_features), so it
    # raised NameError on a fresh kernel. The table is now an explicit argument
    # with a self-contained fallback.
    idf_value = idf(word, tweets) if word_idf is None else word_idf[word]
    return termfreq(word, tweet) * idf_value
    
def tweet_tfidf_features(tweets):
    """Build the dense TF-IDF feature matrix for a corpus of tokenized tweets.

    Row ``i`` describes ``tweets[i]`` and column ``j`` corresponds to
    ``create_vocabulary(tweets)[j]``. Each entry is the term frequency of the
    word in the tweet multiplied by the word's IDF value; words that do not
    occur in a tweet, and all entries of an empty tweet, are ``0``.

    Args:
        tweets: sequence of tokenized tweets (each a list of string tokens).

    Returns:
        numpy.ndarray: float array of shape ``(len(tweets), len(vocabulary))``.
        An empty corpus yields an array of shape ``(0, 0)``.
    """
    vocabulary = create_vocabulary(tweets)
    word_idf = compute_word_idf(tweets, vocabulary)
    column_of = {word: j for j, word in enumerate(vocabulary)}

    # Preallocating the matrix avoids the object array + vstack round trip,
    # which fails when there are no tweets at all.
    features = np.zeros((len(tweets), len(vocabulary)))
    for i, tweet in enumerate(tweets):
        num_words = len(tweet)
        if num_words == 0:
            continue  # empty tweet: the row stays all zeros
        # Only words present in the tweet have a non-zero term frequency, so
        # only those columns are written. The IDF table computed above is used
        # directly rather than through a global lookup.
        for word, occurrences in collections.Counter(tweet).items():
            features[i, column_of[word]] = (occurrences / num_words) * word_idf[word]
    return features

In [52]:
#test = [["this","a","a","is","sample"],["this","another","another","is","example","example","example"]]
#test2 = [["new","times","york"],["new","post","york"],["angeles","los","times"]]
feature_matrix = tweet_tfidf_features(all_tweets)

In [53]:
def cosine_distance(v1,v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    NOTE: despite its name this function returns the cosine *similarity*
    (``1`` for vectors pointing the same way, ``0`` for orthogonal ones), not
    ``1 - similarity``. The name and return value are kept for existing callers.

    Args:
        v1: 1-D numeric vector.
        v2: 1-D numeric vector of the same length as ``v1``.

    Returns:
        float: ``dot(v1, v2) / (||v1|| * ||v2||)``, or ``0.0`` when either
        vector has zero norm (e.g. the all-zero feature row of an empty tweet),
        instead of NaN.
    """
    denomer = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denomer == 0:
        return 0.0
    return np.dot(v1, v2) / denomer

In [54]:
#TODO from here
def calculate_distances(features):
    """Compute the pairwise cosine matrix between all feature rows.

    Entry ``[i, j]`` is the cosine of the angle between ``features[i]`` and
    ``features[j]``, i.e. a cosine *similarity*, matching what
    ``cosine_distance`` in this notebook returns. Rows with zero norm (for
    example empty tweets) get ``0`` against every row instead of NaN.

    Args:
        features: 2-D array-like of shape ``(n_samples, n_features)``.

    Returns:
        numpy.ndarray: float array of shape ``(n_samples, n_samples)``.
    """
    size = len(features)
    if size == 0:
        return np.zeros((0, 0))
    matrix = np.asarray(features, dtype=float)
    norms = np.linalg.norm(matrix, axis=1)
    # Divide zero-norm rows by 1 so they stay all-zero rather than becoming NaN.
    safe_norms = np.where(norms == 0, 1.0, norms)
    unit_rows = matrix / safe_norms[:, np.newaxis]
    # One matrix product replaces the n*n Python-level loop of dot products.
    return np.dot(unit_rows, unit_rows.T)

#distances = calculate_distances(feature_matrix)

In [55]:
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics.pairwise import cosine_similarity