In [190]:
import os
import string

import gensim
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [191]:
# Directories scanned for harvested ``.csv`` files (passed to load_csv_files).
REDDIT_DATA_PATH = '../data_harvester/resources/data/reddit'
TWITTER_DATA_PATH = '../data_harvester/resources/data/twitter'

# Common column names given to both sources before they are concatenated.
SHARED_COLUMNS = ['id', 'created', 'text', 'where']

# Binary word2vec file loaded via gensim's KeyedVectors.load_word2vec_format.
WORD2VEC_PATH = '../data_harvester/resources/models/GoogleNews-vectors-negative300.bin.gz'

In [192]:
def load_csv_files(files_dir, sep='|'):
    """Load every ``.csv`` file in a directory into one de-duplicated DataFrame.

    Parameters
    ----------
    files_dir : str
        Directory to scan (not recursive). Only names ending in ``.csv``
        are read.
    sep : str, default '|'
        Field separator passed to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in file-name order with exact duplicate
        rows removed (first occurrence kept). The index is NOT reset after
        de-duplication, so it may contain gaps.

    Raises
    ------
    ValueError
        If ``files_dir`` contains no ``.csv`` files.
    """
    # os.listdir returns names in arbitrary order; sorting makes the row
    # order (and therefore which duplicate is kept) reproducible across runs.
    csv_names = sorted(name for name in os.listdir(files_dir) if name.endswith('.csv'))
    if not csv_names:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f'No .csv files found in {files_dir!r}')

    frames = [pd.read_csv(os.path.join(files_dir, name), sep=sep) for name in csv_names]
    combined = pd.concat(frames, ignore_index=True)

    return combined.drop_duplicates()

def combine_twitter_and_reddit_data(twitter_data, reddit_data):
    """Stack tweets and reddit posts into one frame with a shared schema.

    Four columns are taken from each source and relabelled, in order, to
    ``SHARED_COLUMNS``:

    * twitter: ``id_str``, ``created_at``, ``full_text``, ``user.screen_name``
    * reddit:  ``id``, ``created``, ``title``, ``subreddit``

    Parameters
    ----------
    twitter_data, reddit_data : pandas.DataFrame
        Source frames containing at least the columns listed above.
        Neither input is modified.

    Returns
    -------
    pandas.DataFrame
        Twitter rows followed by reddit rows on a fresh 0..n-1 index. The
        ``created`` column is replaced by ``date`` and ``time`` columns
        derived from it after conversion to UTC.
    """
    twitter_columns = ['id_str', 'created_at', 'full_text', 'user.screen_name']
    reddit_columns = ['id', 'created', 'title', 'subreddit']

    # rename() returns new frames, so labels are never assigned onto a slice
    # of the caller's data.
    twitter_part = twitter_data[twitter_columns].rename(
        columns=dict(zip(twitter_columns, SHARED_COLUMNS))
    )
    reddit_part = reddit_data[reddit_columns].rename(
        columns=dict(zip(reddit_columns, SHARED_COLUMNS))
    )

    combined = pd.concat((twitter_part, reddit_part), ignore_index=True)

    # Parse the timestamps once and derive both columns from the result.
    # NOTE(review): both sources are parsed by a single to_datetime call; if
    # they store timestamps differently (e.g. strings vs. epoch numbers) the
    # numeric ones may be misread - confirm against the harvested CSVs.
    created_utc = pd.to_datetime(combined['created'], utc=True)
    combined['date'] = created_utc.dt.date
    combined['time'] = created_utc.dt.time

    return combined.drop('created', axis=1)
    

In [193]:
# Read the harvested CSV dumps for each source (default '|' separator).
reddit_data = load_csv_files(files_dir=REDDIT_DATA_PATH)
twitter_data = load_csv_files(files_dir=TWITTER_DATA_PATH)

In [172]:
# Merge both sources into the shared id / text / where / date / time schema.
data = combine_twitter_and_reddit_data(
    twitter_data=twitter_data,
    reddit_data=reddit_data,
)

In [121]:
# Load the pretrained vectors stored in binary word2vec format at WORD2VEC_PATH.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [173]:
# Translation table that deletes every character in string.punctuation.
# (The misspelled name is kept as-is because other cells may reference it.)
punctuation_traslator = str.maketrans('', '', string.punctuation)
# English stop words from the NLTK corpus; a set gives O(1) membership tests.
stop_words = set(stopwords.words('english'))


def text2vec(text, vec_len=300):
    """Embed a text as the mean word2vec vector of its content words.

    The text is lower-cased, stripped of punctuation, tokenized with NLTK and
    filtered against ``stop_words``. Every remaining token that exists in
    ``word2vec_model`` contributes its vector; tokens missing from the model
    are skipped.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN from an empty CSV field) are
        treated as empty text.
    vec_len : int, default 300
        Length of the returned vector; it has to match the dimensionality of
        ``word2vec_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vec_len`` holding the arithmetic mean of the
        vectors found, or all zeros when no token was found in the model.
    """
    if not isinstance(text, str):
        # Missing text has no tokens, so it maps to the zero vector.
        return np.zeros(vec_len)

    cleaned = text.lower().translate(punctuation_traslator)
    tokens = [token for token in nltk.word_tokenize(cleaned) if token not in stop_words]

    vector_sum = np.zeros(vec_len)
    found = 0
    for token in tokens:
        try:
            # Index the KeyedVectors object directly: its `.wv` attribute is
            # deprecated in gensim 3.x and no longer available in gensim 4.
            vector_sum += word2vec_model[token]
        except KeyError:
            # Token is not in the model's vocabulary.
            continue
        found += 1

    # Divide by the number of vectors actually summed (the previous counter
    # started at 1, shrinking every mean by n / (n + 1)); max() avoids 0 / 0.
    return vector_sum / max(found, 1)


In [174]:
def generate_daily_text_features(data, vec_len=300):
    """Average the text embeddings of all rows that share a date.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``text`` and a ``date`` column. Side effect: a
        ``textvec`` column holding each row's embedding is added to it.
    vec_len : int, default 300
        Embedding length; forwarded to ``text2vec`` and used to name the
        feature columns ``textvec_0`` .. ``textvec_{vec_len - 1}``.

    Returns
    -------
    pandas.DataFrame
        One row per date (as index) with the mean of every ``textvec_i``
        column.
    """
    # TODO: Optimize this, its slow!
    data['textvec'] = data['text'].apply(lambda text: text2vec(text, vec_len))
    text_features = pd.DataFrame(
        data['textvec'].to_list(),
        # Reuse the caller's index so the date assignment below aligns row by
        # row even when `data` does not carry a default 0..n-1 index.
        index=data.index,
        columns=[f'textvec_{i}' for i in range(vec_len)],
    )
    text_features['date'] = data['date']
    return text_features.groupby('date').mean()

  from ipykernel import kernelapp as app


In [None]:
# Sanity check: cosine similarity between two sentences that differ in sentiment.
text1 = text2vec('This is going bad')
text2 = text2vec('This is going great')
text1.dot(text2) / (np.linalg.norm(text1) * np.linalg.norm(text2))