In [278]:
import cudf
import nvstrings
from librmm_cffi import librmm
import numpy as np
from collections import OrderedDict
import time
import os

# Nobody likes redefining names and types, when most are identical
def get_dtypes(fn, delim, floats):
    """Derive a column-name -> dtype mapping from the header row of a delimited file.

    Only the first line of ``fn`` is read. Each header field is classified as:

    * ``'date'``    when the field name contains the substring ``'date'``,
    * ``'float64'`` when the field name is listed in ``floats``,
    * ``'int64'``   in every other case.

    The ``'date'`` rule is checked first, so it wins over ``floats``. Callers
    overwrite individual entries afterwards for columns that need another type
    (e.g. ``dtypes['text'] = 'str'``).

    Parameters
    ----------
    fn : path of the file whose header is inspected; undecodable bytes are
        replaced instead of raising.
    delim : separator used to split the header line into field names.
    floats : container of field names that should be typed ``'float64'``.

    Returns
    -------
    OrderedDict of field name -> dtype string, in header order.
    """
    def _classify(column):
        if 'date' in column:
            return 'date'
        return 'float64' if column in floats else 'int64'

    with open(fn, errors='replace') as header_file:
        first_line = header_file.readline().strip()

    return OrderedDict((column, _classify(column)) for column in first_line.split(delim))

def get_df(fn, dtypes):
    """Read a CSV file with ``cudf.io.read_csv`` and print the load throughput.

    The first row of the file is skipped (``skiprows=1``); column names and
    types are taken from ``dtypes`` instead.

    Parameters
    ----------
    fn : path of the CSV file.
    dtypes : ordered mapping of column name -> dtype string (as produced by
        ``get_dtypes``); its keys become ``names`` and its values ``dtype``.

    Returns
    -------
    The object returned by ``cudf.io.read_csv``.

    Side effects
    ------------
    Prints one line with the file size, elapsed seconds, MB/s and row count.
    """
    size = os.path.getsize(fn)
    # perf_counter is monotonic and high-resolution, which is what interval
    # timing needs; wall-clock time can jump or tick too coarsely.
    t0 = time.perf_counter()
    df = cudf.io.read_csv(fn, names=list(dtypes), dtype=list(dtypes.values()), skiprows=1)
    elapsed = time.perf_counter() - t0
    rows = len(df)
    # Guard against a zero interval so reporting can never raise ZeroDivisionError.
    mbytes_per_sec = size / elapsed / 1000 / 1000 if elapsed > 0 else float('inf')
    print(str(size) + ' bytes in ' + str(elapsed) + 's: ' + str(mbytes_per_sec) + ' mbytes/sec, ' + str(rows) + ' rows')
    return df


def get_df_and_strings(fn, dtypes):
    """Read a CSV with ``cudf.io.read_csv_strings`` and separate string columns.

    The first row of the file is skipped (``skiprows=1``); names and types come
    from ``dtypes``. Every returned column that is a ``cudf.series.Series`` is
    stored in a new DataFrame under its name; every other column is appended,
    unnamed, to a list in file order.

    Parameters
    ----------
    fn : path of the CSV file.
    dtypes : ordered mapping of column name -> dtype string; string columns are
        the ones the caller set to ``'str'``.

    Returns
    -------
    tuple ``(df, strs)``: the DataFrame of Series columns and the list of the
    remaining (string) columns.

    Side effects
    ------------
    Prints one line with the file size, elapsed seconds, MB/s and row count.
    """
    size = os.path.getsize(fn)
    # Materialize the names once instead of rebuilding the key list per column.
    names = list(dtypes)
    # perf_counter is monotonic and high-resolution, unlike wall-clock time.
    t0 = time.perf_counter()
    columns = cudf.io.read_csv_strings(fn, names=names, dtype=list(dtypes.values()), skiprows=1)
    df = cudf.dataframe.DataFrame()
    strs = []
    for idx, col in enumerate(columns):
        # names[idx] raises IndexError if more columns than names come back,
        # so a schema mismatch is not silently truncated.
        name = names[idx]
        if isinstance(col, cudf.series.Series):
            df[name] = col
        else:
            strs.append(col)
    elapsed = time.perf_counter() - t0
    rows = len(df)
    # Guard against a zero interval so reporting can never raise ZeroDivisionError.
    mbytes_per_sec = size / elapsed / 1000 / 1000 if elapsed > 0 else float('inf')
    print(str(size) + ' bytes in ' + str(elapsed) + 's: ' + str(mbytes_per_sec) + ' mbytes/sec, ' + str(rows) + ' rows')
    return (df, strs)

def hash_to_series(string):
    """Hash an nvstrings-style object into an int32 ``cudf.Series``.

    An int32 array with one slot per string (``string.size()``) is allocated
    through ``librmm.device_array``; its raw pointer is handed to
    ``string.hash`` and the array is then wrapped in a ``cudf.Series``.

    Parameters
    ----------
    string : object exposing ``size()`` and ``hash(pointer)``.

    Returns
    -------
    ``cudf.Series`` built from the int32 array.

    Notes
    -----
    Presumably ``hash`` writes one 32-bit hash per string into the buffer.
    Because the buffer is signed int32, hashes above 2**31 - 1 show up as
    negative numbers (compare 3649403303 vs -645563993 in the notebook
    outputs).
    """
    hashes = librmm.device_array(string.size(), dtype=np.int32)
    device_ptr = hashes.device_ctypes_pointer.value
    string.hash(device_ptr)
    return cudf.Series(hashes)

In [305]:
# Build a tiny CSV fixture (one integer column, one free-text column) so the
# string pipeline can be exercised without the large datasets.
test_fn = 'string-test.csv'

lines = ['number,text', '0,great and good', '1,weakest bad', '2,successfully superior', '3,winning']
# must end with '\n'
with open(test_fn, 'w') as fp:
    fp.write('\n'.join(lines)+'\n')
    
# get_dtypes types every non-date, non-float column as int64, so the text
# column has to be overridden to 'str' by hand.
dtypes = get_dtypes(test_fn, ',', floats=[])
dtypes['text'] = 'str'
cols = get_df_and_strings(test_fn, dtypes)

# cols is (DataFrame of the non-string columns, list of string columns);
# 'text' is the only string column here, so it is element 0 of that list.
test_df = cols[0]
words = cols[1][0]

77 bytes in 0.003477811813354492s: 0.022140358401316243 mbytes/sec, 4 rows


In [281]:
# Add two float columns with one value per row: a zeroed accumulator ('score')
# and a constant column of ones used as the increment in the next cell.
test_df['score'] = np.zeros(len(test_df))
test_df['ones'] = np.ones(len(test_df))

In [282]:
# Sanity check that column arithmetic accumulates: add 'ones' to 'score' three
# times, printing the frame after each step.
# NOTE(review): not idempotent - every re-run adds another 3 to 'score' unless
# the previous cell is re-run first.
for x in range(0, 3):
    test_df['score'] = test_df['score'] + test_df['ones']
    print(test_df)

  number score ones
0      0   1.0  1.0
1      1   1.0  1.0
2      2   1.0  1.0
3      3   1.0  1.0
  number score ones
0      0   2.0  1.0
1      1   2.0  1.0
2      2   2.0  1.0
3      3   2.0  1.0
  number score ones
0      0   3.0  1.0
1      1   3.0  1.0
2      2   3.0  1.0
3      3   3.0  1.0


In [306]:
# Sentiment word list; source: https://sraf.nd.edu/textual-analysis/resources/
dict_fn = '/raid/stack_overflow/dict.csv'

# The three proportion/deviation columns are floats; 'Word' and 'Source' are
# strings; everything else falls back to int64 in get_dtypes.
dtypes = get_dtypes(dict_fn, ',', ['Word Proportion', 'Average Proportion', 'Std Dev'])
for col in ['Word', 'Source']:
    dtypes[col] = 'str'

res = get_df_and_strings(dict_fn, dtypes)
tmp_df = res[0]
# First string column returned; presumably 'Word', since it precedes 'Source'
# in the header shown by the pandas cross-check.
sentiment_words = res[1][0]

# Lookup table keyed by the hash of the lower-cased word.
# NOTE(review): in the pandas output for "WINNING" the Positive column holds
# 2009 rather than 1, so 'pos'/'neg' look like years, not 0/1 flags - scores
# built from them come out as +/-2009. Confirm that this is intended.
sentiment_df = cudf.dataframe.DataFrame()
sentiment_df['word_hash'] = hash_to_series(sentiment_words.lower())
sentiment_df['pos'] = tmp_df['Positive']
sentiment_df['neg'] = tmp_df['Negative']

9169185 bytes in 0.027340412139892578s: 335.3711331447408 mbytes/sec, 86551 rows


In [217]:
# pandas is used here only to inspect the same dictionary file on the CPU.
# NOTE(review): this import sits mid-notebook rather than in the first import cell.
import pandas as pd
df = pd.read_csv(dict_fn)

In [256]:
# Look up one known word to see which sentiment columns it populates and with what values.
df.query('Word == "WINNING"')

Unnamed: 0,Word,Sequence Number,Word Count,Word Proportion,Average Proportion,Std Dev,Doc Count,Negative,Positive,Uncertainty,Litigious,Constraining,Superfluous,Interesting,Modal,Irr_Verb,Harvard_IV,Syllables,Source
84102,WINNING,84103,45485,3e-06,2e-06,2.7e-05,23755,0,2009,0,0,0,0,0,0,0,0,2,12of12inf


In [297]:
# Hash the first space-separated token of each lower-cased test sentence.
# Called without a pointer argument, hash() returns the values directly.
print(words.lower().split_column(' ')[0].hash())

[98619021, 1223169806, 3649403303, 1349785232]


In [307]:
# Single-column version of the sentiment join, handling only the first token.

# Reset the accumulator.
test_df['score'] = np.zeros(len(test_df))

# Keep the hash column aside and remove it from the lookup frame, so it can be
# re-attached below under the same name as the key column in test_df.
# NOTE(review): not idempotent - once 'word_hash' has been dropped, re-running
# this cell will presumably fail on sentiment_df['word_hash'].
word_hash = sentiment_df['word_hash']
sentiment_df.drop_column('word_hash')

idx = 0
col = words.split_column(' ')[0]

# The join key gets the same name ('hash_<idx>') on both sides.
col_name = 'hash_' + str(idx)
test_df[col_name] = hash_to_series(col.lower())
sentiment_df[col_name] = word_hash
# Left join keeps every test row, including tokens with no dictionary entry.
test_df = test_df.merge(sentiment_df, how='left', on=[col_name], lsuffix='', rsuffix='_r')

In [308]:
# Show the merged frame: 'pos'/'neg' have been pulled in from the lookup table.
print(test_df)

      hash_0 number score  pos  neg
0   98619021      0   0.0 2009    0
1 1223169806      1   0.0    0 2009
2 -645563993      2   0.0 2009    0
3 1349785232      3   0.0 2009    0


In [309]:
# Fold the joined sentiment values into the running score, then rename
# 'pos'/'neg' to per-token names (copy + drop) so the next join can add fresh
# 'pos'/'neg' columns without clashing.
# NOTE(review): not idempotent - 'pos'/'neg' no longer exist after one run.
test_df['score'] = test_df['score'] + test_df['pos'] - test_df['neg']
test_df['pos_'+str(idx)] = test_df['pos']
test_df.drop_column('pos')
test_df['neg_'+str(idx)] = test_df['neg']
test_df.drop_column('neg')
    
# Disabled: the merged frame has no 'hash_<idx>_r' column. The NameError in
# this cell's recorded output presumably dates from when this line was active.
#test_df.drop_column('hash_'+str(idx)+'_r')
# Remove the temporary join key from the lookup frame again.
sentiment_df.drop_column(col_name)

NameError: column 'hash_0_r' does not exist

In [310]:
# Show the result: 'score' now carries +/- the dictionary value of the first token.
print(test_df)

      hash_0 number   score pos_0 neg_0
0   98619021      0  2009.0  2009     0
1 1223169806      1 -2009.0     0  2009
2 -645563993      2  2009.0  2009     0
3 1349785232      3  2009.0  2009     0


In [286]:
# NOTE(review): `columns` is only assigned in cells that appear further down
# in the notebook, so this cell fails on a fresh top-to-bottom run.
print(columns[0].hash())

[98619021, 1223169806, 3649403303, 1349785232]


In [277]:
# Try to look up a single hash value in the lookup table.
# NOTE(review): the recorded output of this cell is an AssertionError; the
# cause is not visible here (possibly 'word_hash' had already been dropped).
sentiment_df.query("word_hash == 98619021")

AssertionError: 

In [223]:
# Full version of the sentiment join: one left join per token position.
# NOTE(review): not idempotent and order-dependent - it needs sentiment_df to
# still contain 'word_hash', which the single-column cell above drops.
word_hash = sentiment_df['word_hash']
sentiment_df.drop_column('word_hash')

# Reset the accumulator.
test_df['score'] = np.zeros(len(test_df))

# One nvstrings column per space-separated token position.
columns = words.split_column(' ')

for idx, col in enumerate(columns):
    col_name = 'hash_' + str(idx)
    test_df[col_name] = hash_to_series(col.lower())
    # cuDF joins operate on columns of the same name
    sentiment_df[col_name] = word_hash
    test_df = test_df.merge(sentiment_df, how='left', on=[col_name], lsuffix='', rsuffix='_r')
    
    # Accumulate this token's contribution, then rename 'pos'/'neg' to
    # per-token names (copy + drop) so the next join can add them again.
    test_df['score'] = test_df['score'] + test_df['pos'] - test_df['neg']
    test_df['pos_'+str(idx)] = test_df['pos']
    test_df.drop_column('pos')
    test_df['neg_'+str(idx)] = test_df['neg']
    test_df.drop_column('neg')
    
    # Remove the temporary join key so the next iteration can add its own.
    sentiment_df.drop_column(col_name)

In [228]:
# NOTE(review): the recorded output shows columns ('sentiment_score',
# 'hash_0l') that the current code never creates, so it appears to be stale
# output from an earlier version of the loop.
print(test_df)

  number sentiment_score    hash_0l pos_0 neg_0    hash_1l pos_1 ... neg_2
0      0             0.0   98619021     0     0      96727     0 ...     0
1      1             0.0 -601795244     0     0      97285     0 ...     0
2      2             0.0 1909191136     0     0 -602775453     0 ...     0
3      3             0.0 1640548322     0     0          0     0 ...     0
[3 more columns]


In [2]:
# Beer dataset; source: https://www.kaggle.com/ehallmar/beers-breweries-and-beer-reviews#reviews.csv
beers_fn = '/raid/beer/beers.csv'
# 'abv' is the only float column; the listed text columns are read as
# strings; everything else falls back to int64 (or 'date') in get_dtypes.
dtypes = get_dtypes(beers_fn, ',',['abv'])
for col in ['name', 'state', 'country', 'style', 'availability', 'notes']:
    dtypes[col] = 'str'
# res is (DataFrame of non-string columns, list of string columns).
res = get_df_and_strings(beers_fn, dtypes)
beers_df = res[0]

43399004 bytes in 0.5978922843933105s: 72.58666006041132 mbytes/sec, 358873 rows


In [315]:
# Split the first string column of the most recent load into one column per
# space-separated token.
# NOTE(review): `res` is overwritten by every load cell, so which dataset this
# uses depends on execution order (presumably the beers 'name' column).
columns = res[1][0].split_column(' ')

def create_value_counts(split_nvstring_object):
    """Count how often each string hash occurs across all given columns.

    Each element of ``split_nvstring_object`` is hashed via its ``hash()``
    method and wrapped in a ``cudf.Series``; the series are concatenated with
    ``cudf.multi.concat`` and ``value_counts()`` is taken on the result.

    Parameters
    ----------
    split_nvstring_object : iterable of objects exposing ``hash()`` (e.g. the
        columns produced by ``split_column``).

    Returns
    -------
    Whatever ``value_counts()`` yields on the concatenated series.
    """
    hashed_columns = [cudf.Series(part.hash()) for part in split_nvstring_object]
    combined = cudf.multi.concat(hashed_columns)
    return combined.value_counts()

# Bare expression so the notebook displays the resulting Series summary.
create_value_counts(columns)

<cudf.Series nrows=116072 >

In [9]:
breweries_fn = '/raid/beer/breweries.csv'
# Build the schema from this file's own header, so column names and types are
# not inherited from whichever file happened to be loaded before this cell.
# No float columns are assumed here - TODO confirm against the file.
dtypes = get_dtypes(breweries_fn, ',', floats=[])
# Text columns are read as strings.
for col in ['name', 'city', 'state', 'country', 'notes', 'types']:
    dtypes[col] = 'str'
# res is (DataFrame of non-string columns, list of string columns).
res = get_df_and_strings(breweries_fn, dtypes)
breweries_df = res[0]

4184324 bytes in 0.027175188064575195s: 153.9758985312991 mbytes/sec, 50347 rows


In [2]:
beer_reviews_fn = '/raid/beer/reviews.csv'
# Rating columns parsed as float64; each name is listed exactly once.
floats = ['look', 'smell', 'feel', 'taste', 'overall', 'score']
dtypes = get_dtypes(beer_reviews_fn, ',', floats)
# Free-text columns are read as strings.
for col in ['username', 'text']:
    dtypes[col] = 'str'
# res is (DataFrame of non-string columns, list of string columns).
res = get_df_and_strings(beer_reviews_fn, dtypes)
beer_reviews_df = res[0]

2318668689 bytes in 3.4482719898223877s: 672.4146748990729 mbytes/sec, 9073128 rows


In [4]:
# Peek at the first two entries of the second string column (presumably
# 'text', if 'username' precedes it in the header - TODO confirm).
print(res[1][1].sublist([0, 1]))

['"\xa0\xa0 750 ml bottle', '\xa0\xa0']


In [56]:
# NOTE(review): in the recorded output, rows 0 and 3 hold implausible rating
# values (e.g. 7.4e+92) while the other rows look sane. Those rows were
# presumably mis-parsed - possibly because of quoted or multi-line review
# text - so verify the load before relying on these columns.
print(beer_reviews_df.head())

  beer_id                    date                  look                 smell                 taste                  feel                overall                 score
0  271781 2017-03-17T00:00:00.000        201516393503.0 5.705451273030825e+18 7.411958951106261e+92 7.411972001369666e+39               707503.0          5728795633.0
1  125646 2017-12-21T00:00:00.000                   4.5                   4.5                   4.5                   4.5                    4.5                   4.5
2  125646 2017-12-21T00:00:00.000                  4.75                  4.75                  4.75                  4.75                   4.75                  4.75
3  125646 2017-12-20T00:00:00.000 5.669973100634188e+71 7.503567215803741e+18              742269.0 5.560472431977581e+16 6.6966909571065816e+16 6.215391986342381e+17
4  125646 2017-12-20T00:00:00.000                  4.25                   4.5                  4.25                  4.25                   4.25                  4.3

In [None]:
# Tokenize the second string column of the most recent load (lower-cased,
# split on spaces).
# NOTE(review): this rebinds `words`, which earlier cells use for the test
# sentences, and the cell has no execution count (never run).
words = res[1][1].lower().split_column(' ')

In [3]:
# Ubuntu dialogue corpus; source: https://www.kaggle.com/rtatman/ubuntu-dialogue-corpus
dialogue_fn = '/raid/ubuntu/Ubuntu-dialogue-corpus/dialogueText_301.csv'

# No float columns; anything not overridden below is int64 or 'date'.
dtypes = get_dtypes(dialogue_fn, ',', floats=[])
#src user, dst user, message
for col in ['from', 'to', 'text']: dtypes[col] = 'str'

# res is (DataFrame of non-string columns, list of string columns).
res = get_df_and_strings(dialogue_fn, dtypes)

dialogue_df = res[0]
# Third string column; presumably 'text', if the header lists from, to, text
# in that order - TODO confirm.
text_strings = res[1][2]

1799936480 bytes in 2.383992910385132s: 755.009158021876 mbytes/sec, 16587830 rows
