In [1]:
import cudf
import nvstrings
from librmm_cffi import librmm
import numpy as np

# Small demo corpus; mixed case on purpose so the lowercasing step is visible
lines = [
 'this IS a',
 'word COUNT TEST',
 'with a few word repeats',
 'and some junk filler words'
]

# Build an nvstrings instance from the Python list
test_strings = nvstrings.to_device(lines)

# lowercase all words
tmp = test_strings.lower()

# remove stop words
# Each pattern is anchored with \b (word boundary) so only stand-alone words
# are removed; a bare substring match would also strip e.g. 'some' out of
# 'awesome'.
# NOTE(review): this relies on replace() interpreting the pattern as a regex
# and on \b being supported by the installed nvstrings version -- confirm.
for word in ['some', 'junk', 'filler']:
    tmp = tmp.replace(r'\b' + word + r'\b', '')

# Split strings into word columns
cols = tmp.split_column(' ')
# Note the 'None' values for lines with fewer words
for col in cols: print(col)

['this', 'aord', 'with', 'and']
['is', 'count', 'a', None]
['\x0f', 'test', 'few', None]
[None, None, 'word', None]
[None, None, 'repeats', 'words']


In [2]:
# Build a cuDF DataFrame with one column of uint32 word hashes per word position
test_df = cudf.dataframe.DataFrame()

for position, word_col in enumerate(cols):
    # Device buffer sized to hold one hash per string in this column
    hash_buffer = librmm.device_array(word_col.size(), dtype=np.uint32)
    # hash() is handed the buffer's raw device pointer as its output location
    word_col.hash(hash_buffer.device_ctypes_pointer.value)
    test_df['words_{}'.format(position)] = cudf.Series(hash_buffer)

# The '0' entries line up with the 'None' words printed above
# First row corresponds to the original string "this is a"
print(test_df.head(1))

  words_0 words_1 words_2 words_3 words_4
0 3559070    3370      15       0       0


In [3]:
# Stack every per-position hash column into one long 'word_hash' column
test_words_df = cudf.dataframe.DataFrame()

hash_columns = [test_df[name] for name in list(test_df.columns)]
test_words_df['word_hash'] = cudf.multi.concat(hash_columns)

# Re-store the column as int32 (the hashes were produced as uint32)
hashes_as_int32 = test_words_df['word_hash'].astype('int32')
test_words_df['word_hash'] = hashes_as_int32
print(test_words_df)

   word_hash
 0   3559070
 1   3000032
 2   3649734
 3     96727
 0      3370
 1  94851343
 2        97
 3         0
 0        15
 1   3556498
[10 more rows]


In [4]:
# Remember, this df still contains one entry per 'None':
# 4 input lines x 5 word columns = 20 rows, including the hash-0 placeholders
len(test_words_df)

20

In [5]:
# Drop the placeholder rows (hash 0 marks a missing word) ...
filtered_words_df = test_words_df.query('word_hash != 0')
# ... then count how often each remaining hash occurs, on the GPU
word_counts = filtered_words_df['word_hash'].value_counts()
print(word_counts)

       
15    1
97    1
3370    1
96727    1
101272    1
3000032    1
3556498    1
3559070    1
3649734    1
3655434    1
[3 more rows]


In [None]:
# Now lets do the same thing, but with a LOT of words from 9 million tweets
# Obtain from https://about.twitter.com/en_us/values/elections-integrity.html#data

# nvstrings has a temporary convenience function for grabbing a single column from a CSV file
# in the data linked above, column 12 is the text of the actual tweets
%time tweets = nvstrings.from_csv('/raid/tweets/ira_tweets_csv_hashed.csv', 12)

In [11]:
# Stop words from https://gist.github.com/sebleier/554280
import urllib.request
url = "https://gist.githubusercontent.com/sebleier/554280/raw/7e0e4a1ce04c2bb7bd41089c9821dbcf6d0c786c/NLTK's%2520list%2520of%2520english%2520stopwords"

# Use the response as a context manager so the HTTP connection is closed
# even if reading or decoding fails.
with urllib.request.urlopen(url) as response:
    # The file is whitespace-separated; split() yields one stop word per item
    stop_words = response.read().decode('utf-8').split()

In [None]:
# Normalise case so 'The' and 'the' hash to the same value
tweets = tweets.lower()

# Remove stop words as whole words only. NLTK's English stop-word list includes
# single-letter entries such as 'a' and 'i'; replacing those as bare substrings
# would delete that letter from inside every other word. Anchoring each pattern
# with \b (word boundary) restricts the match to stand-alone words.
# NOTE(review): this relies on replace() interpreting the pattern as a regex
# and on \b being supported by the installed nvstrings version -- confirm.
for word in stop_words:
    tweets = tweets.replace(r'\b' + word + r'\b', '')
cols = tweets.split_column(' ')

df = cudf.dataframe.DataFrame()

# Hash each word column into a uint32 device array and store it as a DataFrame
# column, releasing the string column once its hashes have been computed.
for idx, col in enumerate(cols):
    word_array = librmm.device_array(col.size(), dtype=np.uint32)
    col.hash(word_array.device_ctypes_pointer.value)
    df['words_'+str(idx)] = cudf.Series(word_array)
    col.free()

# NOTE(review): `cols` is iterated like a list above; if split_column() returns
# a plain Python list, this call raises AttributeError -- confirm and drop it
# if so (each column is already freed inside the loop).
cols.free()
tweets.free()

# Remember, split_column gives 1 column per word, so the column count is one
# more than the largest number of spaces found in any single tweet
print(df)

In [8]:
# Create the frame in this cell: nothing earlier in the notebook defines
# `df_words`, so on a fresh kernel the assignment below would raise NameError.
df_words = cudf.dataframe.DataFrame()
# Stack every per-position hash column of `df` into one long 'word_hash' column
# NOTE(review): the recorded run of this statement failed with
# CUDA_ERROR_INVALID_VALUE from cuMemcpyDtoD; possibly the concatenated column
# is too large for a single copy -- investigate before relying on the cells below.
df_words['word_hash'] = cudf.multi.concat([df[col] for col in list(df.columns)])

CudaAPIError: [1] Call to cuMemcpyDtoD results in CUDA_ERROR_INVALID_VALUE

In [None]:
# Total number of rows in the stacked hash column; this is the unfiltered
# frame, so any hash-0 placeholder rows are still counted here
len(df_words)

In [None]:
# Filter null words: keep only rows whose hash is non-zero, and time how long
# the query plus the row count takes
%time len(df_words.query('word_hash != 0'))

In [None]:
# Number of distinct word hashes. Computed on the unfiltered frame, so the
# hash-0 placeholder counts as one distinct value if it is present
df_words['word_hash'].unique_count()