In [8]:
import src.data_prep as dp
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

In [9]:
reports = dp.import_data("data/data.json")

In [10]:
df = dp.create_df(reports)

In [57]:
df.index.dtype

dtype('O')

In [4]:
df.head()

Unnamed: 0,YEAR,SEASON,MONTH,STATE,COUNTY,LOCATION DETAILS,NEAREST TOWN,NEAREST ROAD,OBSERVED,ALSO NOTICED,OTHER WITNESSES,OTHER STORIES,TIME AND CONDITIONS,ENVIRONMENT,DATE
60,1994-1997+,Spring,,Washington,Skagit County,"Up the Baker hwy., about 12 miles north from H...",Concrete,Mt.Baker Hwy,"These two gentlmen, brothers, recluses age 50'...",Feces was found and sent to a University in Mo...,,There is a ridge along their property-Doc Butt...,These events occured from May all thru June an...,200 acres of interspersed Old growth Douglas Fir.,intermittant
70,2000,Summer,July,New York,Washington County,1/2 mile into the trail to Buck Mountain,Queensbury,Shelving Rock Road,My wife and I were off the trail to take some ...,,My wife,The local Police chased a Bigfoot into the woo...,"2 pm , partly cloudy with light rain",Next to a creek 300 feet off the trail to Buck...,9
76,1997,Summer,June,Arkansas,Baxter County,THE LOCATION WAS IN THE GOVERNMENT TRACK LAND ...,Mt. Home,101 bridge,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,LOUD SHRILL,THRE TOTAL/FISHING,NO.,THIS INCIDENT DID TAKE PLACE ON THE EARLY MORN...,THE AREA IS A DENSE FOREST WHICH SURROUNDS A V...,25-27approx
77,1983,Fall,September,Washington,Kittitas County,"I haven't been there in years, but an area map...",Yakima,Chinook pass hyway 410,"We had driven into Milk pond, up Chinook pass,...","we thought it was weird the frogs stopped, usa...","my husband, Glenn. we had a tape recorded, bu...",,,"pine forest, you have to drive up to get to th...",?
80,1983,Summer,August,Oregon,Lane County,"It was at Kitson Springs, near the Willamette ...","Bend, Oregon",State Hwy 58,"My veterinarian sister, former girlfriend, and...",,3 persons total. We were tring to get some sl...,On another occaison I had found and photograhe...,"1:00AM dark, warm evening. clear sky.","Forest, mountains and streams",12


### Write all non-null `observed` text data to a text file

In [22]:
observed_df = df.loc[df['OBSERVED'].notnull()]['OBSERVED']

s = ""
for i in observed_df:
    s = s + i

text_file = open("data/observed_text.txt", "w", encoding="utf-8")
text_file.write(s)
text_file.close()

In [45]:
df_o = df.loc[df['OBSERVED'].notnull()]

In [24]:
cv = CountVectorizer()
cv_fit = cv.fit_transform(df_o['OBSERVED'])

In [25]:
vocab_cv = cv.get_feature_names()
vocab_count = cv_fit.toarray().sum(axis=0)

In [26]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df_o['OBSERVED'])
vocab = np.array(sorted(tfidf.vocabulary_))
V = tfidf_matrix

In [27]:
cluster = KMeans(n_clusters=6)
cluster.fit(tfidf_matrix)
centroids = cluster.cluster_centers_

In [28]:
for idx, topic in enumerate(centroids):
    print(vocab[np.argsort(-topic)[:10]])

['tracks' 'prints' 'print' 'snow' 'inches' 'foot' 'track' 'toes' 'trail'
 'footprints']
['house' 'window' 'door' 'outside' 'heard' 'dogs' 'night' 'like' 'dog'
 'went']
['saw' 'creature' 'area' 'seen' 'large' 'like' 'woods' 'tall' 'bigfoot'
 'looked']
['heard' 'sound' 'like' 'tent' 'night' 'loud' 'sounded' 'sounds' 'camp'
 'scream']
['road' 'saw' 'driving' 'car' 'creature' 'tall' 'looked' 'just' 'hair'
 'like']
['just' 'saw' 'like' 'tree' 'woods' 'river' 'looked' 'trail' 'feet' 'got']


In [46]:
labels = np.array(['Tracking', 'House', 'Seen', 'Heard', 'Driving', 'Saw'])

In [47]:
report_label = labels[np.argmin(cluster.transform(tfidf_matrix), axis=1)]

In [48]:
df_o['label_clustering'] = report_label

In [29]:
nmf = NMF(n_components=6)
nmf.fit(V)
W = nmf.transform(V)
H = nmf.components_

In [30]:
for idx, topic in enumerate(H):
    print(vocab[np.argsort(-topic)[:15]])

['tree' 'deer' 'woods' 'just' 'saw' 'like' 'river' 'walking' 'looked'
 'hunting' 'got' 'trail' 'trees' 'area' 'started']
['heard' 'sound' 'loud' 'sounds' 'like' 'sounded' 'scream' 'howl' 'noise'
 'area' 'hear' 'animal' 'minutes' 'woods' 'screams']
['tracks' 'prints' 'snow' 'inches' 'print' 'foot' 'track' 'footprints'
 'trail' 'toes' 'area' 'pictures' 'size' 'long' 'wide']
['house' 'window' 'door' 'outside' 'said' 'dog' 'night' 'dogs' 'went'
 'home' 'ran' 'yard' 'saw' 'told' 'brother']
['road' 'car' 'saw' 'driving' 'creature' 'tall' 'hair' 'looked' 'highway'
 'feet' 'crossed' 'dark' 'arms' 'brown' 'large']
['tent' 'camp' 'lake' 'night' 'camping' 'sleep' 'went' 'morning' 'got'
 'campground' 'campsite' 'time' 'set' 'decided' 'site']


In [49]:
labels = np.array(['Witnessed','Heard','Found Tracks','Saw from House','While Driving','While Camping'])

In [50]:
report_label = labels[np.argmax(W, axis=1)]

In [51]:
df_o['label_NMF'] = report_label

In [52]:
nmf.reconstruction_err_

68.43236938715371

In [53]:
df_o.to_csv('data/nlp_observed_df.csv')

In [54]:
type(df_o)

pandas.core.frame.DataFrame

# Text Generation with Pytorch

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as f

import numpy as np
from collections import Counter
import os
from argparse import Namespace

flags = Namespace(
    train_file='data/observed_text.txt',
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)

In [31]:
def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r', encoding="utf-8") as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text


def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]

In [32]:
class RNNModule(nn.Module):


    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)


    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state

    
    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))

In [33]:
def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer

def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()
    words = ['I', 'am']

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])

    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words).encode('utf-8'))


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    iteration = 0

    for e in range(200):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1
            net.train()

            optimizer.zero_grad()

            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            loss_value = loss.item()

            loss.backward()

            state_h = state_h.detach()
            state_c = state_c.detach()

            _ = torch.nn.utils.clip_grad_norm_(
                net.parameters(), flags.gradients_norm)

            optimizer.step()

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, 200),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                predict(device, net, flags.initial_words, n_vocab,
                        vocab_to_int, int_to_vocab, top_k=5)
                torch.save(net.state_dict(),
                           'checkpoint_pt/model-{}.pth'.format(iteration))


if __name__ == '__main__':
    main()

Vocabulary size 73259


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
from collections import Counter
import os
from argparse import Namespace


flags = Namespace(
    train_file='data/observed_text.txt',
    seq_size=32,
    batch_size=16,
    embedding_size=64,
    lstm_size=64,
    gradients_norm=5,
    initial_words=['I', 'am'],
    predict_top_k=5,
    checkpoint_path='checkpoint',
)


def get_data_from_file(train_file, batch_size, seq_size):
    with open(train_file, 'r', encoding='utf-8') as f:
        text = f.read()
    text = text.split()

    word_counts = Counter(text)
    sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
    int_to_vocab = {k: w for k, w in enumerate(sorted_vocab)}
    vocab_to_int = {w: k for k, w in int_to_vocab.items()}
    n_vocab = len(int_to_vocab)

    print('Vocabulary size', n_vocab)

    int_text = [vocab_to_int[w] for w in text]
    num_batches = int(len(int_text) / (seq_size * batch_size))
    in_text = int_text[:num_batches * batch_size * seq_size]
    out_text = np.zeros_like(in_text)
    out_text[:-1] = in_text[1:]
    out_text[-1] = in_text[0]
    in_text = np.reshape(in_text, (batch_size, -1))
    out_text = np.reshape(out_text, (batch_size, -1))
    return int_to_vocab, vocab_to_int, n_vocab, in_text, out_text


def get_batches(in_text, out_text, batch_size, seq_size):
    num_batches = np.prod(in_text.shape) // (seq_size * batch_size)
    for i in range(0, num_batches * seq_size, seq_size):
        yield in_text[:, i:i+seq_size], out_text[:, i:i+seq_size]


class RNNModule(nn.Module):
    def __init__(self, n_vocab, seq_size, embedding_size, lstm_size):
        super(RNNModule, self).__init__()
        self.seq_size = seq_size
        self.lstm_size = lstm_size
        self.embedding = nn.Embedding(n_vocab, embedding_size)
        self.lstm = nn.LSTM(embedding_size,
                            lstm_size,
                            batch_first=True)
        self.dense = nn.Linear(lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.dense(output)

        return logits, state

    def zero_state(self, batch_size):
        return (torch.zeros(1, batch_size, self.lstm_size),
                torch.zeros(1, batch_size, self.lstm_size))


def get_loss_and_train_op(net, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    return criterion, optimizer


def predict(device, net, words, n_vocab, vocab_to_int, int_to_vocab, top_k=5):
    net.eval()
    words = ['I', 'am']

    state_h, state_c = net.zero_state(1)
    state_h = state_h.to(device)
    state_c = state_c.to(device)
    for w in words:
        ix = torch.tensor([[vocab_to_int[w]]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

    _, top_ix = torch.topk(output[0], k=top_k)
    choices = top_ix.tolist()
    choice = np.random.choice(choices[0])

    words.append(int_to_vocab[choice])

    for _ in range(100):
        ix = torch.tensor([[choice]]).to(device)
        output, (state_h, state_c) = net(ix, (state_h, state_c))

        _, top_ix = torch.topk(output[0], k=top_k)
        choices = top_ix.tolist()
        choice = np.random.choice(choices[0])
        words.append(int_to_vocab[choice])

    print(' '.join(words).encode('utf-8'))


def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    int_to_vocab, vocab_to_int, n_vocab, in_text, out_text = get_data_from_file(
        flags.train_file, flags.batch_size, flags.seq_size)

    net = RNNModule(n_vocab, flags.seq_size,
                    flags.embedding_size, flags.lstm_size)
    net = net.to(device)

    criterion, optimizer = get_loss_and_train_op(net, 0.01)

    iteration = 0

    for e in range(200):
        batches = get_batches(in_text, out_text, flags.batch_size, flags.seq_size)
        state_h, state_c = net.zero_state(flags.batch_size)
        state_h = state_h.to(device)
        state_c = state_c.to(device)
        for x, y in batches:
            iteration += 1
            net.train()

            optimizer.zero_grad()

            x = torch.tensor(x).to(device)
            y = torch.tensor(y).to(device)

            logits, (state_h, state_c) = net(x, (state_h, state_c))
            loss = criterion(logits.transpose(1, 2), y)

            loss_value = loss.item()

            loss.backward()

            state_h = state_h.detach()
            state_c = state_c.detach()

            _ = torch.nn.utils.clip_grad_norm_(
                net.parameters(), flags.gradients_norm)

            optimizer.step()

            if iteration % 100 == 0:
                print('Epoch: {}/{}'.format(e, 200),
                      'Iteration: {}'.format(iteration),
                      'Loss: {}'.format(loss_value))

            if iteration % 1000 == 0:
                predict(device, net, flags.initial_words, n_vocab,
                        vocab_to_int, int_to_vocab, top_k=5)
                torch.save(net.state_dict(),
                           'checkpoint_pt/model-{}.pth'.format(iteration))


if __name__ == '__main__':
    main()

Vocabulary size 73259


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)