In [17]:
import src.data_prep as dp
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF

In [2]:
# Load the raw reports from the project-relative JSON file through the local
# data_prep helper.
# NOTE(review): the structure of `reports` is whatever dp.import_data returns;
# that helper is not visible from this notebook — confirm before relying on it.
reports = dp.import_data("data/data.json")

In [3]:
# Build the working table from the raw reports. Later cells treat `df` as a
# pandas DataFrame (df.head(), df.loc[...], column access by name).
df = dp.create_df(reports)

In [57]:
# Inspect the index dtype: it is object-typed even though the index values shown
# by df.head() look like integers.
# NOTE(review): presumably the report ids are stored as strings — confirm, and
# cast where the frame is built if numeric ids are intended.
df.index.dtype

dtype('O')

In [4]:
# Preview the first five reports to see which columns are available.
df.head(5)

Unnamed: 0,YEAR,SEASON,MONTH,STATE,COUNTY,LOCATION DETAILS,NEAREST TOWN,NEAREST ROAD,OBSERVED,ALSO NOTICED,OTHER WITNESSES,OTHER STORIES,TIME AND CONDITIONS,ENVIRONMENT,DATE
60,1994-1997+,Spring,,Washington,Skagit County,"Up the Baker hwy., about 12 miles north from H...",Concrete,Mt.Baker Hwy,"These two gentlmen, brothers, recluses age 50'...",Feces was found and sent to a University in Mo...,,There is a ridge along their property-Doc Butt...,These events occured from May all thru June an...,200 acres of interspersed Old growth Douglas Fir.,intermittant
70,2000,Summer,July,New York,Washington County,1/2 mile into the trail to Buck Mountain,Queensbury,Shelving Rock Road,My wife and I were off the trail to take some ...,,My wife,The local Police chased a Bigfoot into the woo...,"2 pm , partly cloudy with light rain",Next to a creek 300 feet off the trail to Buck...,9
76,1997,Summer,June,Arkansas,Baxter County,THE LOCATION WAS IN THE GOVERNMENT TRACK LAND ...,Mt. Home,101 bridge,I DID NOT SEE ANYTHING. I DID HEAR A LOT. WH...,LOUD SHRILL,THRE TOTAL/FISHING,NO.,THIS INCIDENT DID TAKE PLACE ON THE EARLY MORN...,THE AREA IS A DENSE FOREST WHICH SURROUNDS A V...,25-27approx
77,1983,Fall,September,Washington,Kittitas County,"I haven't been there in years, but an area map...",Yakima,Chinook pass hyway 410,"We had driven into Milk pond, up Chinook pass,...","we thought it was weird the frogs stopped, usa...","my husband, Glenn. we had a tape recorded, bu...",,,"pine forest, you have to drive up to get to th...",?
80,1983,Summer,August,Oregon,Lane County,"It was at Kitson Springs, near the Willamette ...","Bend, Oregon",State Hwy 58,"My veterinarian sister, former girlfriend, and...",,3 persons total. We were tring to get some sl...,On another occaison I had found and photograhe...,"1:00AM dark, warm evening. clear sky.","Forest, mountains and streams",12


In [23]:
# Preview the OBSERVED narratives that are actually present (missing entries
# dropped); this is the free-text field vectorized in the cells below.
df['OBSERVED'].dropna()

60       These two gentlmen, brothers, recluses age 50'...
70       My wife and I were off the trail to take some ...
76       I DID NOT SEE ANYTHING.  I DID HEAR A LOT.  WH...
77       We had driven into Milk pond, up Chinook pass,...
80       My veterinarian sister, former girlfriend, and...
                               ...                        
65943    Hello. My family and I took a trip to the Sequ...
65978    The town of strawberry is about 10 miles from ...
65984    While bow hunting in a field in Kansas, I obse...
65996    Hiking the Seven Sisters Trail from Mt Holyoke...
66003    Heard: at approx 2:15am my labrador retriever ...
Name: OBSERVED, Length: 5089, dtype: object

In [45]:
# Keep only the reports that have an OBSERVED narrative.
# The explicit .copy() makes df_o an independent frame: label columns are added
# to it further down, and writing into a filtered slice of `df` would raise
# SettingWithCopyWarning (and may write into a temporary object).
df_o = df.loc[df['OBSERVED'].notnull()].copy()

In [24]:
# Bag-of-words term counts over the OBSERVED narratives, using the vectorizer's
# default settings (no stop-word removal at this stage).
cv = CountVectorizer()
cv_fit = cv.fit_transform(df_o.loc[:, 'OBSERVED'])

In [25]:
# Vocabulary terms, in the same order as the columns of cv_fit.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases. The result
# is kept as a plain list, matching what get_feature_names() used to return.
if hasattr(cv, 'get_feature_names_out'):
    vocab_cv = list(cv.get_feature_names_out())
else:
    vocab_cv = cv.get_feature_names()

# Corpus-wide occurrence count per term. Summing the sparse matrix directly
# avoids materialising a dense documents-by-terms array; the 1 x n_terms result
# is flattened to a 1-D array.
vocab_count = np.asarray(cv_fit.sum(axis=0)).ravel()

In [26]:
# TF-IDF representation of the OBSERVED narratives with English stop words
# removed. `V` is a second name for the same matrix, used by the NMF cells.
tfidf = TfidfVectorizer(stop_words='english')
V = tfidf_matrix = tfidf.fit_transform(df_o['OBSERVED'])

# Alphabetically sorted terms, used later to look terms up by column position.
# NOTE(review): this assumes the matrix columns follow sorted term order —
# confirm for the installed scikit-learn version.
vocab = np.array(sorted(tfidf.vocabulary_.keys()))

In [27]:
# Cluster the TF-IDF vectors into six groups.
# random_state is pinned because the cluster names are hand-assigned by cluster
# index further down: without a seed, a re-run can reorder the clusters and
# silently attach the wrong name to each report.
# n_init=10 is spelled out because the library default changed in
# scikit-learn 1.4; this keeps the classic ten restarts on every version.
# NOTE: after changing the seed or the data, re-check the printed top terms
# against the hand-written label list.
cluster = KMeans(n_clusters=6, n_init=10, random_state=0)
cluster.fit(tfidf_matrix)
centroids = cluster.cluster_centers_

In [28]:
# Print the ten highest-weighted vocabulary terms of every cluster centroid,
# one line group per cluster, in cluster-index order.
for centroid in centroids:
    strongest_first = np.argsort(-centroid)
    print(vocab[strongest_first[:10]])

['tracks' 'prints' 'print' 'snow' 'inches' 'foot' 'track' 'toes' 'trail'
 'footprints']
['house' 'window' 'door' 'outside' 'heard' 'dogs' 'night' 'like' 'dog'
 'went']
['saw' 'creature' 'area' 'seen' 'large' 'like' 'woods' 'tall' 'bigfoot'
 'looked']
['heard' 'sound' 'like' 'tent' 'night' 'loud' 'sounded' 'sounds' 'camp'
 'scream']
['road' 'saw' 'driving' 'car' 'creature' 'tall' 'looked' 'just' 'hair'
 'like']
['just' 'saw' 'like' 'tree' 'woods' 'river' 'looked' 'trail' 'feet' 'got']


In [46]:
# Human-readable names for the six k-means clusters, indexed by cluster id.
# They were assigned by hand from the top terms printed for each centroid, so
# the order is only valid for the clustering run that produced that printout.
labels = np.array([
    'Tracking',
    'House',
    'Seen',
    'Heard',
    'Driving',
    'Saw',
])

In [47]:
# Distance from every report to every cluster centre; the closest centre decides
# which cluster name the report receives.
centroid_distances = cluster.transform(tfidf_matrix)
nearest_cluster = centroid_distances.argmin(axis=1)
report_label = labels[nearest_cluster]

In [48]:
# Attach the k-means cluster name to each report.
# assign() returns a new frame instead of writing into df_o in place, so this
# works without SettingWithCopyWarning even though df_o was produced by
# filtering `df`, and re-running the cell gives the same result.
df_o = df_o.assign(label_clustering=report_label)

In [29]:
# Factorise the TF-IDF matrix V into six topics: V ~ W @ H, where
#   W holds one row of topic weights per report, and
#   H holds one row of term weights per topic.
# The topic names are hand-assigned by topic index further down, so the
# factorisation has to be repeatable: random_state is pinned, and init is set
# explicitly because the library default differs between scikit-learn versions.
# NOTE(review): 'nndsvd' is presumably what the original run used by default —
# confirm against the recorded topic printout after re-running.
nmf = NMF(n_components=6, init='nndsvd', random_state=0)
nmf.fit(V)
W = nmf.transform(V)
H = nmf.components_

In [30]:
# Print the fifteen highest-weighted vocabulary terms of every NMF topic,
# in topic-index order.
for topic_weights in H:
    strongest_first = np.argsort(-topic_weights)
    print(vocab[strongest_first[:15]])

['tree' 'deer' 'woods' 'just' 'saw' 'like' 'river' 'walking' 'looked'
 'hunting' 'got' 'trail' 'trees' 'area' 'started']
['heard' 'sound' 'loud' 'sounds' 'like' 'sounded' 'scream' 'howl' 'noise'
 'area' 'hear' 'animal' 'minutes' 'woods' 'screams']
['tracks' 'prints' 'snow' 'inches' 'print' 'foot' 'track' 'footprints'
 'trail' 'toes' 'area' 'pictures' 'size' 'long' 'wide']
['house' 'window' 'door' 'outside' 'said' 'dog' 'night' 'dogs' 'went'
 'home' 'ran' 'yard' 'saw' 'told' 'brother']
['road' 'car' 'saw' 'driving' 'creature' 'tall' 'hair' 'looked' 'highway'
 'feet' 'crossed' 'dark' 'arms' 'brown' 'large']
['tent' 'camp' 'lake' 'night' 'camping' 'sleep' 'went' 'morning' 'got'
 'campground' 'campsite' 'time' 'set' 'decided' 'site']


In [49]:
# Human-readable names for the six NMF topics, indexed by topic id.
# They were assigned by hand from the top terms printed for each topic, so the
# order is only valid for the factorisation that produced that printout.
# Note: this rebinds `labels`, replacing the k-means cluster names.
labels = np.array([
    'Witnessed',
    'Heard',
    'Found Tracks',
    'Saw from House',
    'While Driving',
    'While Camping',
])

In [50]:
# Each report gets the name of the topic with the largest weight in its row of W.
dominant_topic = np.argmax(W, axis=1)
report_label = labels[dominant_topic]

In [51]:
# Attach the dominant NMF topic name to each report.
# assign() returns a new frame instead of writing into df_o in place, so this
# works without SettingWithCopyWarning even though df_o was produced by
# filtering `df`, and re-running the cell gives the same result.
df_o = df_o.assign(label_NMF=report_label)

In [52]:
# Reconstruction error of the fitted NMF model on the training matrix.
# NOTE(review): per the scikit-learn docs this is the Frobenius norm of V - WH
# under the default loss — confirm for the installed version. It is only
# meaningful when compared across runs (e.g. different n_components).
nmf.reconstruction_err_

68.43236938715371

In [53]:
# Persist the labelled reports. The index is written as the first CSV column
# (pandas' default, spelled out here).
df_o.to_csv('data/nlp_observed_df.csv', index=True)

In [54]:
# Sanity check that df_o is still a pandas DataFrame after the label columns
# were added. NOTE(review): looks like a leftover inspection cell — consider
# removing it from the final notebook.
type(df_o)

pandas.core.frame.DataFrame