In [None]:
import tensorflow
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
import tensorflow as tf

In [None]:
import pandas as pd
import sklearn
import random
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn import metrics
import nltk
import string
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
import re
from nltk.stem.porter import *

nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
df = pd.read_csv('nuforc_reports.csv')

In [None]:
df.info()

In [None]:
df.isnull().sum()

# Tokenizing and Vectorizing Text

In [None]:
df[df['text'].isnull()]

In [None]:
#replace missing text with summary
df['text']= df['text'].fillna(value = df['summary'])

In [None]:
# Drop rows that have neither text nor summary. The explicit .copy() makes
# `reports` an independent frame, so the many column assignments in the cells
# below write to it directly instead of to a possible view of `df`
# (which would raise SettingWithCopyWarning or silently not stick).
reports = df.dropna(axis=0, how='all', subset=['text', 'summary']).copy()

In [None]:
reports.isnull().sum()

In [None]:
# Replace missing text with the summary. The result is assigned back to the
# column: fillna(inplace=True) on the `reports['text']` selection works on an
# intermediate object, triggers chained-assignment warnings, and leaves
# `reports` unchanged under pandas copy-on-write.
reports['text'] = reports['text'].fillna(reports['summary'])

In [None]:
reports['text'].isnull().sum()

In [None]:
reports.isnull().sum()

In [None]:
def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` removed."""
  # Third argument of maketrans = characters to delete during translation.
  deletion_table = str.maketrans("", "", string.punctuation)
  return text.translate(deletion_table)

# Cast to str first so non-string entries can be scanned character by character.
reports['clean_text'] = reports['text'].astype(str).apply(remove_punctuation)


In [None]:
def tokenize(text):
  """Split `text` into word tokens on runs of non-word characters.

  Empty strings produced by leading or trailing separators are discarded, so
  every real word is kept, including the first one in the text.

  Parameters
  ----------
  text : str
      Text to split.

  Returns
  -------
  list of str
      The non-empty tokens, in order of appearance.
  """
  # Raw string: '\W' in a normal string literal is an invalid escape sequence.
  return [token for token in re.split(r'\W+', text) if token]

# One token list per report, built from the punctuation-free text.
reports['tokenized_text'] = reports['clean_text'].apply(tokenize)


In [None]:
stopword = nltk.corpus.stopwords.words('english')
# Set copy of the list so each membership test is a hash lookup rather than a
# scan of the whole stop-word list for every token.
stopword_lookup = frozenset(stopword)

def remove_stopwords(tokenized_text):
  """Return the tokens of `tokenized_text` that are not English stop words.

  The NLTK list is lowercase while the tokens keep their original casing, so
  each token is lowercased for the comparison only; otherwise capitalised stop
  words ("The", "It", ...) would slip through. Kept tokens are returned
  unchanged.

  Parameters
  ----------
  tokenized_text : iterable of str
      Tokens of a single report.

  Returns
  -------
  list of str
      Tokens that are not stop words, in their original order.
  """
  return [word for word in tokenized_text if word.lower() not in stopword_lookup]

reports['nostop_text'] = reports['tokenized_text'].apply(remove_stopwords)


### Creating new dataframe with vectors

In [None]:
reports['date_time'] = pd.to_datetime(reports['date_time'])

In [None]:
# Separate the year, month, day, hour, and minute from our datetime data.
# `date_time` was converted with pd.to_datetime in the previous cell, so the
# vectorised `.dt` accessor can be used instead of calling a Python lambda
# once per row.
reports['year'] = reports['date_time'].dt.year
reports['month'] = reports['date_time'].dt.month
reports['day'] = reports['date_time'].dt.day
reports['hour'] = reports['date_time'].dt.hour
reports['minute'] = reports['date_time'].dt.minute

In [None]:
# Parsing duration for parsable entries
def extract_duration(text):
    """Parse a free-text duration into a number of seconds.

    The first run of digits in `text` is the amount. The first of the letters
    h / m / s (either case) found within the five characters following that
    run selects the unit.

    Parameters
    ----------
    text : Any
        Raw duration entry. Non-string values (e.g. NaN) are converted with
        ``str()`` before parsing.

    Returns
    -------
    int
        Duration in seconds, or -1 when the entry has no digits, the digits
        run to the end of the string, or no unit letter follows within five
        characters.
    """
    if not isinstance(text, str):
        text = str(text)

    # Locate the first run of decimal digits. isdecimal() (rather than
    # isdigit()) guarantees the run is something int() can parse.
    start_idx = next((i for i, char in enumerate(text) if char.isdecimal()), None)
    if start_idx is None:
        return -1
    end_idx = start_idx
    while end_idx < len(text) and text[end_idx].isdecimal():
        end_idx += 1
    if end_idx == len(text):
        # The number is the last thing in the string, so no unit can follow.
        return -1
    duration = int(text[start_idx:end_idx])

    # Only the five characters right after the number are searched for a unit.
    seconds_per_unit = {'h': 3600, 'H': 3600, 'm': 60, 'M': 60, 's': 1, 'S': 1}
    for char in text[end_idx:end_idx + 5]:
        if char in seconds_per_unit:
            return duration * seconds_per_unit[char]
    return -1

# Seconds parsed from the raw duration text; -1 marks entries that could not be parsed.
reports['duration_parsed'] = reports['duration'].apply(extract_duration)

In [None]:
# Separate location data into regions.
# City and state are joined into a single "city!state" string so one
# column-wise apply can classify each row; "!" is the delimiter that
# region_separate splits on.
# NOTE(review): presumably a row missing either city or state ends up with a
# missing citystate (NaN propagating through `+`) — confirm.
reports['citystate'] = reports['city'] + "!" + reports['state']

# State / province code -> region label, built once rather than on every call.
# The "prarie_provinces" spelling is kept as-is because the label is emitted
# as data and feeds the one-hot column names.
_REGION_BY_STATE = {
    code: region
    for region, codes in (
        ("pacific", ('WA', 'OR', 'CA')),
        ("rockies", ('NV', 'ID', 'MT', 'WY', 'UT', 'CO')),
        ("southwest", ('AZ', 'NM', 'TX', 'OK')),
        ("midwest", ('ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'IN', 'MI', 'OH')),
        ("southeast", ('AR', 'LA', 'TN', 'MS', 'KY', 'AL', 'FL', 'GA', 'SC', 'NC', 'VA', 'WV', 'MD', 'DE')),
        ("northeast", ('PA', 'NY', 'NJ', 'CT', 'RI', 'MA', 'VT', 'NH', 'ME')),
        ("noncontiguous", ('AK', 'HI')),
        ("north_canada", ('YT', 'NU', 'NT')),
        ("british_columbia", ('BC',)),
        ("prarie_provinces", ('AB', 'SK', 'MB')),
        ("ontario", ('ON',)),
        ("quebec", ('QC',)),
        ("atlantic_provinces", ('NL', 'PE', 'NB', 'NS')),
    )
    for code in codes
}


def region_separate(citystate):
    """Map a "city!state" string to a coarse region label.

    Parameters
    ----------
    citystate : Any
        City and state/province code joined by "!". Anything that is not a
        string (e.g. NaN) or has no "!" is classified as "other".

    Returns
    -------
    str
        The region of the state/province code; "UK" when the code is not
        recognised but the city part contains "UK/England"; otherwise "other".
    """
    # Explicit checks replace a catch-all `except`, so only the two expected
    # malformed-input cases fall back to "other" and real errors stay visible.
    if not isinstance(citystate, str):
        return "other"
    # Split on the LAST "!" so a city name that itself contains "!" cannot be
    # mistaken for the state code.
    city, separator, state = citystate.rpartition("!")
    if not separator:
        return "other"
    region = _REGION_BY_STATE.get(state)
    if region is not None:
        return region
    return "UK" if "UK/England" in city else "other"
        
# Region label for every report, derived from the combined city/state string.
reports['region'] = reports['citystate'].apply(region_separate)

In [None]:
# Grouping similar shapes: each key is folded into the shape named by its value.
shape_groups = {
    'disk': 'circle',
    'unknown': 'other',
    'teardrop': 'oval',
    'egg': 'oval',
    'fireball': 'sphere',
    'delta': 'triangle',
    'pyramid': 'triangle',
    'cigar': 'cylinder',
    'round': 'sphere',
    'changed': 'changing',
    'flare': 'light',
}
reports_shpgroup = reports.replace({'shape': shape_groups})

In [None]:
# One-hot encode our categorical variables.

cat_cols = ['city', 'state', 'shape', 'region']

# Build every indicator frame first, then attach them all in a single concat;
# the original columns are kept alongside their indicators.
dummy_frames = [pd.get_dummies(reports_shpgroup[col], prefix=col) for col in cat_cols]
reports_shpgroup = pd.concat([reports_shpgroup] + dummy_frames, axis=1)

In [None]:
# Prep for tokenization/vectorization: rebuild one space-separated string per
# report from its stop-word-free token list.
texts = reports_shpgroup['nostop_text'].map(" ".join).tolist()

In [None]:
# Text vectorization at the word level: TF-IDF limited to at most 6000 terms.
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

Tfidf_vect = TfidfVectorizer(max_features=6000)

# fit_transform learns the vocabulary / IDF weights and vectorizes `texts` in one step.
tfidf_vals = Tfidf_vect.fit_transform(texts)

In [None]:
# Convert the sparse TF-IDF matrix to a DataFrame for joining onto the reports.
# `texts` was built row by row from `reports_shpgroup`, whose index still has
# gaps where rows were dropped. Reusing that index keeps every vector attached
# to its own report; with a default 0..n-1 index the index-aligned
# pd.concat(axis=1) in the next cell would pair vectors with the wrong rows
# and pad the unmatched labels with NaN.
vectors_df = pd.DataFrame.sparse.from_spmatrix(tfidf_vals, index=reports_shpgroup.index)

In [None]:
reports_wvectors = pd.concat([reports_shpgroup, vectors_df], axis = 1)

In [None]:
reports_shpgroup.columns[0:40]

In [None]:
# Drop deprecated or unnecessary columns: raw text and intermediate
# text-processing columns, plus the categoricals already one-hot encoded.
columns_to_drop = [
    'summary', 'city', 'state', 'date_time', 'shape', 'duration', 'stats',
    'report_link', 'text', 'posted', 'region',
    'clean_text', 'tokenized_text', 'nostop_text', 'citystate',
]
reports_final = reports_wvectors.drop(columns=columns_to_drop)

In [None]:
reports_final.shape

# Predicting Shape from Text: Naive Bayes

In [None]:
# Keep only the reports that have a recorded shape (the prediction target).
has_shape = reports['shape'].notna()
shape_df = reports[has_shape]

In [None]:
shape_df.head()

In [None]:
shape_df.isnull().sum()

In [None]:
shape_df['shape'].value_counts()

In [None]:
# Grouping similar shapes: each key is folded into the shape named by its value.
shape_aliases = {
    'disk': 'circle',
    'unknown': 'other',
    'teardrop': 'oval',
    'egg': 'oval',
    'fireball': 'sphere',
    'delta': 'triangle',
    'pyramid': 'triangle',
    'cigar': 'cylinder',
    'round': 'sphere',
    'changed': 'changing',
    'flare': 'light',
}
new_shape_df = shape_df.replace({'shape': shape_aliases})

In [None]:
new_shape_df['shape'].value_counts()

In [None]:
# One space-separated string per labelled report, rebuilt from its token list.
texts = new_shape_df['nostop_text'].map(" ".join).tolist()

In [None]:
Encoder = LabelEncoder()
y= Encoder.fit_transform(new_shape_df['shape'])

In [None]:
num_classes = len(np.unique(y))

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(texts, 
                                                    y, 
                                                    test_size=0.2, 
                                                    random_state=1)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TEXT REPRESENTATION
# Every vectorizer is fitted on the training split only. Fitting on `texts`
# (train + test together) lets test-set vocabulary and document frequencies
# leak into the features and makes the evaluation scores optimistic.

# word level tf-idf
Tfidf_vect = TfidfVectorizer(max_features=6000)
x_train_tfidf = Tfidf_vect.fit_transform(x_train)
x_test_tfidf = Tfidf_vect.transform(x_test)


# ngram level tf-idf
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
X_train_tfidf_ngram = tfidf_vect_ngram.fit_transform(x_train)
X_test_tfidf_ngram = tfidf_vect_ngram.transform(x_test)

# characters level tf-idf
# token_pattern only applies when analyzer='word', so it is not passed here.
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=5000)
X_train_tfidf_ngram_chars = tfidf_vect_ngram_chars.fit_transform(x_train)
X_test_tfidf_ngram_chars = tfidf_vect_ngram_chars.transform(x_test)


# raw term counts
# NOTE(review): max_features=4 keeps only four terms, which looks far too few
# for a useful classifier — confirm this value is intended.
count_vect = CountVectorizer(stop_words="english", analyzer='word',
                             ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=4)
# fit() returns the fitted vectorizer itself, so X_counts is the vectorizer,
# not a count matrix; the name is kept for any code that refers to it.
X_counts = count_vect.fit(x_train)
x_train_count = count_vect.transform(x_train)
x_test_count = count_vect.transform(x_test)

In [None]:
x_train_tfidf

In [None]:
tfidf_sparse_df = pd.DataFrame.sparse.from_spmatrix(x_train_tfidf)

In [None]:
# fit the training dataset on the NB classifier
from sklearn.naive_bayes import MultinomialNB


def NaiveBayes(X_train, Y_train, X_test, Y_test, vectorizer):
    """Fit a multinomial Naive Bayes classifier and print its evaluation.

    Parameters
    ----------
    X_train, X_test
        Feature matrices for the training and test splits.
    Y_train, Y_test
        Class labels matching the rows of `X_train` / `X_test`.
    vectorizer
        Label printed in front of the accuracy score to identify which text
        representation produced the features.

    Returns
    -------
    None
        Results are printed: the accuracy as a percentage, the classification
        report, and a table holding the predicted class and its probability
        for every test row.
    """
    # Only the class is imported in this cell, so it is instantiated directly;
    # no `naive_bayes` module name exists in the notebook namespace.
    Naive = MultinomialNB()
    Naive.fit(X_train, Y_train)
    predictions_NB = Naive.predict(X_test)
    probs = Naive.predict_proba(X_test)
    # Highest class probability per row = probability of the predicted class.
    probability = np.amax(probs, 1)
    # accuracy_score is reached through the already-imported sklearn `metrics`
    # module; arguments follow its (y_true, y_pred) order.
    print(vectorizer,"Accuracy Score -> ",metrics.accuracy_score(Y_test, predictions_NB)*100)
    print(metrics.classification_report(Y_test, predictions_NB))
    summary = pd.DataFrame({'%':probability,'Prediction':predictions_NB})
    print(summary)


# The train/test split defines lowercase `y_train` / `y_test`; all four runs
# use those labels.
NaiveBayes(x_train_count, y_train, x_test_count, y_test, 'Count')
NaiveBayes(x_train_tfidf, y_train, x_test_tfidf, y_test, 'Word')
NaiveBayes(X_train_tfidf_ngram, y_train, X_test_tfidf_ngram, y_test, 'Ngram')
NaiveBayes(X_train_tfidf_ngram_chars, y_train, X_test_tfidf_ngram_chars, y_test, 'Ngram Char')




# UMAP Clustering for Vectors

In [None]:
# Select the last 6000 columns of `reports_final` by position.
# NOTE(review): presumably these are the TF-IDF columns appended by the concat;
# max_features=6000 is only an upper bound, so if fewer terms were produced this
# slice would also pick up non-vector columns — confirm the count.
vectors_binary = reports_final[reports_final.columns[-6000:]]

In [None]:
vectors_binary = vectors_binary.fillna(0)

In [None]:
import umap
reducer = umap.UMAP()

In [None]:
# Embed the vectors with UMAP (n_neighbors=5, hamming metric). Every column of
# `vectors_binary` except the last ten is passed in.
# NOTE(review): the reason for excluding the last ten columns is not evident
# from this notebook — confirm.
# NOTE(review): Hamming distance compares entries for equality; if these
# columns hold continuous TF-IDF weights rather than 0/1 flags, a different
# metric may be more appropriate — confirm.
reports_embedding = umap.UMAP(n_neighbors = 5
                                      #,min_dist = ???
                                      #,n_components = ???,
                                      ,metric = 'hamming'
                                      ).fit_transform(vectors_binary[vectors_binary.columns[:-10]])

In [None]:
%matplotlib inline
sns.set(style='white', rc={'figure.figsize':(25,25)})

In [None]:
# Scatter the two embedding dimensions, coloured by cluster id.
# NOTE(review): `vectors_binary['clust_grp']` is only created in the k-means
# section further down, so on a fresh top-to-bottom run this lookup fails; the
# cell appears to rely on the k-means cells having been executed first.
plt.scatter(reports_embedding[:,0], reports_embedding[:,1], c = vectors_binary['clust_grp'])

# K-means clustering for vectors

In [None]:
# Re-select the last 6000 columns of `reports_final` by position, giving the
# k-means section a fresh frame without any previously added columns.
# NOTE(review): presumably these are exactly the TF-IDF columns; max_features=6000
# is only an upper bound on how many were produced — confirm the count.
vectors_binary = reports_final[reports_final.columns[-6000:]]

In [None]:
vectors_binary = vectors_binary.fillna(0)

In [None]:
# Make k-means clusterer: four clusters, fixed seed for repeatable assignments.
from sklearn.cluster import KMeans
clusterer = KMeans(n_clusters=4, random_state=1)

In [None]:
# Fit clusterer on every report. Slicing a DataFrame with `[-6000:]` selects
# the last 6000 ROWS (not the 6000 vector columns), which would train the
# clusters on only the tail of the data while predictions are made for all rows.
clusterer.fit(vectors_binary)

In [None]:
# Predict a cluster for every report. Any existing 'clust_grp' column is left
# out of the features so that re-running this cell does not hand the model one
# more column than it was fitted on.
cluster_features = vectors_binary.drop(columns='clust_grp', errors='ignore')
vectors_binary['clust_grp'] = clusterer.predict(cluster_features)

In [None]:
reports_wclusters = pd.concat([reports, vectors_binary['clust_grp']], axis = 1)

In [None]:
reports_wclusters_reduced = reports_wclusters[reports_wclusters['city_latitude'] > 10]

In [None]:
reports_wclusters_reduced = reports_wclusters_reduced[reports_wclusters_reduced['city_longitude'] < -40]

In [None]:
# Let's recreate this scatterplot with plotly
import plotly.express as px
fig = px.scatter(reports_wclusters_reduced, x='city_longitude', y='city_latitude', color="clust_grp"
                 #,size= 'duration'
                 #,hover_data=['Name']
                )

# Axis titles must match the columns plotted above: longitude runs along x and
# latitude along y.
fig.update_layout(
    title="Text Vector Clusters by Latitude/Longitude",
    xaxis_title="Longitude",
    yaxis_title="Latitude",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="#7f7f7f"
    )
)

fig.show()


In [None]:
reports_clusters_novectors = pd.concat([reports, vectors_binary['clust_grp']], axis = 1)

In [None]:
reports_clusters_novectors.to_csv(r'reports_clusters_novectors.csv')