In [1]:
import docx
from simplify_docx import simplify

In [2]:
filename = "Analysis_12_30_21_Colorado_Fire_segments.docx"

In [4]:
def docx_to_clean_dict(docx_as_json, first_table_index=1):
  """Takes docx_as_json and cleans it up
  return: list of dicts
    {
      "time": ___,
      "location": ____,
      "station": ____,
      "text": _____________
    }
  """
  clean_data = []

  for blob in docx_as_json['VALUE'][0]['VALUE'][first_table_index:]:
      text_end = False

      if blob['TYPE'] == 'table':
          time = blob['VALUE'][0]['VALUE'][0]['VALUE'][0]['VALUE'][0]['VALUE']
          location = blob['VALUE'][0]['VALUE'][1]['VALUE'][0]['VALUE'][0]['VALUE']
          station = blob['VALUE'][0]['VALUE'][2]['VALUE'][0]['VALUE'][0]['VALUE']

      if blob['TYPE'] == 'paragraph':
          text = blob['VALUE'][0]['VALUE']
          text_end = True

      if text_end:
          clean_data.append({
              "time": time,
              "location": location,
              "station": station,
              "text": text
          })

  return clean_data



def read_docx_to_dict(filename):
    """Reads in docx file and converts it to a list of dicts"""
  # read in a document
  doc = docx.Document(filename)

  # coerce to JSON using the standard options
  docx_as_json = simplify(doc)

  blob_types = [blob['TYPE'] for blob in docx_as_json['VALUE'][0]['VALUE']]

  first_table_index = blob_types.index('table')

  return docx_to_clean_dict(docx_as_json, first_table_index)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 40)

In [None]:
data = read_docx_to_dict(filename)

In [None]:
import pandas as pd
pd.options.display.max_rows = 500

# create dataframe
df = pd.DataFrame.from_dict(data)

In [None]:
df['text'] = df['text'].str.lower()

In [None]:
df.head()

In [None]:
import time
from thefuzz import fuzz

def check_text_likeness(df, text, ratio=85, row_name='text'):
    """For a given dataframe (df), loop through the column (row_name),
    calculate the partial ratio between given text (text) and the text in each row,
    and return the indexes where the partial ratio is greater than or equal to the ratio
    """
    matches = df.apply(lambda row: (fuzz.partial_ratio(row[row_name], text) >= ratio), axis=1)
    return [i for i, x in enumerate(matches) if x]

# def extract_similar_texts(df, text, ratio=85, row_name='text'):
#     start = time.time()
#     start_time = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
#     print(f'start time: {start_time}')

#     check_text_likeness(df, text, ratio=85, row_name='text')

#     end = time.time()
#     minutes = (end - start)/60.0
#     end_time = time.strftime("%a, %d %b %Y %H:%M:%S", time.localtime())
#     print(f'end time: {end_time} -- took {minutes} minutes')



In [None]:
df['matches'] = df.apply(lambda row: check_text_likeness(df, row['text']), axis=1)

In [None]:
def fetch_biggest_text(idx_list):
    """For the rows with similar text, fetch the biggest text's index"""
    biggest_length = 0
    idx = None

    if len(idx_list) == 1:
        return idx_list[0]

    for i in idx_list:
        current_length = len(df['text'][i])
        if current_length > biggest_length:
            biggest_length = current_length
            idx = i
    return idx

In [None]:
df['row_to_use'] = df.apply(lambda row: fetch_biggest_text(row['matches']), axis=1)

In [None]:
def mark_use_row():
  # mark rows to use
  idxs = list(df['row_to_use'].unique())

  for index, row in df.iterrows():
      df.at[index,'use_row'] = index in idxs

  return 'done'

In [None]:
mark_use_row()

In [None]:
df.head(20)

In [None]:
df['words'] = df['text'].str.lower().str.replace(',', '').str.replace('>', '').str.replace('.', '').str.replace('\n', '').str.replace('’', "'").str.replace(
    '!', '').str.replace('?', '').str.replace('%', '').str.replace(')', '').str.replace('(', '').str.replace('_', '').str.replace(':', '').str.strip().str.split(' ')

In [None]:
df.head(20)

In [None]:
import sys
sys.path.append('../')

In [None]:
from helpers.utils import parse_words
df['clean_words'] = df.apply(lambda row: parse_words(row['words']), axis=1)

In [None]:
df.head(20)

In [None]:
from helpers.utils import fetch_climate_words_in_words, fetch_climate_phrases_in_text

In [None]:
fetch_climate_words_in_words(["adapt","for", "climate", "change"])
# segment_df['climate_words_found'] = segment_df.apply(lambda row: fetch_climate_words_in_text(row['clean_words']), axis=1)

In [None]:
fetch_climate_phrases_in_text("adapt for climate change")

In [None]:
df['climate_words_found'] = df.apply(lambda row: fetch_climate_words_in_words(row['clean_words']), axis=1)

In [None]:
df.head(20)

In [None]:
df['climate_phrases_found'] = df.apply(lambda row: fetch_climate_phrases_in_text(row['text']), axis=1)

In [None]:
df.head(20)

In [None]:
# save data to csv
df.to_csv('reports/abc_all.csv', encoding='utf-8')

In [None]:
unique_df = df[df['use_row']]

In [None]:
unique_df

In [None]:
total_words = unique_df['clean_words'].str.len().sum()
total_words

In [None]:
def words_found_master_list(df_clean_words):
    """Given a column of words, aggregate master list"""
    words_found = list()
    for chunk in df_clean_words:
        words_found += chunk

    return words_found

In [None]:
words_found = words_found_master_list(unique_df['clean_words'])
len(words_found)

In [None]:
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import STOPWORDS

def master_stopwords_list():
    """Creates a master list of stopwords from pre-existing stopwords found in nltk and wordcloud"""
    stop_words = set(stopwords.words("english"))
    final_stopwords = list(STOPWORDS) + list(stop_words)
    return [i.lower() for i in set(final_stopwords)]

def lemmatize_words(words):
    """Given a list of words, distill to root words"""
    lem = WordNetLemmatizer()

    lemma_list = []
    for word, tag in pos_tag(words):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        if not wntag:
            lemma = word
        else:
            lemma = lem.lemmatize(word, pos=wntag)
        lemma_list.append(lemma)
    return lemma_list

def clean_lemmatized_words(lemma_words):
    """Removes stop words from the lemma list"""
    nonstop_lemma_words = []
    final_stopwords = master_stopwords_list()

    for word in lemma_words:
        if word not in final_stopwords:
            nonstop_lemma_words.append(word)

    return list(filter(None, nonstop_lemma_words))


In [None]:
clean_lemma_words = clean_lemmatized_words(lemmatize_words(words_found))


In [None]:
from nltk.probability import FreqDist

lfdist = FreqDist(clean_lemma_words)
lfdist

In [None]:
import matplotlib.pyplot as plt
lfdist.plot(30,cumulative=False)
plt.show()

In [None]:
from wordcloud import WordCloud
from wordcloud import ImageColorGenerator
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='black', colormap='Set2', collocations=False, stopwords = master_stopwords_list()).generate_from_frequencies(lfdist)

# Plot
plt.figure( figsize=(15,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

#plt.savefig('word_cloud.png')

In [None]:
import pandas as pd
pd.options.display.max_rows = 500
words_df = pd.DataFrame(lfdist.items(), columns=['Word', 'Count'])

words_df.sort_values(by=['Count'], ascending=False, inplace=True)
len(words_df)
# 1374 total words

words_df['Count'].sum()

# create data
climate_change_words_df = words_df.loc[words_df['Word'].isin(CLIMATE_CHANGE_RELATED_WORDS)]

climate_words_count = climate_change_words_df['Count'].sum()
non_climate_words_count = words_df['Count'].sum() - climate_words_count

comparison_df = pd.DataFrame({'Words': ['Climate-related', 'Non Climate-related'],
                             'counts': [climate_words_count, non_climate_words_count]})
comparison_df.set_index('Words', inplace=True)
print(comparison_df)

plot = comparison_df.plot.pie(y='counts', title="Climated-related vs non climated-related word frequencies", legend=True, autopct='%1.1f%%', shadow=True, figsize=(8, 8))

fig = plot.get_figure()
#fig.savefig("comparison.png")

In [None]:
# find climate related word frequencies

# set figure size
fig, ax = plt.subplots(figsize=(12, 8))
# plot horizontal bar plot
climate_change_words_df.sort_values(by='Count').plot.barh(x="Word", y="Count", ax=ax)
# set the title
plt.title("Count of climate change related words")

for i, v in enumerate(climate_change_words_df['Count'].sort_values()):
    ax.text(v, i , str(v),
            color = 'blue', fontweight = 'bold')

plt.show()
# plt.savefig('climate-related-words-breakdown.png', transparent=False)

In [None]:
# find segments
climate_change_words_found = list(climate_change_words_df['Word'].unique())
climate_change_words_found
    


In [None]:
unique_df[unique_df["climate_words_found"].str.len() != 0].to_csv('reports/abc_final.csv', encoding='utf-8')