In [1]:
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired
from sklearn.cluster import KMeans, Birch
from sklearn.feature_extraction.text import TfidfVectorizer

# Display settings for inspecting long free-text columns: lift the per-cell
# character limit and leave the overall repr width unset.
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# 1. Load data

In [2]:
# Read only the columns of the cleaned crash table that this notebook uses.
crashes = pd.read_csv(
    'data/clean_crashes.csv',
    usecols=[
        'Date', 'Location', 'Operator', 'Type',
        'Aboard', 'Fatalities', 'Ground', 'Summary',
    ],
)

# Show the last two rows as a quick check of the load.
crashes.tail(n=2)

Unnamed: 0,Date,Location,Operator,Type,Aboard,Fatalities,Ground,Summary
4829,2009-06-07,"Near Port Hope Simpson, Newfoundland, Canada",Strait Air,Britten-Norman BN-2A-27 Islander,1.0,1.0,0.0,The air ambulance crashed into hills while attempting to land in heavy fog about 4 miles from the airport.
4830,2009-06-08,"State of Arunachal Pradesh, India",Military - Indian Air Force,Antonov An-32,13.0,13.0,0.0,The military transport went missing while en route and might have crashed due to heavy rain in the mountainous region.


In [3]:
from spacy.lang.en.stop_words import STOP_WORDS

# Extra corpus-specific words removed in addition to spaCy's English stop words.
# A broader candidate list is kept for reference (currently disabled):
# ['crashed', 'crash', 'aircraft', 'airplane', 'plane', 'pilot', 'flight', 'cargo',
#  'accident', 'seaplane', 'mail', 'postal', 'helicopter', 'baggage', 'cessna']
domain_stop_words = {'aircraft', 'crashed', 'plane', 'flight', 'airplane', 'wreckage', 'cessna'}

# Build a new collection instead of calling STOP_WORDS.update(...): STOP_WORDS is
# a module-level set shared by everything that imports it, so mutating it would
# leak the extra words into any other spaCy usage in the same kernel.
# sorted() yields a list with a stable order for CountVectorizer(stop_words=...).
stop_list = sorted(set(STOP_WORDS) | domain_stop_words)

# 2. Prepare embeddings

In [4]:
# Step 1 - Sentence embeddings
# Encode every crash summary once up front; the resulting matrix is reused by
# the topic model fit and by the 2-D projection in the visualization section.
embedding_model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")
embeddings = embedding_model.encode(
    sentences=crashes.Summary.values,
    show_progress_bar=True,
)

Batches:   0%|          | 0/151 [00:00<?, ?it/s]

# 3. Cluster data

In [5]:
# Step 2 - Reduce dimensionality
umap_model = UMAP(
    n_components=10,
    n_neighbors=300,
    min_dist=0.0,
    random_state=42,
)

# Step 3 - Cluster reduced embeddings
# The variable keeps the name `hdbscan_model` because that is the BERTopic
# keyword it is passed to, even though the active clusterer is KMeans.
# Disabled alternatives:
#   HDBSCAN(min_cluster_size=30, min_samples=1, metric='euclidean',
#           cluster_selection_method='eom', prediction_data=True)
#   Birch(n_clusters=30)
hdbscan_model = KMeans(
    n_clusters=20,
    max_iter=1000,
    random_state=42,
)

# Step 4 - Tokenize topics
# Unigrams and bigrams that occur in at least 10 documents, minus stop words.
# Disabled alternative: TfidfVectorizer(stop_words=stop_list)
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=10,
    stop_words=stop_list,
)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(
    bm25_weighting=True,
    reduce_frequent_words=True,
)

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

In [6]:
# Assemble the topic-modelling pipeline from the components configured earlier.
topic_model = BERTopic(
    top_n_words=10,
    verbose=True,
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

# Fit on the summaries, handing over the embeddings computed in step 1.
topics, probs = topic_model.fit_transform(
    documents=crashes.Summary.to_list(),
    embeddings=embeddings,
)

2024-03-29 12:59:42,730 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-03-29 13:00:20,801 - BERTopic - Dimensionality - Completed ✓
2024-03-29 13:00:20,802 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-03-29 13:00:21,121 - BERTopic - Cluster - Completed ✓
2024-03-29 13:00:21,124 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-03-29 13:00:27,843 - BERTopic - Representation - Completed ✓


In [7]:
# Compact overview: one row per topic with its size and keyword representation.
topic_model.get_topic_info().loc[:, ['Topic', 'Count', 'Representation']]

Unnamed: 0,Topic,Count,Representation
0,0,424,"[emergency landing, exploded, shortly takeoff, landed, caught fire, explosion, crash, landing, survived, attempting emergency]"
1,1,380,"[pilot failure, safe altitude, flew, flying, mountains, altitude, mountain, mountainous terrain, mountainous, error pilot]"
2,2,359,"[cause accident, accident, failure pilot, contributing accident, stalled, maneuver, flying, caused, aborted, failure]"
3,3,356,"[pilot failure, cargo, accident, lost altitude, flew, runway attempting, contributing accident, pilot command, failure maintain, takeoff]"
4,4,356,"[engine failure, crash, emergency landing, shortly takeoff, malfunction, fatigue, broke, forced landing, failure, engines]"
5,5,338,"[crash landed, crash, mountainous terrain, mountainous, mountains, mountain, terrain, survived, poor weather, mt]"
6,6,307,"[pilot failed, landing attempt, pilot error, crew error, landing approach, failure crew, pilots, flying, disoriented, missed approach]"
7,7,305,"[crash, accident, survived, crashing, flying, pilot reported, pilots, died, jet, ran fuel]"
8,8,288,"[engine failure, cause accident, pilot failed, failure pilot, mechanical failure, emergency landing, failure captain, malfunctioning, failure, shortly takeoff]"
9,9,242,"[landing attempt, pilot attempted, runway attempting, landed, landing, struck ground, hit ground, caught fire, end runway, short runway]"


In [8]:
# Attach the per-document topic id returned by fit_transform to each crash row.
crashes['Cluster'] = topics

## 3.1. Get topic stats

In [None]:
# Topic id to inspect; the following cell reads the same `index` to list
# the summaries assigned to this topic.
index = 0
topic_model.get_topic_info(topic=index)

In [None]:
# Up to 50 summaries of the crashes assigned to the topic selected via `index`.
crashes.loc[crashes.Cluster == index, ['Summary']].head(n=50)

# 4. Label topics

In [19]:
# Human-readable names for the 20 topics, keyed by topic id.
topic_model.set_topic_labels({
    0: "Fire / explosion",
    1: "Land / take off",
    2: "Engine failure (v3)",
    3: "Cargo (plane)",
    4: "Engine failure (v2)",
    5: "En route / Mountains",
    6: "Pilot error",
    7: "Crash",
    8: "Engine failure",
    9: "Landing attempt",
    10: "Weather / landing",
    11: "Crashed into sea",
    12: "Weather conditions",
    13: "Fog",
    14: "Midair collision",
    15: "Disappeared en route",
    16: "Shot down",
    17: "Helicopter",
    18: "Mountains",
    19: "Mail plane",
})

In [26]:
# Map each topic id to its custom label through get_topic_info(), which pairs
# the 'Topic' and 'CustomName' columns explicitly. Indexing the positional
# `custom_labels_` list with the topic id is only correct while topic ids
# coincide with list positions; with a clusterer that emits an outlier topic
# (-1, e.g. the disabled HDBSCAN option) every label would be shifted and -1
# would silently pick the last label.
topic_info = topic_model.get_topic_info()
label_by_topic = dict(zip(topic_info['Topic'], topic_info['CustomName']))
crashes['Cluster_name'] = crashes['Cluster'].map(label_by_topic)
crashes.tail(2)

Unnamed: 0,Date,Location,Operator,Type,Aboard,Fatalities,Ground,Summary,Cluster,Cluster_name
4829,2009-06-07,"Near Port Hope Simpson, Newfoundland, Canada",Strait Air,Britten-Norman BN-2A-27 Islander,1.0,1.0,0.0,The air ambulance crashed into hills while attempting to land in heavy fog about 4 miles from the airport.,13,Fog
4830,2009-06-08,"State of Arunachal Pradesh, India",Military - Indian Air Force,Antonov An-32,13.0,13.0,0.0,The military transport went missing while en route and might have crashed due to heavy rain in the mountainous region.,15,Disappeared en route


In [27]:
# Persist the labelled table. The content is tab-separated despite the .csv
# extension, and the DataFrame index is written as the first column
# (to_csv default), so readers must pass sep='\t'.
crashes.to_csv(path_or_buf="data/clean_crashes.clustered_summary.csv", sep='\t')

In [50]:
# Full topic table, now including the CustomName column set via set_topic_labels
# and the representative documents of every topic.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,0,424,0_emergency landing_exploded_shortly takeoff_landed,Fire / explosion,"[emergency landing, exploded, shortly takeoff, landed, caught fire, explosion, crash, landing, survived, attempting emergency]","[The aircraft struggled to gain altitude after taking off from Charles de Gaulle airport. The pilot reported the No. 2 engine failed and struggled to steer towards Le Bourget airfield as smoke and fire trailed the jet's left wing. The plane was unable to gain altitude, went nose high, stalled and crashed into a small hotel complex. A metal strip left on the runway by another plane gashed one of the Concorde's tires which blew out sending a piece of rubber into the underside of the wing which sent a shockwave which ruptured a seam in the fuel tank. An electrical severed by another piece of rubber sparked which ignited leaking fuel ignited and started an uncontrollable fire. Power was lost to the No. 1 and No. 2 engines which led to loss of control of the aircraft and subsequent crash. Investigation revealed the metal strip was probably a thrust reverser part from Continental Airlines DC-10-30 , registered N13067 which departed Paris as Flight 055 to Newark. This is the first crash of a Concorde in aviation history., The belly cargo pod lightly scraped the runway for about 40 feet, but the airplane transitioned to a climb. As the airplane began climbing away from the runway, the landing gear was observed to extend. The airplane climbed to about 100 to 150 feet above the ground, and then began a descending left turn. The airplane collided with the ground on a 095 degree heading. The wreckage path extended for about 300 feet, during which the landing gear, left wing, and the left engine separated from the airplane. A postcrash fire destroyed the fuselage, right wing, and the right engine. The pilot's failure to extend the landing gear, his improper aborted landing procedure, and inadvertent stall/mush. 
Factors in the accident were an improper adjustment of the landing gear warning horn system by company maintenance personnel, and the failure of the pilot to utilize the prelanding checklist., The aircraft exploded and caught fire 35 minutes before its scheduled takeoff. Five members of the cabin crew were the only people aboard the plane at the time. Witnesses said they heard an explosion before flames erupted aboard the aircraft. NTSB investigators reported that the center fuel tank exploded followed by the right tank 18 minutes later. The cause for the explosion was unclear. The center fuel tank is located near air conditioning packs which generate heat, and were running nonstop prior to the explosion.]"
1,1,380,1_pilot failure_safe altitude_flew_flying,Land / take off,"[pilot failure, safe altitude, flew, flying, mountains, altitude, mountain, mountainous terrain, mountainous, error pilot]","[During its descent at night and in poor weather conditions, the aircraft hit the Charleys Peak located 25 miles southeast from Alamosa. The aircraft was carrying one pilot, a flight paramediand a flight nurse to pick up a patient in Alamosa. The pilot's failure to maintain clearance from mountainous terrain. Contributing to the accident was the pilot's inadequate preflight planning, improper in-flight planning and decision making, the dark night, and the controller's failure to issue a safety alert to the pilot., Two charter planes took off Wichita for a flight to Logan, Utah via Denver. The flight was uneventful to Denver. The crew of N464M decided to fly a sceniroute the rest of the way to Logan. The aircraft, which was overloaded by 5,165 lbs. , flew into a box canyon. Confronted with steeply rising terrain, the pilot made a sharp bank trying to turn around in the narrowing valley and stalled, crashing into a mountainous forested area on Mt. Trelease. Numerous members and staff of the Wichita State Football team were killed. The pilot had only 123 total hours in a Martin 404, the copilot 30 hours. Intentional operation of the aircraft over a mountain valley route at an altitude from which the aircraft could neither climb over the obstructing terrain ahead, nor execute a successful course reversal. Other factors included overloaded conditions, lack of a flight planning and minimum qualifications of the crew., Impacted the side of a mountain at 2,300 ft., 10 miles from the destination airport. The pilot attempting visual flight in instrument meteorological conditions below the minimum safe altitude for terrain clearance. Contributing factors were the low ceiling and mountainous terrain.]"
2,2,359,2_cause accident_accident_failure pilot_contributing accident,Engine failure (v3),"[cause accident, accident, failure pilot, contributing accident, stalled, maneuver, flying, caused, aborted, failure]","[While on approach the business flight descended through IMC until the left wing contacted the ground. The plane crashed into a field and burned. The plane crashed because the pilots failed to notice the airplane had slowed to an unsafe speed and allowed the plane to stall. In addition, there was a thin layer of ice on the aircraft's wings. The flight crew's failure to effectively monitor and maintain airspeed and comply with procedures for deice boot activation on the approach, which caused an aerodynamistall from which they did not recover. Contributing to the accident was the FAA's failure to establish adequate certification requirements for flight into icing conditions, which led to the inadequate stall warning margin provided by the airplane's stall warning system., While on approach, the aircraft went into a sudden nose dive from 6,000 ft. and crashed into a wooded ravine 6 miles northwest of Pittsburgh International Airport. The accident was caused by a loss of control of the aircraft resulting from the movement of the rudder surface to its blowdown limit or an uncommanded rudder reversal. The rudder surface deflected in a direction opposite to that commanded by the pilots as a result of a jam of the main rudder PCU servo valve secondary slide to the servo valve housing offset from its neutral position and overtravel of the primary slide. The most likely sequence of events that led up to the accident included the jamming of the PCU servo valve, the application of light left rudder followed by hard right rudder which caused the rudder to reverse in the opposite direction the pilot commanded it to go. The application of hard right rudder was possibly initiated because the plane flew into the wake vortex of a B-727 which rolled the plane to the left. 
Rudder hardover is normally corrected with the stick (ailerons) but because the plane was flying at the crossover speed of 190 knots with flaps 1, using the stick would not correct the situation. When the right rudder was applied the rudder went to its fullblown left position causing the plane to roll further left, stall and go into a dive. Some speculate if the pilot-in-command pushed forward on the yoke to gain some speed rather than pull back, the accident possibly could have been avoided. Blame was not placed on the crew because there was no mention of this type of recovery by the manufacturer nor was there any training for such an occurrence., After climbing to 18,400' msl, the pilot reported a turbocharger problem & reversed course. He said he 'may lose the left engine' & that he was unable to maintain altitude. He diverted to an alternate airport. During a right turn onto final approach, the airplane was observed to cross (overshoot) the extended centerline of the runway. It continued in a right turn back toward the centerline, and then entered a left turn to intercept the inbound course. The turn steepened, and then the airplane entered a spin & crashed 1/2 mile short of the runway. Failure of the pilot to maintain adequate airspeed, while maneuvering on approach, which resulted in an inadvertent stall/spin and uncontrolled collision with terrain. Factors relating to the accident were: the pilot allowed the aircraft weight and balance limitations to be exceeded; the pilot's lack of recurrent training in the make and model of airplane; inadequate maintenance/inspection of the engine exhaust systems; a warped and leaking exhaust system flange on the left engine, which resulted in a loss of power in that engine; and the pilot's improper use of the flaps.]"
3,3,356,3_pilot failure_cargo_accident_lost altitude,Cargo (plane),"[pilot failure, cargo, accident, lost altitude, flew, runway attempting, contributing accident, pilot command, failure maintain, takeoff]","[The loss of control of the airplane for undetermined reasons following the in-flight opening of the improperly latched cargo door. Contributing to the accident were inadequate procedures used by Evergreen Airlines and approved by the FAA for pre-flight verification of external cargo door lock pin manual control handle, and the failure of McDonnell Douglas to provide flight crew guidance and emergency procedures for an in-flight opening of the cargo door. Also contributing to the accident was the failure of the FAA to mandate modification to the door-open warning system for DC-9 cargo-configured airplanes, given the previously known occurrences of in-flight door openings., After gaining altitude the cargo plane descended until it impacted trees. Improper IFR procedure by the first officer during takeoff, his lack of instrument scan, his failure to maintain a positive rate of climb or to identify the resultant descent, and the captain's inadequate supervision of the flight. Contributing factors were: dark night, low ceiling, drizzle, the first officer's lack of total experience in the type of operation, and possible spatial disorientation of the first officer., The cargo plane was observed in a shallow left bank until it descended and struck trees and broke up. The pilot's failure to properly configure the aircraft fuel system prior to takeoff and his failure maintain an adequate terrain clearance altitude while maneuvering to return to the airport. Factors in the accident were the dark night lighting conditions, low ceilings, restricted visibility conditions, and the pilot's diverted attention which resulted from activation of the airplane's fuel selector warning horn system.]"
4,4,356,4_engine failure_crash_emergency landing_shortly takeoff,Engine failure (v2),"[engine failure, crash, emergency landing, shortly takeoff, malfunction, fatigue, broke, forced landing, failure, engines]","[On a night repositioning flight, while en route, the stick shaker activated several times before the plane entered a aerodynamistall. Almost simultaneously both engines stopped. The crew declared an emergency but the plane did not make the airport, crashed and broke up. PC: The pilots' unprofessional behavior, deviation from standard operating procedures, and poor airmanship, which resulted in an in-flight emergency from which they were unable to recover, in part because of the pilots' inadequate training The pilots' failure to prepare for an emergency landing in a timely manner, including communicating with air trafficontrollers immediately after the emergency about the loss of both engines and the availability of landing sites and the pilots' failure to achieve and maintain the target airspeed in the double engine failure checklist, which caused the engine cores to stop rotating and resulted in the core lock engine condition. Contributing to this accident was the engine core lock condition, which prevented at least one engine from being restarted, and the airplane flight manuals that did not communicate to pilots the importance of maintaining a minimum airspeed to keep the engine cores rotating., The aircraft crashed into Santa Monica Bay shortly after a night takeoff in poor weather and visibility. The crew reported a fire warning in the No. 1 engine and shut it down. After initiating a turn, the aircraft crashed into the water at high speed. The aircraft was dispatched with one generator inoperative. While this was legal, United was required to repair the generator at the first airport where there were repair facilities. 
The aircraft flew for a total of 41 hours with the inoperative generator passing through airports that had the facilities to repair the generator. Shutting down the engine took the second generator offline leaving just one generator which became overloaded causing it to trip, resulting in the loss of all electrical power in the aircraft. For reasons undetermined, the battery standby switch was not turned on. At night, in rain, with no lights or instruments, the captain became disoriented and crashed into the PacifiOcean, 11.5 miles west of Los Angeles International Airport, four minutes after its initial takeoff roll. Battery backup for instruments was not required at the time. This crash prompted the FAA to require all transport category aircraft to have a standby attitude indicator which is powered by an independent source. False fire warning which prompted the shutting down of the No. 1 engine. Failure of the crew to unpower the heavy electrical loads before shutting down the No. 1 engine. Failure of UAL to repair the No. 3 generator in a timely manner., Shortly after taking off from Schiphol Airport, while climbing through 6,500 feet, the No. 3 engine separated with its pylon from the aircraft and damaged the leading edge of the right wing. The No. 3 engine separated in such a way that the No. 4 engine and pylon also separated from the wing. During an attempted return to the airport, the aircraft crashed into a 11 story building in the Bijlmermeer residential district. The design and certification of the B-747 pylon was found to be inadequate to provide the required level of safety. The system to ensure structural integrity by inspection failed. The separation of the No. 3 engine was initiated by fatigue (corrosion) in the inboard midspar fuse pin. This led to loss of the No. 4 engine and pylon and damage to several systems which ultimately led to loss of control of the aircraft.]"
5,5,338,5_crash landed_crash_mountainous terrain_mountainous,En route / Mountains,"[crash landed, crash, mountainous terrain, mountainous, mountains, mountain, terrain, survived, poor weather, mt]","[Crashed into a jungle mountainside 15 miles short of its destination while en route. The wreckage was found at the 2,500 ft. level of a 2,790 ft. mountain. Weather was poor at the time., Crashed into a mountain in poor weather while en route., Crashed into a hill in poor weather conditions while en route.]"
6,6,307,6_pilot failed_landing attempt_pilot error_crew error,Pilot error,"[pilot failed, landing attempt, pilot error, crew error, landing approach, failure crew, pilots, flying, disoriented, missed approach]","[The aircraft crashed into the Persian Gulf and exploded in flames while attempting to land at Bahrain International Airport. The crew decided to perform a missed approach after it was determined the aircraft was coming in too high and fast. Instructions were given for a 180 degree turn and climb to 2,500 feet. While performing the missed approach the plane suddenly descended rapidly from an altitude of 1,000 feet and crashed into the shallow waters of the gulf approximately 1 mile from the airport. The accident was a result of a fatal combination of factors, including the captain's failure to comply with standard operating procedures and the copilot's actions in not drawing the captain's attention to the deviations of the aircraft from the standard flight parameters. The captain may have suffered a 'spatial disorientation' to ground warning systems, which could have made him falsely perceive the aircraft was pitching up. He responded by making a nose down input, resulting in the aircraft starting to descend, when aircraft warning systems were saying he should increase altitude., After receiving a runway change, the crew executed a missed approach. They became confused on which way to go. They entered an area of clouds and heavy rain, became disoriented and descended in a left wing down attitude until they impacted the ground at a coconut plantation. The pilots experienced spatial disorientation which resulted in improper control of the aircraft. Numerous procedural errors and poor cockpit discipline by the crew. The pilot flew into weather conditions where he had no visual reference. Lack of cross-checking and monitoring of altitude. 
Improper monitoring the flight instruments., The turboprop crashed 4 miles south of Kirksville while on approach to Kirksville Municipal Airport. The plane clipped tree tops before crashing on its belly into a wooded area. Data show the plane descending then climbing slightly in the last four seconds before impact. The plane lacked a modern terrain warning system which would have been required equipment the next year. The pilots' failure to follow established procedures and properly conduct a nonprecision instrument approach at night in instrument meteorological conditions, including their descent below the minimum descent altitude before required visual cues were available (which continued unmoderated until the airplane struck the trees) and their failure to adhere to the established division of duties between the flying and nonflying pilots. Contributing to the accident were the pilots' failure to make standard callouts and the current Federal Aviation Regulations that allow pilots to descend below the MDA into a region in which safe obstacle clearance is not assured based upon seeing the airport approach lights. The pilots' failure to establish and maintain a professional demeanor during the flight and their fatigue likely contributed to their degraded performance.]"
7,7,305,7_crash_accident_survived_crashing,Crash,"[crash, accident, survived, crashing, flying, pilot reported, pilots, died, jet, ran fuel]","[The twin engine Cessna crashed into a densely wooded area, 30 miles south of St. Louis, 25 minutes after taking off from Parks Airport. The plane was flying in rain and fog when the pilot reported he was at 6,500 feet and having problems with the artificial horizon. Radar contact was soon lost with the aircraft. Witnesses reported hearing a plane in a dive followed by an explosion. Wreckage was scattered over a wide area. Mel Carnahan, 66, Governor of Missouri was killed in the accident along with a campaign aide and his son, Roger, who was piloting the plane., The flight was scheduled to fly from New York to St. Maarten. Because of poor visibility, the aircraft could not land at St. Maarten and was diverted to San Juan, Puerto Rico. Five minutes later the crew was told the weather had improved at St. Maarten and were directed back. After 3 missed landing attempts at St. Maarten, the crew asked to be diverted to St. Thomas. By this time, the plane was very low on fuel and the crew chose to divert to St. Croix. While trying to make St. Croix, the aircraft ran out of fuel and ditched into the sea, 35 miles from shore, sinking in 5,000 ft. of water. Improper management of fuel by the crew. Continued, unsuccessful attempts to land at St. Maarten until insufficient fuel remained to reach an alternate airport. A contributing factor was rain showers in the approaching zone not reported to the crew. Inadequate warning given to passengers before the ditching., The aircraft was struck by lightning after it entered an area of thunderstorms and heavy turbulence. The lightning caused a fire which led to the separation of the right wing and part of the left wing. The aircraft crashed in mountainous terrain in the jungle. One German passenger, a female teenager, survived and was found after trekking through the jungle for 9 days. 
Juliane Koepcke had a broken collarbone and was unconscious for an unknown amount of time but had survived the crash still strapped in her seat. When she came to, she set out in vain to find her mother. Maria Koepcke, her mother, a leading Peruvian ornithologist, was dead. Rescue crews searched for the aircraft without success. Koepcke would have to save herself. Koepcke found a stream and began nine days of wading through knee-high water and fighting off swarms of insects and leeches. On the ninth day, she found a canoe and shelter. Then she waited. Hours later, local lumbermen returned and found her. They tried to get her to eat but she was quite sick and refused. Insects had buried eggs in her skin and they were beginning to hatch. One of the men poured gasoline on her and, as she told the London Daily Mail, 'I counted 35 worms that came out of my arms alone.' The men offered what assistance they could provide but it was too late in the day to start the journey back to civilization; she slept one more night in the jungle before the men took her on the final seven hour journey via canoe down the river to a lumber station where she was airlifted to a hospital.]"
8,8,288,8_engine failure_cause accident_pilot failed_failure pilot,Engine failure,"[engine failure, cause accident, pilot failed, failure pilot, mechanical failure, emergency landing, failure captain, malfunctioning, failure, shortly takeoff]","[Crashed shortly after taking off after double engine failure., Crashed while attempting to make an emergency landing after experiencing an engine failure. Failure of the port engine due to accessory drive failure. Failure of the starboard engine due to seizure as a result of deficient maintenance. Error on the part of the crew in lowering the landing gear in emergency conditions., Crashed while attemting an emergency return to the airport shortly after taking off. Engine failure. The failure of the captain to maintain height and a safe flying speed when approaching to land on one engine after the failure of the left engine for reasons unknown.]"
9,9,242,9_landing attempt_pilot attempted_runway attempting_landed,Landing attempt,"[landing attempt, pilot attempted, runway attempting, landed, landing, struck ground, hit ground, caught fire, end runway, short runway]","[The jetliner slid off Runway 13C at Chicago's Midway Airport, went through an airport boundary fence and crashed into two vehicles at a nearby intersection, killing a young 6- year-old boy. Heavy snow was falling at the time of the accident. The plane had circled the airport for 30 to 35 minutes before attempting to land. The pilots' failure to use available reverse thrust in a timely manner to safely slow or stop the airplane after landing, which resulted in a runway overrun. This failure occurred because the pilots' first experience and lack of familiarity with the airplane's autobrake system distracted them from thrust reverser usage during the challenging landing. Contributing to the accident were Southwest Airline's 1) failure to provide its pilots with clear and consistent guidance and training regarding company policies and procedures related to arrival landing distance calculations; 2) programming and design of its onboard performance computer, which did not present inherent assumptions in the program critical to pilot decision making; 3) plan to implement new autobrake procedures without a familiarization period; and 4) failure to include a margin of safety in the arrival assessment to account for operational uncertainties. Also contributing to the accident was the pilots' failure to divert to another airport given reports that included poor braking action and a tailwind component greater than 5 knots. Contributing to the severity of the accident was the absence of an engineering materials arresting system, which was needed because of the limited runway safety area beyond the departure end of runway 31C., The plane attempted to land at Boeing Field but aborted the landing because of low ceiling and visibility. 
During a second landing attempt, the aircraft touched down 2,748 feet beyond the approach end of Runway 20, ran past the end of the runway, hit an automobile killing one person, crashed into a ditch and burst into flames. Caused by the landing of the airplane too far from the approach end of a wet runway and at a speed too great to accomplish a full stop on the runway., The aircraft was making a night, VOR instrument approach to Runway 28. Visual meteorological conditions prevailed, except for some parts of the airport which were obscured by low cloud. Approaching Runway 28, the aircraft was too high to land and the pilot chose instead to enter a right downwind to land on Runway 10. At approximately 400 feet above ground level on final for Runway 10, the aircraft entered some clouds, the base of which were reported by the tower operator to be at 10 meters. The aircraft continued its descent, initially striking some trees about 400 feet short of the runway, then shearing off one of the main landing gear on a hillside road about 200 feet prior to the runway. The aircraft, under full power and nose up, began to climb, but rolled inverted and crashed at the threshold of Runway 10. The wing fuel tanks were ruptured and the fuselage, trailing fire, slid across the runway and stopped on the parallel taxiway. A fire ensued and airport fire and rescue crews put out the fire within 10 minutes. Both pilots and one passenger were killed by impact and the remaining four passengers were taken to the hospital with moderate injuries. The primary cause of the accident was the pilot's decision to descend without visual reference to the surface. A contributing factor was the negative affect on crew performance that resulted from not using oxygen at 12,000 feet during the flight from Ciudad Guayana.]"


# 5. Topic-Document Distribution

In [15]:
# Approximate a topic distribution for every summary using sliding token
# windows (8 tokens wide, advancing 4 tokens at a time). Only the first
# return value is kept; the second one is discarded.
topic_distr, _ = topic_model.approximate_distribution(
    documents=crashes.Summary.to_list(),
    stride=4,
    window=8,
)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  8.00it/s]


In [16]:
# Visualize the topic-document distribution for a single document.
# A dedicated name is used for the row position so that the `index` variable,
# which selects the topic id in the "Get topic stats" cells, is not overwritten
# (re-running those cells afterwards would otherwise show a different topic).
doc_index = 3
print(crashes.Summary.iloc[doc_index])
topic_model.visualize_distribution(topic_distr[doc_index], custom_labels=True)

The airship flew into a thunderstorm and encountered a severe downdraft crashing 20 miles north of Helgoland Island into the sea. The ship broke in two and the control car immediately sank drowning its occupants.


In [17]:
# Hierarchical view of the topics, labelled with the custom topic names.
topic_model.visualize_hierarchy(custom_labels=True)

# 6. Visualization

In [9]:
# 2-D projection of the sentence embeddings, used only for the document plots.
# random_state is fixed (same seed as the clustering UMAP) so the layout is
# reproducible across kernel restarts; without a seed UMAP produces a
# different map on every run.
reduced_embeddings = UMAP(
    n_neighbors=300,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
).fit_transform(embeddings)

In [12]:
# Scatter plot of all documents in the 2-D projection, coloured by topic.
# The summaries are passed as a plain list, as in the other BERTopic calls in
# this notebook: a Series is looked up by index label, which only matches the
# row position while the frame keeps its default RangeIndex.
topic_model.visualize_documents(crashes.Summary.to_list(), reduced_embeddings=reduced_embeddings,
                                custom_labels=True)

In [11]:
# Same document scatter plot, with the topic annotations hidden.
# The summaries are passed as a plain list, as in the other BERTopic calls in
# this notebook: a Series is looked up by index label, which only matches the
# row position while the frame keeps its default RangeIndex.
topic_model.visualize_documents(crashes.Summary.to_list(), reduced_embeddings=reduced_embeddings,
                                custom_labels=True, hide_annotations=True)