In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
!pip install --upgrade pip
!pip install --upgrade numpy
!pip install --upgrade sentence_transformers
!conda install -c conda-forge hdbscan --y
!pip install bokeh
!pip install --upgrade bertopic[visualization]

# Airplane Crashes Dataset Since 1908

There are 5,008 accidents recorded since 1908 and 17 features in this dataset.
It contains the crash date, route, flight number, aircraft type, operator, military/civilian status, the number of people on board (crew + passengers), the number of fatalities, and a summary that gives a brief description of the accident and its cause, if known. <br>


I will try to cluster the summary data using BERTopic, a topic modeling technique that leverages transformers.


### Database Format
Date: Date of accident, in the format - January 01, 2001<br>
Time: Local time, in 24 hr. format unless otherwise specified<br>
Operator: Airline or operator of the aircraft<br>
Flight #: Flight number assigned by the aircraft operator<br>
Route: Complete or partial route flown prior to the accident<br>
AC Type: Aircraft type<br>
Reg: ICAO registration of the aircraft<br>
cn / ln: Construction or serial number / Line or fuselage number<br>
Aboard: Total aboard (passengers / crew)<br>
Passengers aboard: Passengers aboard<br>
Crew aboard: Crew aboard<br>
All fatalities : Total fatalities aboard (passengers / crew)<br>
Passenger fatalities: Total Passenger fatalities<br>
Crew fatalities: Total Crew fatalities<br>
Ground: Total killed on the ground<br>
Summary: Brief description of the accident and cause if known<br>


![](https://i.imgur.com/tHiK3qb.png)

In [None]:
# Read the crash records from the Kaggle input directory into a DataFrame.
csv_path = '/kaggle/input/airplane-crashes-dataset-since-1908/Airplane_crashes_dataset_since_1908.csv'
data = pd.read_csv(csv_path)

# Show the last rows as a quick check of the loaded frame.
data.tail()

In [None]:
# Topic model used to cluster the crash summaries.
from bertopic import BERTopic
import random
# Seed Python's `random` module; it is used later to sample palette colours.
# NOTE(review): this does not seed numpy or the libraries imported below.
random.seed(42)
import warnings
# Suppress every warning from this point on (including those raised by the
# imports that follow).
warnings.filterwarnings('ignore')

# Sentence embeddings and the manifold module (t-SNE) used for the 2-D layout.
from sentence_transformers import SentenceTransformer
import sklearn.manifold

# Bokeh pieces used for the interactive scatter plot.
from bokeh.io import output_file, show
from bokeh.models import ColumnDataSource, HoverTool, LinearColorMapper
from bokeh.palettes import plasma, d3, Turbo256
from bokeh.plotting import figure
from bokeh.transform import transform
import bokeh.io
# Route Bokeh output to the notebook.
bokeh.io.output_notebook()

import bokeh.plotting as bpl
import bokeh.models as bmo
# NOTE(review): output_notebook() was already called via bokeh.io above;
# this second call looks redundant -- confirm before removing.
bpl.output_notebook()

In [None]:
# Keep only the rows that have a summary, then collect the summaries as a
# plain list of strings for the models below.
data = data[data.summary.notna()]
text = data.summary.values.tolist()

# Fit BERTopic on the summaries with nr_topics=40.
# nr_topics="auto" is the alternative setting that was tried:
#model = BERTopic(language="english", nr_topics="auto")
# NOTE(review): no seed is passed to BERTopic here, so topic assignments may
# still vary between runs -- confirm if exact reproducibility is required.
model = BERTopic(language="english", nr_topics=40)
topics, probs = model.fit_transform(text)

# A separate sentence-embedding model is used only to obtain vectors for the
# 2-D layout of the scatter plot.
model_st = SentenceTransformer('stsb-distilbert-base')
embeddings = model_st.encode(text)

# t-SNE is stochastic; a fixed random_state keeps the layout the same across
# "Restart & Run All".
out = sklearn.manifold.TSNE(n_components=2, random_state=42).fit_transform(embeddings)

In [None]:
# Build one label per topic id that actually occurs in `topics`.
# Labels are looked up by id, not by list position, so the cell works even
# when there is no outlier topic (-1) or the ids are not a contiguous 0..n-1
# range.
topic_ids = sorted(set(topics))
topic_labels = {}
for topic_id in topic_ids:
    if topic_id == -1:
        topic_labels[topic_id] = '-1: outlier'
    else:
        # First seven entries of the topic; element 0 of each entry is the word.
        keywords = [entry[0] for entry in model.get_topic(topic_id)[:7]]
        topic_labels[topic_id] = ' '.join([str(topic_id) + ':'] + keywords)

# Ordered list of labels (the colour-mapper factors) and the per-document label.
topic_words = [topic_labels[topic_id] for topic_id in topic_ids]
exp_topics = [topic_labels[topic_id] for topic_id in topics]

# One randomly chosen Turbo256 colour per label.
clrs = random.sample(Turbo256, len(topic_words))
color_map = bmo.CategoricalColorMapper(factors=topic_words, palette=clrs)

In [None]:
# 2-D coordinates from the t-SNE output, plus the raw summary text for hover.
list_x = out[:, 0]
list_y = out[:, 1]
desc = text

source = ColumnDataSource(data=dict(x=list_x, y=list_y, desc=desc, topic=exp_topics))

# Hovering a point shows its row index, the summary text and its topic label.
hover = HoverTool(tooltips=[
    ("index", "$index"),
    ('desc', '@desc'),
    ('topic', '@topic')
])

# `width`/`height` replace `plot_width`/`plot_height`, which Bokeh 3 no longer
# accepts.
p = figure(width=800, height=800, tools=[hover], title="Crash Summary Clusters")

# `scatter` (default marker: circle) replaces the deprecated `circle(size=...)`.
# Each point is coloured by its topic label through the categorical mapper.
# To add a legend, pass legend_field='topic' here and then set
# p.legend.location / p.legend.click_policy.
p.scatter('x', 'y', size=10, source=source,
          fill_color=transform('topic', color_map))

bpl.show(p)

Let's list topic frequencies and their keywords

In [None]:
# Topic frequency table from the fitted model; its 'Topic' column is used
# below to look up each topic's keywords.
topic_df = model.get_topic_freq()

def get_keywords(i):
    """Return a space-separated keyword string for topic ``i``.

    Topic ``-1`` is labelled ``'outlier'`` without consulting the model.
    For any other id, the first seven entries of ``model.get_topic(i)`` are
    used, taking the first element of each entry as the keyword.
    """
    if i == -1:
        return 'outlier'
    top_entries = model.get_topic(i)[:7]
    return ' '.join(entry[0] for entry in top_entries)

# Look up the keyword string for every topic id and store it alongside the
# frequencies.
topic_df['keywords'] = topic_df['Topic'].map(get_keywords)

# Display the resulting table.
topic_df

# Visualize Topics (LDAvis-style)
We can visualize the topics that were generated in a way very similar to LDAvis.

In [None]:
# Bare last expression, so the notebook displays whatever
# visualize_topics() returns for the fitted model.
model.visualize_topics()