In [2]:
# Load Spacy's pre-trained model for medium-sized English language data
import spacy
nlp = spacy.load('en_core_web_md')

# Read in the text file with the novel's content
with open('fogg.txt', 'r',encoding='utf-8') as f:
    text = f.read()

# Initialize an empty set to store the identified location entities
entities = set()

# Process the text with Spacy's NLP pipeline
doc = nlp(text)

# Loop through each entity in the processed document
for ent in doc.ents:
    # Check if the entity is labeled as a location (LOC) and starts with a capital letter
    if ent.label_ == 'LOC' and ent.text.istitle():
        # Remove any unnecessary characters from the entity text using regex
        entity_text = re.sub(r'[^\w\s]|[\d]', '', ent.text).strip()
        entity_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', entity_text)
        # Add the cleaned entity text to the set of identified entities
        if len(entity_text) > 1:
            entities.add(entity_text)

# Convert the set of entities to a Pandas DataFrame for easy analysis and visualization
df = pd.DataFrame(entities)

In [3]:
pd.set_option('display.max_rows', 500)

In [4]:
df = df.drop_duplicates()

In [5]:
df

Unnamed: 0,0
0,Europe
1,Northern
2,Platte River
3,Pacific
4,San Pablo Bay
5,Suez
6,Havre
7,Bordeaux
8,Humboldt River
9,Lake\nAsphaltite


In [6]:
from geopy.geocoders import Nominatim

# Create geolocator object
geolocator = Nominatim(user_agent="my-app")

# Define function to get location coordinates
def get_location_coordinates(location):
    try:
        # Use geolocator to get location
        location = geolocator.geocode(location)
        # Return latitude and longitude
        return (location.latitude, location.longitude)
    except:
        # If location not found or there is an error, return None
        return None

# Apply the function to the dataframe to get coordinates for each location
df['coordinates'] = df[0].apply(get_location_coordinates)

# Drop any rows where coordinates are None
df = df.dropna()

# Split coordinates into separate columns
df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

# Drop the coordinates column
df = df.drop('coordinates', axis=1)

df = df.sort_values(by='longitude', ascending=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)


In [7]:
df

Unnamed: 0,0,latitude,longitude
44,Humboldt Mountains,-44.670229,168.224466
29,South,36.638392,127.696119
30,Shanghai River,30.778942,121.073122
3,Queenstown Harbour,1.278822,103.785394
10,Andaman,10.000105,93.000019
9,Asia,51.208697,89.234375
6,Malabar Hill,18.958162,72.803366
12,Suez,29.974498,32.537086
36,Uranus,44.422216,26.082092
20,Africa,11.502434,17.757812


In [7]:
import plotly.graph_objs as go

fig = go.Figure()

# Add trace for the points
fig.add_trace(go.Scattermapbox(
    lat=df['latitude'],
    lon=df['longitude'],
    mode='markers',
    marker=dict(size=10, color='blue'),
    name='Points'
))

# Add trace for connecting the dots
fig.add_trace(go.Scattermapbox(
    lat=df['latitude'],
    lon=df['longitude'],
    mode='lines',
    line=dict(color='red', width=2),
    hoverinfo='none'
))

# Update layout
fig.update_layout(
    mapbox=dict(
        style='open-street-map',
        center=dict(lat=df['latitude'].mean(), lon=df['longitude'].mean()),
        zoom=3
    ),
    margin=dict(l=0, r=0, t=0, b=0)
)

fig.show()