<a href="https://colab.research.google.com/github/rkrisanoff/geographical-information-retrieval/blob/main/GIR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install natasha
!pip install yandex-geocoder
!pip install hdbscan
!pip install folium
!pip install re
!pip install pandas

In [26]:
from pprint import pprint

In [27]:
from natasha import (
    Segmenter,
    MorphVocab,
    
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    NewsNERTagger,
    
    PER,
    NamesExtractor,

    Doc
)

segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)
ner_tagger = NewsNERTagger(emb)

names_extractor = NamesExtractor(morph_vocab)

In [28]:
import glob
from google.colab import drive
drive.mount('/content/drive')
news = glob.glob(f"/content/drive/MyDrive/GIR/FONTANKA-NEWS/news/*.txt")[:15]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
# import gdown
# Реальный датасет будет намного больше
# shared_link = 'https://drive.google.com/drive/folders/1YM3wLKR9qJg1SuTTk9UG2iISEObSNtw0?usp=share_link'
# output = 'news'
# news = gdown.download_folder(url=shared_link,output=output,remaining_ok=True,quiet=True) 

In [30]:
documents = dict()

for new_path in news:
  text = ""
  with open(new_path) as new:
    category,title,text = new.readlines()[:3]
    if title[-1:] == '\n':
      title = title[:-1]
  doc = Doc(text)
  doc.segment(segmenter) 
  doc.tag_morph(morph_tagger)
  for token in doc.tokens:
    token.lemmatize(morph_vocab)
  doc.tag_ner(ner_tagger)
  for span in doc.spans:
    span.normalize(morph_vocab)
    span.extract_fact(names_extractor)
    

  documents[title] = []
  for span in doc.spans:
    if span.type == "LOC":
      documents[title].append(span.normal)

In [None]:
for title,words in list(documents.items()):
  print("")
  print(title)
  print('!'+'-'*(len(title)-2)+'!')
  print(words)
  print('^'+'-'*(len(title)-2)+'^')

Кластеризация (пока только начало)

In [32]:
from decimal import Decimal
from yandex_geocoder import Client
# insert your api key
client = Client("")

In [49]:
texts = list(documents.values())
raw_locations_dict = dict()
undefined_locations_set = set()
for text in texts:
  for location in text:
    if location in undefined_locations_set:
      continue
    if location not in raw_locations_dict:
      try:
        coord = client.coordinates(location)
        raw_locations_dict[location] = float(coord[0]),float(coord[1])
      except Exception as e:
        print(e)
        undefined_locations_set.add(location)

Nothing found for "Буграх" not found
Nothing found for "Новосергиевке" not found
Nothing found for "Песочном" not found
Nothing found for "Коломягах" not found
Nothing found for "Хвойной" not found


In [34]:
undefined_locations_set

{'Буграх', 'Коломягах', 'Новосергиевке', 'Песочном', 'Хвойной'}

In [None]:
raw_locations_dict

In [62]:
GC_LON,GC_LAT = client.coordinates("Санкт-Петербург")
GC_LON,GC_LAT = float(GC_LON),float(GC_LAT)
radius = 3
locations_dict = {}.fromkeys(raw_locations_dict.keys())
for key in raw_locations_dict.keys():
  locations_dict[key] = raw_locations_dict[key]
for key,(lon,lat) in raw_locations_dict.items():
  if (abs(lon-GC_LON)**2 + abs(lat-GC_LAT)**2)>radius**2:
    locations_dict.pop(key)


In [None]:
locations_dict

In [148]:
import pandas as pd
loc_lons =[ ]
loc_lats=[]
loc_names=[]
for name,(lon,lat) in locations_dict.items():
  loc_lons.append(lon)
  loc_lats.append(lat)
  loc_names.append(name)
locations_df = pd.DataFrame({"NAME":loc_names,"LON":loc_lons,"LAT":loc_lats})
locations_df.to_csv(f"/content/drive/MyDrive/GIR/geodata.csv", sep=',', encoding='utf-8')

In [149]:
import matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from ipywidgets import interactive
from collections import defaultdict
import hdbscan
import folium
import re
cols = ['#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231', '#911eb4',
        '#46f0f0', '#f032e6', '#bcf60c', '#fabebe', '#008080', '#e6beff', 
        '#9a6324', '#fffac8', '#800000', '#aaffc3', '#808000', '#ffd8b1', 
        '#000075', '#808080']*10
sns.set(style="white")


In [150]:
best_silhouette, best_k = -1, 0
for k in tqdm(range(2, 75)):
    model = KMeans(n_clusters=k, random_state=1).fit(X)
    class_predictions = model.predict(X)
    
    curr_silhouette = silhouette_score(X, class_predictions)
    if curr_silhouette > best_silhouette:
        best_k = k
        best_silhouette = curr_silhouette
        
print(f'K={best_k}')
print(f'Silhouette Score: {best_silhouette}')

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 73/73 [00:15<00:00,  4.62it/s]

K=27
Silhouette Score: 0.543059596658829





In [151]:
m = folium.Map(location=[locations_df.LAT.mean(), locations_df.LON.mean()], zoom_start=9, 
               tiles='OpenStreet Map')
for _, row in locations_df.iterrows():
    folium.CircleMarker(
        location=[row.LAT, row.LON],
        radius=5,
        popup=f"{row.NAME} ({row.LAT}, {row.LON})",
        color='#1787FE',
        fill=True,
        fill_colour='#1787FE'
    ).add_to(m)

m

In [152]:
X = np.array(locations_df[['LON', 'LAT']], dtype='float64')
k = best_k
model = KMeans(n_clusters=k, random_state=2022).fit(X)
class_predictions = model.predict(X)
locations_df[f'CLUSTER_kmeans{k}'] = class_predictions
locations_df

Unnamed: 0,NAME,LON,LAT,CLUSTER_kmeans27
0,Южный Всеволожск,30.648415,59.990431,19
1,Ленинградская область,29.608975,59.337017,13
2,Колтушское шоссе,30.647885,60.023717,19
3,Ленобласть,29.608975,59.337017,13
4,Мурино,30.438578,60.051284,21
...,...,...,...,...
70,Никольское,30.788975,59.704642,11
71,Тосненский район,31.017569,59.372039,9
72,Петербургу,30.315644,59.938955,5
73,Василеостровский район,30.248045,59.941430,5


In [153]:
def create_map(df, cluster_column):
    m = folium.Map(location=[df.LAT.mean(), df.LON.mean()], zoom_start=9, tiles='OpenStreet Map')

    for _, row in df.iterrows():

        if row[cluster_column] == -1:
            cluster_colour = '#000000'
        else:
            cluster_colour = cols[row[cluster_column]]

        folium.CircleMarker(
            location= [row['LAT'], row['LON']],
            radius=5,
            popup= row[cluster_column],
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour
        ).add_to(m)
        
    return m

m = create_map(locations_df, 'CLUSTER_kmeans27')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')

K=27
Silhouette Score: 0.5260787893001507


In [154]:
m