In [None]:
import zipfile
import os
import shutil
# The archive is ~500 MB and can be downloaded from:
# http://download.geonames.org/export/dump/allCountries.zip
# Unpack every member of the archive into /content/.
with zipfile.ZipFile("/content/drive/MyDrive/Datasets/NLP/geonames/allCountries.zip", 'r') as zf:
  zf.extractall('/content/')

In [None]:
import pandas as pd
import numpy as np
import csv
import pandas as pd
import json

def toString(x: str):
    """Return ``str(x)``, or ``None`` when ``x`` is the empty string.

    An empty field is treated as "no value" rather than as an empty text.
    """
    return None if x == "" else str(x)


def toInt(x: str):
    """Parse ``x`` with ``int``; an empty string yields ``None`` instead.

    Any other text that ``int`` cannot parse propagates its ``ValueError``.
    """
    if x != "":
        return int(x)

    # Empty field: there is no number to report.
    return None


def toFloat(x: str):
    """Parse ``x`` with ``float``; an empty string yields ``None`` instead.

    Any other text that ``float`` cannot parse propagates its ``ValueError``.
    """
    isBlank = x == ""
    return None if isBlank else float(x)


def lineToJSON(line: str) -> dict:
    """Convert one tab-separated row of a geonames table into a dict.

    The row is split on tab characters and each field is stored under its
    column name after being passed through that column's converter
    (``toString``, ``toInt`` or ``toFloat``), so empty fields become ``None``.

    Fields are matched to columns by position. A row with fewer fields than
    columns yields a dict holding only the leading columns; fields beyond the
    last known column are ignored instead of raising ``IndexError``.

    Args:
        line: A single row, without its trailing newline.

    Returns:
        A dict mapping column names to converted values.

    Raises:
        ValueError: If a numeric column holds text its converter cannot parse.
    """
    # (column name, converter) pairs, in the order the fields appear in a row.
    tableColumns = [
        ("geonameid", toString), ("name", toString), ("asciiname", toString),
        ("alternatenames", toString), ("latitude", toFloat), ("longitude", toFloat),
        ("feature class", toString), ("feature code", toString), ("country code", toString),
        ("cc2", toString), ("admin1 code", toString), ("admin2 code", toString),
        ("admin3 code", toString), ("admin4 code", toString), ("population", toInt),
        ("elevation", toInt), ("dem", toInt), ("timezone", toString), ("modification date", toString),
    ]

    values = line.split("\t")

    # zip() stops at the shorter sequence, which is what makes both short
    # rows and rows with surplus fields safe to process.
    return {
        columnName: convert(value)
        for (columnName, convert), value in zip(tableColumns, values)
    }


def isMountain(geoObject: dict) -> bool:
    """Tell whether a geonames object is a single mountain or a single peak.

    Only objects of feature class "T" with one of these feature codes match:
       MT - mountain: an elevation standing high above the surrounding area
            with small summit area, steep slopes and local relief of 300m or more
       PK - peak: a pointed elevation atop a mountain, ridge, or other
            hypsographic feature

    The plural codes are NOT accepted by this check:
       MTS - mountains: a mountain range or a group of mountains or high ridges
       PKS - peaks: pointed elevations atop a mountain, ridge, or other
             hypsographic features
    """
    # The feature code is only looked up once the feature class matches.
    if geoObject["feature class"] != "T":
        return False

    return geoObject["feature code"] in ("MT", "PK")

# Each pair is (geonames column name, name used in the mountain CSV header).
# The order here is the column order of every mountain row.
_mt_column_mapping = [
    ("geonameid", "id"),
    ("asciiname", "name"),
    ("latitude", "latitude"),
    ("longitude", "longitude"),
    ("feature code", "feature_code"),
    ("elevation", "elevation"),
]

# Columns read from a geonames object when building a mountain row.
mt_stock_columns = [stockName for stockName, _ in _mt_column_mapping]

# Header names written to the CSV, positionally matching mt_stock_columns.
mt_new_columns = [newName for _, newName in _mt_column_mapping]

def convertToMountain(geoObject: dict) -> list:
    """Build a mountain row from a geonames object.

    The row holds one value per entry of ``mt_stock_columns``, in that order.
    For the "elevation" column the object's "dem" value is used whenever its
    "elevation" is ``None``; every other column is copied unchanged.

    Args:
        geoObject: A geonames object providing every column listed in
            ``mt_stock_columns`` plus "dem".

    Returns:
        The row as a list. Its elevation entry is ``None`` only when both
        "elevation" and "dem" are ``None``.

    Raises:
        KeyError: If ``geoObject`` lacks a required column.
    """
    def columnValue(column):
        if column != "elevation":
            return geoObject[column]

        # Fall back to "dem" when no explicit elevation is recorded.
        elevation = geoObject["elevation"]
        return geoObject["dem"] if elevation is None else elevation

    return [columnValue(column) for column in mt_stock_columns]

import random

def extract1500RandomMountains(inputFile: str, outputFile: str, minimumElevation=0, sampleSize=1500) -> 'list[list]':
    """Read a geonames dump and return a random sample of its mountains.

    Every row accepted by ``isMountain`` is converted with
    ``convertToMountain`` and kept when its elevation (the last entry of the
    row) is at least ``minimumElevation``. Rows whose elevation is unknown are
    skipped, as are blank lines.

    Args:
        inputFile: Path to the tab-separated geonames dump.
        outputFile: Unused here; kept so existing callers keep working.
        minimumElevation: Lowest elevation a mountain needs to be kept.
        sampleSize: Number of mountains to draw; 1500 by default.

    Returns:
        A list of mountain rows chosen without replacement. It holds
        ``sampleSize`` rows, or every matching row when fewer are available.
    """
    mountains = []

    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is not UTF-8 everywhere.
    with open(inputFile, "r", encoding="utf-8") as file:
        for line in file:
            # Remove only the line terminator: a bare strip() would also eat
            # trailing tabs and thereby drop empty trailing columns.
            row = line.rstrip("\r\n")
            if not row.strip():
                continue

            geoObject = lineToJSON(row)
            if not isMountain(geoObject):
                continue

            mountain = convertToMountain(geoObject)
            elevation = mountain[-1]
            # The elevation is None when neither "elevation" nor "dem" is
            # filled in; such rows cannot be compared against the threshold.
            if elevation is not None and int(elevation) >= minimumElevation:
                mountains.append(mountain)

    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at what was actually found.
    return random.sample(mountains, min(sampleSize, len(mountains)))


def writeMountains(mountains: 'list[list]', outputFile: str):
    """Save the mountain rows as a CSV file at ``outputFile``.

    The first line of the file is the ``mt_new_columns`` header; each mountain
    row follows on its own line. An existing file is overwritten.
    """
    # newline='' lets the csv module control the line endings itself.
    with open(outputFile, 'w', newline='') as destination:
        rowWriter = csv.writer(destination)
        rowWriter.writerow(mt_new_columns)
        for mountain in mountains:
            rowWriter.writerow(mountain)


# Geonames dump that extract1500RandomMountains reads.
input_path = "/content/allCountries.txt"

# Destination CSV. A copy of this file is kept in the repo at
# NLP/datasets/partial_datasets/random1500Mountains.csv
# NOTE(review): no random seed is set in the visible code, so a re-run will
# presumably produce a different sample than the committed copy - confirm.
output_path = "/content/random1500Mountains.csv"

# Draw the sample with the default minimumElevation, then write it as CSV.
mountains = extract1500RandomMountains(input_path, output_path)
writeMountains(mountains, output_path)