In [1]:
import pandas as pd

In [2]:
cities= pd.read_csv("cities.csv")
countries= pd.read_csv("countries.csv")
states= pd.read_csv("states.csv")

cities.columns

Index(['id', 'name', 'state_id', 'state_code', 'state_name', 'country_id',
       'country_code', 'country_name', 'latitude', 'longitude', 'wikiDataId'],
      dtype='object')

In [3]:
countries.columns

Index(['id', 'name', 'iso3', 'iso2', 'numeric_code', 'phone_code', 'capital',
       'currency', 'currency_name', 'currency_symbol', 'tld', 'native',
       'region', 'region_id', 'subregion', 'subregion_id', 'nationality',
       'timezones', 'latitude', 'longitude', 'emoji', 'emojiU'],
      dtype='object')

In [4]:
states.columns

Index(['id', 'name', 'country_id', 'country_code', 'country_name',
       'state_code', 'type', 'latitude', 'longitude'],
      dtype='object')

creating merged dataset
countries+city+states

In [5]:
merged_data = pd.merge(cities, states, left_on='state_id', right_on='id', suffixes=('_city', '_state'))
merged_data = pd.merge(merged_data, countries, left_on='country_id_city', right_on='id', suffixes=('', '_country'))

# # Select the desired fields for the new dataset
new_dataset = merged_data[['name_city', 'name_state', 'country_name_city', 'latitude', 'longitude']]

# # Rename the columns for better clarity
new_dataset.columns = ['City Name', 'State Name', 'Country Name', 'Latitude', 'Longitude']

# Display the new dataset
print(new_dataset)
# merged_data.country_name_city

                  City Name         State Name Country Name  Latitude  \
0                 Ashkāsham         Badakhshan  Afghanistan      33.0   
1                  Fayzabad         Badakhshan  Afghanistan      33.0   
2                      Jurm         Badakhshan  Afghanistan      33.0   
3                   Khandūd         Badakhshan  Afghanistan      33.0   
4                 Rāghistān         Badakhshan  Afghanistan      33.0   
...                     ...                ...          ...       ...   
150568             Redcliff  Midlands Province     Zimbabwe     -20.0   
150569             Shangani  Midlands Province     Zimbabwe     -20.0   
150570             Shurugwi  Midlands Province     Zimbabwe     -20.0   
150571    Shurugwi District  Midlands Province     Zimbabwe     -20.0   
150572  Zvishavane District  Midlands Province     Zimbabwe     -20.0   

        Longitude  
0            65.0  
1            65.0  
2            65.0  
3            65.0  
4            65.0  
...

Missing values

In [6]:
# Check for missing values
missing_values = new_dataset.isna().sum()

# Print the count of missing values in each column
print("Missing Values:")
print(missing_values)

# To drop rows with missing values (if desired)
# new_dataset = new_dataset.dropna()

# To fill missing values with a specific value (if desired)
# new_dataset = new_dataset.fillna(some_value)

# Check for duplicate entries
duplicates = new_dataset.duplicated(subset=['City Name', 'State Name', 'Country Name', 'Latitude', 'Longitude'])

# Remove duplicate entries if found
new_dataset = new_dataset[~duplicates]
new_dataset



Missing Values:
City Name       0
State Name      0
Country Name    0
Latitude        0
Longitude       0
dtype: int64


Unnamed: 0,City Name,State Name,Country Name,Latitude,Longitude
0,Ashkāsham,Badakhshan,Afghanistan,33.0,65.0
1,Fayzabad,Badakhshan,Afghanistan,33.0,65.0
2,Jurm,Badakhshan,Afghanistan,33.0,65.0
3,Khandūd,Badakhshan,Afghanistan,33.0,65.0
4,Rāghistān,Badakhshan,Afghanistan,33.0,65.0
...,...,...,...,...,...
150568,Redcliff,Midlands Province,Zimbabwe,-20.0,30.0
150569,Shangani,Midlands Province,Zimbabwe,-20.0,30.0
150570,Shurugwi,Midlands Province,Zimbabwe,-20.0,30.0
150571,Shurugwi District,Midlands Province,Zimbabwe,-20.0,30.0


NLP text Processing

In [14]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\shree\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [8]:
query = pd.read_csv("query.csv")
query.columns

Index(['sentence'], dtype='object')

In [9]:
# Convert the 'sentence' column to string type
query['sentence'] = query['sentence'].astype(str)

# Apply word tokenization
query['tokenized_text'] = query['sentence'].apply(word_tokenize)

# # Remove stopwords
stop_words = set(stopwords.words('english'))
query['processed_text'] = query['tokenized_text'].apply(lambda x: [word for word in x if word.lower() not in stop_words])


In [10]:
# Perform stemming or lemmatization if needed
# Example for stemming:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
query['processed_text'] = query['processed_text'].apply(lambda x: [stemmer.stem(word) for word in x])


In [17]:
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

# Function to filter out non-place names
def filter_place_names(sentence):
    words = word_tokenize(sentence)
    tagged_words = pos_tag(words)
    named_entities = ne_chunk(tagged_words)

    place_names = []
    for tagged_tree in named_entities:
        if hasattr(tagged_tree, 'label') and tagged_tree.label() == 'GPE': # GPE stands for Geo-Political Entity
            place_names.append(' '.join(c[0] for c in tagged_tree.leaves()))

    return place_names

# Example usage with a sample sentence
sample_sentence = "I visited New York and had a great time in the city."
filtered_places = filter_place_names(sample_sentence)
print(filtered_places)

['New York']


In [19]:
# %pip install fuzzywuzzy
from fuzzywuzzy import fuzz

def fuzzy_match(query, choices, threshold=70):
    """
    Function to perform fuzzy matching on a query string against a list of choices.
    :param query: The query string to be matched.
    :param choices: List of strings to match against.
    :param threshold: Minimum similarity threshold for a match. Default is 70.
    :return: List of tuples containing the matched strings and their similarity scores.
    """
    matches = []
    for choice in choices:
        similarity = fuzz.ratio(query, choice)
        if similarity >= threshold:
            matches.append((choice, similarity))
    return matches

# Example usage
canonical_names = ['New York', 'Los Angeles', 'San Francisco', 'London', 'Paris', 'Berlin']
query = 'Nwe York'

matches = fuzzy_match(query, canonical_names)
if matches:
    print(f"Potential matches for '{query}':")
    for match, score in matches:
        print(f"{match} (Similarity: {score})")
else:
    print(f"No matches found for '{query}'.")


Potential matches for 'Nwe York':
New York (Similarity: 88)


In [1]:
from fuzzywuzzy import process

# Sample data
countries = ['United States', 'United Kingdom', 'Canada', 'Australia']
cities = ['New York', 'London', 'Toronto', 'Sydney']
states = ['New York', 'London', 'Ontario', 'New South Wales']

def fuzzy_match_with_column(query, countries, cities, states, threshold=70):
    """
    Function to perform fuzzy matching on a query string across different fields and indicate the specific column.
    :param query: The query string to be matched.
    :param countries: List of country names.
    :param cities: List of city names.
    :param states: List of state names.
    :param threshold: Minimum similarity threshold for a match. Default is 70.
    :return: A tuple indicating the identified word, its column, and the similarity score.
    """
    matches = []
    for col, values in {'Country': countries, 'City': cities, 'State': states}.items():
        result = process.extractOne(query, values)
        if result[1] >= threshold:
            matches.append((result[0], col, result[1]))
    return matches

# Example usage
query = 'New York'
matches = fuzzy_match_with_column(query, countries, cities, states)

if matches:
    print(f"Potential matches for '{query}':")
    for match, col, score in matches:
        print(f"'{match}' found in the {col} column (Similarity: {score})")
else:
    print(f"No matches found for '{query}'.")


Potential matches for 'New York':
'New York' found in the City column (Similarity: 100)
'New York' found in the State column (Similarity: 100)




In [5]:
import pandas as pd
dir(pd)
# list(pd)
print(pd)

<module 'pandas' from 'C:\\Users\\shree\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\pandas\\__init__.py'>
