In [1]:
import pandas as pd
import numpy as np
import os
import re
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
#Required NTKL
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
data = pd.read_csv('dataset.csv', delimiter=',', on_bad_lines='skip')

In [4]:
data.dropna(inplace=True)

In [5]:
data['area_name'].fillna(data['city'], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['area_name'].fillna(data['city'], inplace=True)


In [6]:
# prompt: remove numericals from area_name

data['area_name'] = data['area_name'].apply(lambda x: re.sub(r'\d+', '', str(x)))
data["area_name"] = data["area_name"].str.replace("/","").replace("+","").replace("-"," ").replace("  "," ").replace("-nd","")
data['area_name'] = data['area_name'].str.lstrip()

In [7]:
data.head()

Unnamed: 0,area_name,city,state,pincode,latitude,longitude
0,"Doctor Rajkumar Road, nd Stage, Rajajinagar",Bengaluru,Karnataka,560055,13.006378,77.554236
1,"Sampige Road, Malleshwaram",Bengaluru,Karnataka,560055,13.003064,77.571141
3,"KG Halli, D' Souza Layout, Sampangi Rama Nagar",Bengaluru,Karnataka,560001,12.97159,77.596058
4,"Bannerghatta Road, Arakere Gate, Omkar Nagar, ...",Bengaluru,Karnataka,560076,12.890909,77.59677
7,"rd Phase, J. P. Nagar",Bengaluru,Karnataka,560078,12.914233,77.599332


In [8]:
punctuation_signs = list("?:!.,;")
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

data['area_name'] = data['area_name'].str.replace("\r", "").replace("\n", " ").replace("+", "").replace("/", " ").replace("    ", " ").replace('"', '').str.lower()
for punct_sign in punctuation_signs:
    data['area_name'] = data['area_name'].str.replace(punct_sign, '')


In [9]:
data['Full_Address'] = (data['area_name'] + " " + data['city'] + " " + data['state']).str.lower()

# Create new features from the address
data['address_length'] = data['Full_Address'].apply(lambda x: len(x.split()))
data['num_special_chars'] = data['Full_Address'].apply(lambda x: sum(not c.isalnum() for c in x))

# Example of other possible features
# Count the number of numeric characters
data['num_numeric'] = data['Full_Address'].apply(lambda x: sum(c.isdigit() for c in x))



In [10]:
X = data['Full_Address']  # Feature: Full address
y = data[['latitude', 'longitude']]  # Target: Latitude and Longitude

In [11]:
data.isna().sum()

area_name            0
city                 0
state                0
pincode              0
latitude             0
longitude            0
Full_Address         0
address_length       0
num_special_chars    0
num_numeric          0
dtype: int64

In [12]:
# Check if Full_Address column exists and is in the correct form
print(data['Full_Address'].tail())


24914    khan bahadur abdul rehman road austin town nee...
24915                      rajajinagar bengaluru karnataka
24920    commercial street tasker town shivaji nagar be...
24921    dispensary road tasker town shivaji nagar beng...
24925    nd e cross road opp sanjay gandhi hospital th ...
Name: Full_Address, dtype: object


In [13]:
# prompt: data size

print(data.shape)


(15788, 10)


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1,2))

# Transform the address data into vector form
X_tfidf = tfidf_vectorizer.fit_transform(data['Full_Address'])


In [15]:
# prompt: print the shape of x and y

print(X_tfidf.shape)
print(y.shape)


(15788, 5000)
(15788, 2)


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Assuming 'y' is your target variable (latitude/longitude)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest - Mean Absolute Error: {mae}")
print(f"Random Forest - Mean Squared Error: {mse}")
print(f"Random Forest - R-squared: {r2}")


Random Forest - Mean Absolute Error: 0.04161325973106338
Random Forest - Mean Squared Error: 0.05337668139892558
Random Forest - R-squared: 0.987276413895884


In [17]:
def predict_lat_long(address):
    # Vectorize the new address
    address_vectorized = tfidf_vectorizer.transform([address])
    
    # Predict latitude and longitude
    prediction = rf_model.predict(address_vectorized)
    
    return prediction[0]

# Test the function with a new address
new_address = "banashankari bengaluru karnataka"
predicted_lat_long = predict_lat_long(new_address)
print(f"Predicted Latitude and Longitude for '{new_address}': {predicted_lat_long}")

Predicted Latitude and Longitude for 'banashankari bengaluru karnataka': [12.9379496 77.5579457]
