## Loading the DataSet and finding some basic information

In [89]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import geopy
from sklearn.feature_selection import VarianceThreshold

In [70]:
df=pd.read_csv("hackathon22_osm_bang_mumbai.csv")
df

Unnamed: 0,source,poi_code,name,poi_type,lat,long,address,city,state,country,pin_code,brand
0,OSM,POI_1003_6612393918,Parle,shop.food.bakery,13.049754,77.492895,,Bangalore,Karnataka,India,,
1,OSM,POI_1003_6613030600,The Black Pearl,food.restaurant,12.939213,77.694701,,Bangalore,Karnataka,India,,
2,OSM,POI_1003_4668296373,Temple,,12.940448,77.693816,,Bangalore,Karnataka,India,,
3,OSM,POI_1003_3771995983,Kadubeesanahalli Underpass,,12.939450,77.695305,,Bangalore,Karnataka,India,,
4,OSM,POI_1003_2413350412,,health.wellbeing.swimming_pool,12.940660,77.693607,,Bangalore,Karnataka,India,,
...,...,...,...,...,...,...,...,...,...,...,...,...
42925,OSM,POI_1003_3740962178,,,12.976237,77.611640,,Bangalore,Karnataka,India,,
42926,OSM,POI_1003_3740962179,,,12.976275,77.611514,,Bangalore,Karnataka,India,,
42927,OSM,POI_1003_3740962181,,,12.976317,77.611396,,Bangalore,Karnataka,India,,
42928,OSM,POI_1003_3742106925,Soudhamani Estate NSB,,12.821388,77.511637,,Bangalore,Karnataka,India,560082,


In [5]:
df.columns

Index(['source', 'poi_code', 'name', 'poi_type', 'lat', 'long', 'address',
       'city', 'state', 'country', 'pin_code', 'brand'],
      dtype='object')

### This columns represents the following schema
1. source: source from where the data was collected
2. poi_code: unique identifier of the POI
3. name: name of the POI
4. poi_type: type of POI (e.g car dealership, shopping mall, etc)
5. lat: latitude of the POI
6. long: longitude of the POI
7. address: address of POI
8. city: city of POI
9. state: state of POI
10. country: country of POI
11. pin_code: pincode of POI
12. brand: brand information of POI


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42930 entries, 0 to 42929
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   source    42930 non-null  object 
 1   poi_code  42930 non-null  object 
 2   name      29986 non-null  object 
 3   poi_type  22401 non-null  object 
 4   lat       42930 non-null  float64
 5   long      42930 non-null  float64
 6   address   2663 non-null   object 
 7   city      42930 non-null  object 
 8   state     42930 non-null  object 
 9   country   42930 non-null  object 
 10  pin_code  7953 non-null   object 
 11  brand     1081 non-null   object 
dtypes: float64(2), object(10)
memory usage: 3.9+ MB


## Finding features having missing values

In [7]:
features_with_missing_values=[feature for feature in df.columns if df[feature].isnull().sum()>0]
features_with_missing_values

['name', 'poi_type', 'address', 'pin_code', 'brand']

### Features having missing values are
1. name
2. poi_type
3. address
4. pin_code
5. brand

## Splitting features into categorical and numerical features

In [24]:
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']
categorical_feature

['source',
 'poi_code',
 'name',
 'poi_type',
 'address',
 'city',
 'state',
 'country',
 'pin_code',
 'brand']

In [25]:
numerical_feature=[feature for feature in df.columns if df[feature].dtype!='O']
numerical_feature

['lat', 'long']

In [73]:
def get_zipcode(df, geolocator, lat_field, lon_field):
    location = geolocator.reverse((df[lat_field], df[lon_field]))
    return location.raw['address']['postcode']


geolocator = geopy.Nominatim(user_agent='1234')
df1 = pd.DataFrame({
    'Lat': df['lat'].tolist(),
    'Lon': df['long'].tolist()
})
zipcode = df1.apply(get_zipcode, axis=1, geolocator=geolocator, lat_field='Lat', lon_field='Lon')

KeyboardInterrupt: 

## Handling Missing Values: From EDA it is evident that categorical features only have missing values so we will handle it using by adding new category "Missing"

In [74]:
#function to handle missing values after passing certain dataframe and feature name
def impute_nan(df,variable):
    """
                                Method Name: impute_nan
                                Description: handles missing values after we provide a dataframe and feature column
                                Outcome: we get a feature values without any missing values
                            

                                Written By: Saurabh Naik
                                Version: 1.0
                                Revisions: None
        """
    df["new_"+variable]=np.where(df[variable].isnull(),"Missing",df[variable])

In [77]:
features_with_missing_values=[feature for feature in df.columns if df[feature].isnull().sum()>0]
for feature in features_with_missing_values:
    impute_nan(df,feature)
df.head()

Unnamed: 0,source,poi_code,name,poi_type,lat,long,address,city,state,country,pin_code,brand,new_name,new_poi_type,new_address,new_pin_code,new_brand
0,OSM,POI_1003_6612393918,Parle,shop.food.bakery,13.049754,77.492895,,Bangalore,Karnataka,India,,,Parle,shop.food.bakery,Missing,Missing,Missing
1,OSM,POI_1003_6613030600,The Black Pearl,food.restaurant,12.939213,77.694701,,Bangalore,Karnataka,India,,,The Black Pearl,food.restaurant,Missing,Missing,Missing
2,OSM,POI_1003_4668296373,Temple,,12.940448,77.693816,,Bangalore,Karnataka,India,,,Temple,Missing,Missing,Missing,Missing
3,OSM,POI_1003_3771995983,Kadubeesanahalli Underpass,,12.93945,77.695305,,Bangalore,Karnataka,India,,,Kadubeesanahalli Underpass,Missing,Missing,Missing,Missing
4,OSM,POI_1003_2413350412,,health.wellbeing.swimming_pool,12.94066,77.693607,,Bangalore,Karnataka,India,,,Missing,health.wellbeing.swimming_pool,Missing,Missing,Missing


## Droping features having missing values

In [78]:
df.drop(features_with_missing_values,axis=1,inplace=True)

## Checking if missing values are present in the dataframe after handling that

In [79]:
len([feature for feature in df.columns if df[feature].isnull().sum()>0])

0

In [82]:
df

Unnamed: 0,source,poi_code,lat,long,city,state,country,new_name,new_poi_type,new_address,new_pin_code,new_brand
0,OSM,POI_1003_6612393918,13.049754,77.492895,Bangalore,Karnataka,India,Parle,shop.food.bakery,Missing,Missing,Missing
1,OSM,POI_1003_6613030600,12.939213,77.694701,Bangalore,Karnataka,India,The Black Pearl,food.restaurant,Missing,Missing,Missing
2,OSM,POI_1003_4668296373,12.940448,77.693816,Bangalore,Karnataka,India,Temple,Missing,Missing,Missing,Missing
3,OSM,POI_1003_3771995983,12.939450,77.695305,Bangalore,Karnataka,India,Kadubeesanahalli Underpass,Missing,Missing,Missing,Missing
4,OSM,POI_1003_2413350412,12.940660,77.693607,Bangalore,Karnataka,India,Missing,health.wellbeing.swimming_pool,Missing,Missing,Missing
...,...,...,...,...,...,...,...,...,...,...,...,...
42925,OSM,POI_1003_3740962178,12.976237,77.611640,Bangalore,Karnataka,India,Missing,Missing,Missing,Missing,Missing
42926,OSM,POI_1003_3740962179,12.976275,77.611514,Bangalore,Karnataka,India,Missing,Missing,Missing,Missing,Missing
42927,OSM,POI_1003_3740962181,12.976317,77.611396,Bangalore,Karnataka,India,Missing,Missing,Missing,Missing,Missing
42928,OSM,POI_1003_3742106925,12.821388,77.511637,Bangalore,Karnataka,India,Soudhamani Estate NSB,Missing,Missing,560082,Missing


## Finding no of unique categories in each feature

In [85]:
categorical_feature=[feature for feature in df.columns if df[feature].dtype=='O']
for col in categorical_feature:
    print(col, ': ', len(df[col].unique()), ' labels')

source :  1  labels
poi_code :  42930  labels
city :  2  labels
state :  2  labels
country :  1  labels
new_name :  23462  labels
new_poi_type :  149  labels
new_address :  2425  labels
new_pin_code :  314  labels
new_brand :  227  labels


## Converting categorical feature into numerical features

In [87]:
# let's find a key value pair for a category and its frequecy
# Then assign this key value pair to a map varible
# Then replacing the category with its frequency
for col in categorical_feature:
    df_frequency_map = df[col].value_counts().to_dict()
    df[col] = df[col].map(df_frequency_map)
df.head()

Unnamed: 0,source,poi_code,lat,long,city,state,country,new_name,new_poi_type,new_address,new_pin_code,new_brand
0,42930,1,13.049754,77.492895,30417,30417,42930,1,465,40267,34977,41849
1,42930,1,12.939213,77.694701,30417,30417,42930,1,2544,40267,34977,41849
2,42930,1,12.940448,77.693816,30417,30417,42930,46,20529,40267,34977,41849
3,42930,1,12.93945,77.695305,30417,30417,42930,1,20529,40267,34977,41849
4,42930,1,12.94066,77.693607,30417,30417,42930,12944,37,40267,34977,41849


## Creating a varience Threshold object 

In [91]:
var_thres=VarianceThreshold(threshold=0)
var_thres.fit(df)

VarianceThreshold(threshold=0)

## Finding columns having 0 varience

In [93]:
constant_columns = [column for column in df.columns
                    if column not in df.columns[var_thres.get_support()]]
for column in constant_columns:
    print(column)

source
poi_code
country


## Dropping constant columns

In [94]:
df.drop(constant_columns,axis=1,inplace=True)
df.shape

(42930, 9)

In [95]:
df.head()

Unnamed: 0,lat,long,city,state,new_name,new_poi_type,new_address,new_pin_code,new_brand
0,13.049754,77.492895,30417,30417,1,465,40267,34977,41849
1,12.939213,77.694701,30417,30417,1,2544,40267,34977,41849
2,12.940448,77.693816,30417,30417,46,20529,40267,34977,41849
3,12.93945,77.695305,30417,30417,1,20529,40267,34977,41849
4,12.94066,77.693607,30417,30417,12944,37,40267,34977,41849
