In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Dataset Overview, Visualization & California Map**

**Dataset**

Each row describes the houses in one California district, together with summary statistics about them taken from the 1990 census. The dataset may require some preprocessing before prediction algorithms can be applied.

The features are as follows: longitude, latitude, housing_median_age, total_rooms, total_bedrooms, population, households, median_income, median_house_value, and ocean_proximity.

**Tasks**

1. Check and Handle Missing Data
2. Visualization using Matplotlib and Seaborn
3. Handling Categorical Data - Ocean Proximity
4. Plotting 300 House Locations on California Map

### **Required Libraries**

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import matplotlib as mlt
from mpl_toolkits.basemap import Basemap
from PIL import Image
from pylab import rcParams

### **Read Dataset**

In [None]:
# Load the housing CSV into `data` and preview its first rows.
housing_csv = '../input/california-housing-prices/housing.csv'
data = pd.read_csv(housing_csv)
data.head()

In [None]:
# Show the last rows of the dataset (the counterpart of the head() preview).
data.tail()

### **Dataset Overview**

In [None]:
# Record the dataset dimensions; `rows` is read again by the missing-value cell.
rows = data.shape[0]
cols = data.shape[1]
print("Rows:", rows, "Cols:", cols)

In [None]:
# Print the column names, non-null counts and dtypes.
data.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
data.describe()

### **Checking for Missing Values**

In [None]:
# Percentage of rows that contain at least one missing value.
# Counting rows (rather than dividing the total number of missing cells by the
# row count) keeps the figure a true percentage, bounded by 100%, even when
# several columns have gaps. len(data) is used so the cell does not depend on
# a `rows` variable left over from an earlier cell.
miss = int(data.isnull().any(axis=1).sum())
print("Missing Data:", round((miss / len(data)) * 100, 3), "%")

In [None]:
# Missing-value count per column.
# Total Bedrooms has missing values
data.isna().sum()

### **Dealing with Missing Values**

In [None]:
# Forward fill: every missing value takes the last valid value above it in
# the same column. DataFrame.ffill() is used because the `method=` argument
# of fillna() is deprecated in pandas.
# Forward fill is only one of many possible ways to handle the gaps.
data = data.ffill()
data.isnull().sum()

### **Visualization using Matplotlib and Seaborn**

In [None]:
# Apply seaborn's "dark" style to the plots drawn in the following cells.
sns.set_theme(style="dark")

In [None]:
# Bar chart with one bar per ocean_proximity category, showing its row count.
sns.countplot(
    data=data,
    x="ocean_proximity",
    palette="prism",
)
plt.show()

In [None]:
# Horizontal box plot of median_house_value.
sns.boxplot(
    data=data,
    x="median_house_value",
    color="#E11439",
)
plt.show()

In [None]:
# 40-bin histogram of median_house_value, drawn through the explicit Axes API.
fig, ax = plt.subplots()
ax.hist(data['median_house_value'], bins=40, color='#E11439')
ax.set_xlabel('Median Price of Houses in a block in $')
ax.set_ylabel('Number of Houses')
ax.set_title('Average Distribution of Median Price of Housing')
plt.show()

### **Handling Categorical Data - Ocean Proximity**

In [None]:
# Row count per ocean_proximity category (still the original string labels here).
print("Ocean Proximity Count")
proximity_counts = data['ocean_proximity'].value_counts()
proximity_counts

In [None]:
# Replace the ocean_proximity labels with integer codes from LabelEncoder,
# printing the distinct values before and after the transformation.
proximity = data['ocean_proximity']
print("Before Encoding:", proximity.unique())

Encoder = LabelEncoder()
data['ocean_proximity'] = Encoder.fit_transform(proximity)

print("After Encoding:", data['ocean_proximity'].unique())

In [None]:
# Row count per ocean_proximity value, now that the column holds integer codes.
print("Ocean Proximity Count")
encoded_counts = data['ocean_proximity'].value_counts()
encoded_counts

### **Plotting 300 House Locations on California Map**
Plotting all 20640 points on one map would make it crowded and hard to read, so a random sample of 300 points is used instead. Feel free to apply the same approach to the entire dataset.

In [None]:
#Sample of 300 Houses
# A fixed random_state makes the sample, and therefore the map drawn from it,
# identical on every Restart & Run All.
df = data.sample(n=300, random_state=42)
r, c = df.shape
print("R:", r, "C:", c)

In [None]:
#Only Latitude and Longitude Features are required
# Select the columns by name instead of by position so the cell does not
# depend on the column order of the CSV, and take a copy so later cells can
# add columns without writing into a view of the sampled frame.
df = df[['longitude', 'latitude']].copy()
print(*df.columns)

In [None]:
#Set Upper and Lower Limit of Latitude and Longitude
#Use OpenStreetMap Export Option
llat=32.361
ulat=42.261

llon=-125.046
ulon=-114.368

In [None]:
#Retrieve datapoints in the specified area
df=df[(df['longitude']>llon) & (df['longitude']<ulon) & 
      (df['latitude']>llat) & (df['latitude']<ulat)]
print("Number of final datapoints:",len(df))

In [None]:
#Plot the Map
my_map=Basemap(projection='merc',
            resolution = 'l', area_thresh = 1000.0,
            llcrnrlon=llon, llcrnrlat=llat,urcrnrlon=ulon, urcrnrlat=ulat)

my_map.drawcoastlines()
my_map.drawcountries()
my_map.drawlsmask(land_color='green', ocean_color='#023B56')
xs,ys = my_map(np.asarray(df.longitude), np.asarray(df.latitude))
df['x']= xs.tolist()
df['y'] =ys.tolist()

for index,row in df.iterrows():
   my_map.plot(row.x,row.y,markerfacecolor='black',
               markeredgecolor='white',marker='o',markersize=10)
plt.title("House Locations in California (N=300)")

#Download the Map
plt.savefig("ExampleMap.png", dpi=300)

In [None]:
#Display Downloaded Image
from IPython.display import Image
Image(filename='./ExampleMap.png')

### **Future Work**

Apply Machine Learning Techniques to predict median_house_value

### **Further Reading** 
* [How to Plot Data on a World Map in Python? - Medium.com](https://medium.com/analytics-vidhya/how-to-plot-data-on-a-world-map-in-python-25cf9733c3dd)
* [3 Easy Ways to Handle Categorical Data in Python - Medium.com](https://medium.com/analytics-vidhya/3-easy-ways-to-handle-categorical-data-python-b43fbd81e227)
* [Another Example: Plotting African Capitals on Map - Kaggle Notebook](https://www.kaggle.com/athisha/plotting-african-capitals-on-map)