In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
df = pd.DataFrame({
    'Part': ['Monitor', 'CPU', None, 'Mouse', None, 'Extensions', 'Table', 'Chair', 'Wifi'],
    'Feature': ['LED', 'i7', 'RGB', 'Wireless', 'Zebronics', None, 'Urban Clap', 'Apex Chairs', 'Airtel'],
    'Price': [12000, 3500, None, 1200, None, 250, 7000, 12000, 799]
})
df

Unnamed: 0,Part,Feature,Price
0,Monitor,LED,12000.0
1,CPU,i7,3500.0
2,,RGB,
3,Mouse,Wireless,1200.0
4,,Zebronics,
5,Extensions,,250.0
6,Table,Urban Clap,7000.0
7,Chair,Apex Chairs,12000.0
8,Wifi,Airtel,799.0


In [3]:
df.isna() # where and all we have true there we have Null values

Unnamed: 0,Part,Feature,Price
0,False,False,False
1,False,False,False
2,True,False,True
3,False,False,False
4,True,False,True
5,False,True,False
6,False,False,False
7,False,False,False
8,False,False,False


In [5]:
df.isna().sum()

Part       2
Feature    1
Price      2
dtype: int64

In [6]:
df.dropna()

Unnamed: 0,Part,Feature,Price
0,Monitor,LED,12000.0
1,CPU,i7,3500.0
3,Mouse,Wireless,1200.0
6,Table,Urban Clap,7000.0
7,Chair,Apex Chairs,12000.0
8,Wifi,Airtel,799.0


In [8]:
df.dtypes

Part        object
Feature     object
Price      float64
dtype: object

In [9]:
df.replace(to_replace="12000.0",value="13,000")
df

Unnamed: 0,Part,Feature,Price
0,Monitor,LED,12000.0
1,CPU,i7,3500.0
2,,RGB,
3,Mouse,Wireless,1200.0
4,,Zebronics,
5,Extensions,,250.0
6,Table,Urban Clap,7000.0
7,Chair,Apex Chairs,12000.0
8,Wifi,Airtel,799.0


# **About this Dataset**
### Background
A building permit is an official approval document issued by a governmental agency that allows you or your contractor to proceed with a construction or remodeling project on one's property. For more details go to https://www.thespruce.com/what-is-a-building-permit-1398344. Each city or county has its own office related to buildings, that can do multiple functions like issuing permits, inspecting buildings to enforce safety measures, modifying rules to accommodate needs of the growing population etc. For the city of San Francisco, permit issuing is taken care by www.sfdbi.org/

Why is this important: In the recent past, several posts and blogs highlighted that main discrepancy in demand and supply in real estate industry is due to delays in issuing building permits. Refer:
https://www.trulia.com/blog/trends/elasticity-2016/ - Introduces concept of elasticity, and nice scatterplot of various cities. A good data story!
https://biv.com/article/2014/11/city-building-permit-delays-costing-developers-tim

### Content
The data was downloaded for the dates ranging from Jan 1st, 2013 to Feb 25th, 2018 using the filter in San Francisco open data portal. This is the exact link: https://data.sfgov.org/Housing-and-Buildings/Building-Permits/i98e-djp9/data
There are 43 columns and close to 200k records in the downloaded version (kept here). Description is separately uploaded as dictionary.

In [11]:
# read in all our data
sf_permits = pd.read_csv("Building_Permits.csv",low_memory=False)

# set seed for reproducibility
np.random.seed(0) 

In [18]:
sf_permits.head()

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
0,201505065519,4,sign - erect,05/06/2015,326,23,140,,Ellis,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.785719256680785, -122.40852313194863)",1380611233945
1,201604195146,4,sign - erect,04/19/2016,306,7,440,,Geary,St,...,3.0,constr type 3,,,,3.0,Tenderloin,94102.0,"(37.78733980600732, -122.41063199757738)",1420164406718
2,201605278609,3,additions alterations or repairs,05/27/2016,595,203,1647,,Pacific,Av,...,1.0,constr type 1,1.0,constr type 1,,3.0,Russian Hill,94109.0,"(37.7946573324287, -122.42232562979227)",1424856504716
3,201611072166,8,otc alterations permit,11/07/2016,156,11,1230,,Pacific,Av,...,5.0,wood frame (5),5.0,wood frame (5),,3.0,Nob Hill,94109.0,"(37.79595867909168, -122.41557405519474)",1443574295566
4,201611283529,6,demolitions,11/28/2016,342,1,950,,Market,St,...,3.0,constr type 3,,,,6.0,Tenderloin,94102.0,"(37.78315261897309, -122.40950883997789)",144548169992


# How many missing data points do we have?

* What percentage of the values in the dataset are missing?
* Your answer should be a number between 0 and 100.
* (If 1/4 of the values in the dataset are missing, the answer is 25.)

In [13]:
sf_permits.shape

(198900, 43)

In [17]:
total_cells = np.product(sf_permits.shape)
total_null = sf_permits.isnull().sum()
percent_missing = (total_null.sum()/total_cells)* 100
percent_missing

26.26002315058403

In [19]:
total_null

Permit Number                                  0
Permit Type                                    0
Permit Type Definition                         0
Permit Creation Date                           0
Block                                          0
Lot                                            0
Street Number                                  0
Street Number Suffix                      196684
Street Name                                    0
Street Suffix                               2768
Unit                                      169421
Unit Suffix                               196939
Description                                  290
Current Status                                 0
Current Status Date                            0
Filed Date                                     0
Issued Date                                14940
Completed Date                            101709
First Construction Document Date           14946
Structural Notification                   191978
Number of Existing S

###  Figure out why the data is missing

* Look the Columns "Street Number Suffix" and "Zipcode"  from the San Francisco Building Permits dataset. Both of these contain missing values.
* Which if either are missing because they don't exist?
* Which, if either, are missing because they weren't recorded?

In [22]:
sf_permits[['Street Number Suffix','Zipcode','Street Number']].head()

Unnamed: 0,Street Number Suffix,Zipcode,Street Number
0,,94102.0,140
1,,94102.0,440
2,,94109.0,1647
3,,94109.0,1230
4,,94102.0,950


In [23]:
sf_permits[['Street Number Suffix','Zipcode','Street Number']].tail()


## If a value in the "Street Number Suffix" column is missing, it is likely because it does not exist. If a value in the "Zipcode" column is missing, it was not recorded.

Unnamed: 0,Street Number Suffix,Zipcode,Street Number
198895,,,1228
198896,,,580
198897,,,1568
198898,,,795
198899,,,838


In [24]:
sf_permits.dropna() # Drop missing values: rows

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID


## Drop missing values: columns
Now try removing all the columns with empty values.
* Create a new DataFrame called sf_permits_with_na_dropped that has all of the columns with empty values removed.

In [25]:
sf_permits_with_na_dropped= sf_permits.dropna(axis=1) # we are Droping all the Null Values by Colums wise 

In [28]:
sf_permits_with_na_dropped.shape

(198900, 12)

How many columns were removed from the original sf_permits DataFrame? Use this number to set the value of the dropped_columns variable below.

In [29]:
dropped_columns = sf_permits.shape[1]-sf_permits_with_na_dropped.shape[1]
dropped_columns

31

###  Fill in missing values automatically

+ Try replacing all the `NaN`'s in the sf_permits data with the one that comes directly after it and then replacing any remaining `NaN's` with 0.
+ Set the result  to a new DataFrame sf_permits_with_na_imputed

In [30]:
sf_permits_with_na_imputed =sf_permits.fillna(method ='bfill',axis=0).fillna(0)

In [31]:
sf_permits_with_na_imputed.head()

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
0,201505065519,4,sign - erect,05/06/2015,326,23,140,A,Ellis,St,...,3.0,constr type 3,1.0,constr type 1,Y,3.0,Tenderloin,94102.0,"(37.785719256680785, -122.40852313194863)",1380611233945
1,201604195146,4,sign - erect,04/19/2016,306,7,440,A,Geary,St,...,3.0,constr type 3,1.0,constr type 1,Y,3.0,Tenderloin,94102.0,"(37.78733980600732, -122.41063199757738)",1420164406718
2,201605278609,3,additions alterations or repairs,05/27/2016,595,203,1647,A,Pacific,Av,...,1.0,constr type 1,1.0,constr type 1,Y,3.0,Russian Hill,94109.0,"(37.7946573324287, -122.42232562979227)",1424856504716
3,201611072166,8,otc alterations permit,11/07/2016,156,11,1230,A,Pacific,Av,...,5.0,wood frame (5),5.0,wood frame (5),Y,3.0,Nob Hill,94109.0,"(37.79595867909168, -122.41557405519474)",1443574295566
4,201611283529,6,demolitions,11/28/2016,342,1,950,A,Market,St,...,3.0,constr type 3,1.0,constr type 1,Y,6.0,Tenderloin,94102.0,"(37.78315261897309, -122.40950883997789)",144548169992


In [33]:
sf_permits_with_na_imputed.tail()

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Existing Construction Type,Existing Construction Type Description,Proposed Construction Type,Proposed Construction Type Description,Site Permit,Supervisor District,Neighborhoods - Analysis Boundaries,Zipcode,Location,Record ID
198895,M862628,8,otc alterations permit,12/05/2017,113,017A,1228,0,Montgomery,St,...,5.0,wood frame (5),5.0,wood frame (5),0,0.0,0,0.0,0,1489337276729
198896,201712055595,8,otc alterations permit,12/05/2017,271,014,580,0,Bush,St,...,5.0,wood frame (5),5.0,wood frame (5),0,0.0,0,0.0,0,1489462354993
198897,M863507,8,otc alterations permit,12/06/2017,4318,019,1568,0,Indiana,St,...,0.0,0,0.0,0,0,0.0,0,0.0,0,1489539379952
198898,M863747,8,otc alterations permit,12/06/2017,298,029,795,0,Sutter,St,...,0.0,0,0.0,0,0,0.0,0,0.0,0,1489608233656
198899,M864287,8,otc alterations permit,12/07/2017,160,006,838,0,Pacific,Av,...,0.0,0,0.0,0,0,0.0,0,0.0,0,1489796283803


In [32]:
sf_permits_with_na_imputed.isna().sum()

Permit Number                             0
Permit Type                               0
Permit Type Definition                    0
Permit Creation Date                      0
Block                                     0
Lot                                       0
Street Number                             0
Street Number Suffix                      0
Street Name                               0
Street Suffix                             0
Unit                                      0
Unit Suffix                               0
Description                               0
Current Status                            0
Current Status Date                       0
Filed Date                                0
Issued Date                               0
Completed Date                            0
First Construction Document Date          0
Structural Notification                   0
Number of Existing Stories                0
Number of Proposed Stories                0
Voluntary Soft-Story Retrofit   

In [34]:
sf_permits_with_na_imputed.to_csv("Final_Building_Permits.csv")