In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set(rc={'figure.figsize': [10, 10]}, font_scale=1.3)

In [2]:
zip_codes = pd.read_csv('zip_codes.csv')

In [3]:
zip_codes.head(10)

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation
0,99546,* 525,2016,2016,ALEUTIANS WEST,Low Income Area/HPSA
1,99551,* 104,2050,2050,BETHEL,Low Income Area/HPSA
2,99553,"* 1,027",2013,2013,ALEUTIANS EAST,Low Income Area/HPSA
3,99554,"* 1,439",2270,2270,WADE HAMPTON,Low Income Area/HPSA
4,99555,* 219,2070,2070,DILLINGHAM,Low Income Area/HPSA
5,99720,* 32,2290,2290,YUKON-KOYUKUK,Low Income Area/HPSA
6,99721,* 324,2185,2185,NORTH SLOPE,Low Income Area/HPSA
7,99556,"* 2,512",2122,2122,KENAI PENINSULA,Low Income Area
8,99501,"* 17,943",2020,2020,ANCHORAGE,Low Income Area
9,99503,"* 14,497",2020,2020,ANCHORAGE,Low Income Area


In [4]:
zip_codes['2010 Census Population'] = zip_codes['2010 Census Population'].apply(lambda x : x.split(' ')[-1])

In [5]:
zip_codes['2010 Census Population'] = zip_codes['2010 Census Population'].apply(lambda x : x.replace(',', ''))

In [6]:
pd.to_numeric(zip_codes['2010 Census Population'])

0       525
1       104
2      1027
3      1439
4       219
       ... 
995    7926
996    2651
997     898
998    1917
999     126
Name: 2010 Census Population, Length: 1000, dtype: int64

In [7]:
zip_codes.head(10)

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation
0,99546,525,2016,2016,ALEUTIANS WEST,Low Income Area/HPSA
1,99551,104,2050,2050,BETHEL,Low Income Area/HPSA
2,99553,1027,2013,2013,ALEUTIANS EAST,Low Income Area/HPSA
3,99554,1439,2270,2270,WADE HAMPTON,Low Income Area/HPSA
4,99555,219,2070,2070,DILLINGHAM,Low Income Area/HPSA
5,99720,32,2290,2290,YUKON-KOYUKUK,Low Income Area/HPSA
6,99721,324,2185,2185,NORTH SLOPE,Low Income Area/HPSA
7,99556,2512,2122,2122,KENAI PENINSULA,Low Income Area
8,99501,17943,2020,2020,ANCHORAGE,Low Income Area
9,99503,14497,2020,2020,ANCHORAGE,Low Income Area


In [8]:
zip_codes.tail(10)

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation
990,72080,613,5141,4700,VAN BUREN,Low Income Area/HPSA
991,71949,1836,5051,4250,GARLAND,Low Income Area
992,72350,1219,5093,4460,MISSISSIPPI,Low Income Area
993,72105,10,5059,4290,HOT SPRING,Low Income Area/HPSA
994,72401,53770,5031,4150,CRAIGHEAD,Low Income Area
995,72081,7926,5145,4720,WHITE,Low Income Area
996,71749,2651,5139,4690,UNION,Low Income Area/HPSA
997,72351,898,5093,4460,MISSISSIPPI,Low Income Area
998,72082,1917,5145,4720,WHITE,Low Income Area
999,72083,126,5085,4420,LONOKE,HPSA


## Qualitative (Categorical) Data :_
#### Nominal : Country Name (won't use my_dummy as it's too big) 
#### Ordinal : Zip Code Designation (use map)

## Quantitative (Numerical) Data:_
#### Discrete   : ZIP, FIPSCode, StCtyCode, Population 
#### Continuous :

In [9]:
zip_codes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   ZIP                     1000 non-null   int64 
 1   2010 Census Population  1000 non-null   object
 2   FIPS Code               1000 non-null   int64 
 3   StCtyCode               1000 non-null   int64 
 4   County Name             1000 non-null   object
 5   Zip Code Designation    1000 non-null   object
dtypes: int64(3), object(3)
memory usage: 47.0+ KB


In [10]:
# NO NAN VAlues

In [11]:
zip_codes.describe()

Unnamed: 0,ZIP,FIPS Code,StCtyCode
count,1000.0,1000.0,1000.0
mean,55677.32,2280.218,2262.129
std,24711.235947,1695.00043,1293.851281
min,35005.0,1001.0,1000.0
25%,35959.75,1069.0,1340.0
50%,36617.5,1117.0,1580.0
75%,72523.25,5003.0,4010.0
max,99950.0,5149.0,4740.0


In [12]:
zip_codes['Zip Code Designation'].unique()

array(['Low Income Area/HPSA', 'Low Income Area', 'HPSA'], dtype=object)

In [13]:
zip_codes['Zip Code Designation'].value_counts()

Low Income Area/HPSA    736
Low Income Area         156
HPSA                    108
Name: Zip Code Designation, dtype: int64

In [14]:
zip_codes['County Name'].value_counts()

JEFFERSON        36
MOBILE           34
YUKON-KOYUKUK    24
TUSCALOOSA       22
MONTGOMERY       21
                 ..
SALINE            1
Skagway           1
JUNEAU            1
HAINES            1
SEARCY            1
Name: County Name, Length: 147, dtype: int64

In [15]:
dict_zip={'Low Income Area':1,
      'Low Income Area/HPSA':2,
      'HPSA':3
     }

In [16]:
zip_codes['Zip Code Designation']=zip_codes['Zip Code Designation'].map(dict_zip)

In [17]:
zip_codes.head()

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation
0,99546,525,2016,2016,ALEUTIANS WEST,2
1,99551,104,2050,2050,BETHEL,2
2,99553,1027,2013,2013,ALEUTIANS EAST,2
3,99554,1439,2270,2270,WADE HAMPTON,2
4,99555,219,2070,2070,DILLINGHAM,2


In [18]:
zip_codes=pd.get_dummies(zip_codes,columns=['County Name'],drop_first=True)
# not a good practice

In [19]:
zip_codes

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,Zip Code Designation,County Name_ALEUTIANS WEST,County Name_ANCHORAGE,County Name_ANGOON,County Name_ARKANSAS,County Name_ASHLEY,...,County Name_WALKER,County Name_WASHINGTON,County Name_WHITE,County Name_WILCOX,County Name_WINSTON,County Name_WOODRUFF,County Name_Wrangell City and Borough,County Name_YAKUTAT BOROUGH,County Name_YELL,County Name_YUKON-KOYUKUK
0,99546,525,2016,2016,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,99551,104,2050,2050,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,99553,1027,2013,2013,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,99554,1439,2270,2270,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,99555,219,2070,2070,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,72081,7926,5145,4720,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
996,71749,2651,5139,4690,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,72351,898,5093,4460,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,72082,1917,5145,4720,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
