#### Exploring

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
df = pd.read_csv('immigration_data_sample.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
2027561,4084316.0,2016.0,4.0,209.0,209.0,HHW,20566.0,1.0,HI,20573.0,...,,M,1955.0,7202016,F,,JL,56582670000.0,00782,WT
2171295,4422636.0,2016.0,4.0,582.0,582.0,MCA,20567.0,1.0,TX,20568.0,...,,M,1990.0,10222016,M,,*GA,94362000000.0,XBLNG,B2
589494,1195600.0,2016.0,4.0,148.0,112.0,OGG,20551.0,1.0,FL,20571.0,...,,M,1940.0,7052016,M,,LH,55780470000.0,00464,WT
2631158,5291768.0,2016.0,4.0,297.0,297.0,LOS,20572.0,1.0,CA,20581.0,...,,M,1991.0,10272016,M,,QR,94789700000.0,00739,B2
3032257,985523.0,2016.0,4.0,111.0,111.0,CHM,20550.0,3.0,NY,20553.0,...,,M,1997.0,7042016,F,,,42322570000.0,LAND,WT


In [4]:
df['visapost'].head()

2027561    NaN
2171295    MTR
589494     NaN
2631158    DOH
3032257    NaN
Name: visapost, dtype: object

In [5]:
def sas_to_py_datetime(dt):
    return pd.to_timedelta(dt, unit='D') + pd.Timestamp('1960-1-1')

In [6]:
df['arrdate'] = df['arrdate'].apply(sas_to_py_datetime)

In [7]:
df['depdate'] = df['depdate'].apply(sas_to_py_datetime)

In [8]:
series_null_columns = df.isnull().sum(axis=0).sort_values(ascending=False)

In [9]:
series_null_columns = series_null_columns[series_null_columns > 0]

In [10]:
series_null_columns

entdepu     1000
occup        996
insnum       965
visapost     618
gender       141
i94addr       59
depdate       49
matflag       46
entdepd       46
airline       33
fltno          8
dtype: int64

In [11]:
df[~pd.isnull(df['occup'])]['occup']

262355     STU
1429118    PHA
1677722    OTH
2857353    STU
Name: occup, dtype: object

In [12]:
len(df['cicid'].unique())

1000

In [13]:
df['i94port'].head()

2027561    HHW
2171295    MCA
589494     OGG
2631158    LOS
3032257    CHM
Name: i94port, dtype: object

In [14]:
df.columns

Index(['cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94port', 'arrdate',
       'i94mode', 'i94addr', 'depdate', 'i94bir', 'i94visa', 'count',
       'dtadfile', 'visapost', 'occup', 'entdepa', 'entdepd', 'entdepu',
       'matflag', 'biryear', 'dtaddto', 'gender', 'insnum', 'airline',
       'admnum', 'fltno', 'visatype'],
      dtype='object')

In [15]:
np.sum(df['admnum'].isnull())

0

In [16]:
a = str(df.iloc[0]['dtadfile'])
datetime.strptime(a, '%Y%m%d')

datetime.datetime(2016, 4, 22, 0, 0)

In [17]:
def convert_int_date_to_pd(dt):
    return pd.to_datetime(str(dt), format='%Y%m%d')

In [18]:
df['dtadfile'] = df['dtadfile'].apply(convert_int_date_to_pd)

In [19]:
df['entdepa'].unique()

array(['G', 'Z', 'T', 'O', 'P', 'A', 'K', 'U', 'H'], dtype=object)

In [20]:
df['admnum'].head()

2027561    5.658267e+10
2171295    9.436200e+10
589494     5.578047e+10
2631158    9.478970e+10
3032257    4.232257e+10
Name: admnum, dtype: float64

In [21]:
df.shape

(1000, 28)

In [22]:
df[df['i94cit'] ==213.0]

Unnamed: 0,cicid,i94yr,i94mon,i94cit,i94res,i94port,arrdate,i94mode,i94addr,depdate,...,entdepu,matflag,biryear,dtaddto,gender,insnum,airline,admnum,fltno,visatype
2509117,5070551.0,2016.0,4.0,213.0,213.0,LOS,2016-04-27,1.0,CA,2016-08-10,...,,M,1952.0,10262016,M,,QR,94694860000.0,00739,B2
1451442,2929149.0,2016.0,4.0,213.0,213.0,LVG,2016-04-16,1.0,NC,2016-06-29,...,,M,1955.0,10152016,M,,LH,93722980000.0,00428,B2
897673,1840249.0,2016.0,4.0,213.0,213.0,SEA,2016-04-10,1.0,WA,2016-05-15,...,,M,1948.0,10092016,F,,EK,93178960000.0,00227,B2
2614678,5270355.0,2016.0,4.0,213.0,213.0,DAL,2016-04-28,1.0,TX,2016-05-31,...,,M,1974.0,10272016,F,,AA,94769470000.0,00071,B2
1748489,3529728.0,2016.0,4.0,213.0,213.0,SEA,2016-04-19,1.0,WA,2016-06-18,...,,M,1955.0,10182016,M,,EK,93950910000.0,00227,B2
3057502,688605.0,2016.0,4.0,213.0,213.0,EPI,2016-04-04,3.0,WA,2016-04-05,...,,M,1976.0,10032016,M,,,92764350000.0,LAND,B1
2733233,5492918.0,2016.0,4.0,213.0,213.0,PHI,2016-04-29,1.0,PA,2016-05-02,...,,M,1999.0,10282016,M,,LH,94879950000.0,00426,B2
342979,687285.0,2016.0,4.0,213.0,131.0,NEW,2016-04-04,1.0,OH,2016-04-15,...,,M,1974.0,10032016,M,,LH,92747250000.0,00402,B1
1149853,2353538.0,2016.0,4.0,213.0,213.0,ATL,2016-04-13,1.0,NJ,2016-04-18,...,,M,1998.0,10122016,F,,AI,93395870000.0,00191,B2
428793,864073.0,2016.0,4.0,213.0,213.0,SFR,2016-04-05,1.0,CA,2016-04-09,...,,M,1969.0,10042016,M,,KL,92819620000.0,00605,B1


In [23]:
df.isnull().sum(axis=0)

cicid          0
i94yr          0
i94mon         0
i94cit         0
i94res         0
i94port        0
arrdate        0
i94mode        0
i94addr       59
depdate       49
i94bir         0
i94visa        0
count          0
dtadfile       0
visapost     618
occup        996
entdepa        0
entdepd       46
entdepu     1000
matflag       46
biryear        0
dtaddto        0
gender       141
insnum       965
airline       33
admnum         0
fltno          8
visatype       0
dtype: int64

In [24]:
us_cities = pd.read_csv('us-cities-demographics.csv', sep=';')

In [25]:
us_cities.head()

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.6,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402


In [26]:
len(us_cities.columns)

12

In [27]:
us_cities.Race.unique()

array(['Hispanic or Latino', 'White', 'Asian', 'Black or African-American',
       'American Indian and Alaska Native'], dtype=object)

In [60]:
airports = pd.read_csv('airport-codes_csv.csv')

In [29]:
airports.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [61]:
ports_df = pd.read_csv('port_immigration.csv')

In [62]:
ports_df.city = ports_df.city.str.lower()

In [63]:
airports.municipality = airports.municipality.str.lower()

In [64]:
airports_city = ports_df.merge(airports, left_on='city', right_on='municipality')[['ident', 'code', 'city', 'state_code', 'type', 'name',\
                                                                   'elevation_ft', 'gps_code', 'iata_code', 'local_code', 'coordinates']]

In [65]:
airports_city.rename(columns={'code': 'city_code', 'ident': 'airport_id'}, inplace=True)

In [66]:
airports_city = pd.concat([airports_city, airports_city.coordinates.str.split(',', expand=True).rename(columns={0: 'longitude', 1:'latitude'})],\
         axis=1).drop('coordinates', axis=1)

In [67]:
airports_city.head()

Unnamed: 0,airport_id,city_code,city,state_code,type,name,elevation_ft,gps_code,iata_code,local_code,longitude,latitude
0,2OK,ANC,anchorage,AK,heliport,Alaska Regional Hospital Heliport,137.0,2OK,,2OK,-149.82699584960938,61.21189880371094
1,3C3,ANC,anchorage,AK,seaplane_base,Campbell Lake Seaplane Base,20.0,,,3C3,-149.942003,61.133099
2,6AK5,ANC,anchorage,AK,small_airport,Fire Island Airport,55.0,6AK5,,6AK5,-150.16099548339844,61.16830062866211
3,99AA,ANC,anchorage,AK,heliport,Aviator Hotel Anchorage Heliport,123.0,99AA,,99AA,-149.886482,61.218906
4,9AK5,ANC,anchorage,AK,small_airport,Sky Harbor Airport,340.0,9AK5,,9AK5,-149.81900024414062,61.11650085449219


In [30]:
us_cities

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
0,Silver Spring,Maryland,33.8,40601.0,41862.0,82463,1562.0,30908.0,2.60,MD,Hispanic or Latino,25924
1,Quincy,Massachusetts,41.0,44129.0,49500.0,93629,4147.0,32935.0,2.39,MA,White,58723
2,Hoover,Alabama,38.5,38040.0,46799.0,84839,4819.0,8229.0,2.58,AL,Asian,4759
3,Rancho Cucamonga,California,34.5,88127.0,87105.0,175232,5821.0,33878.0,3.18,CA,Black or African-American,24437
4,Newark,New Jersey,34.6,138040.0,143873.0,281913,5829.0,86253.0,2.73,NJ,White,76402
5,Peoria,Illinois,33.1,56229.0,62432.0,118661,6634.0,7517.0,2.40,IL,American Indian and Alaska Native,1343
6,Avondale,Arizona,29.1,38712.0,41971.0,80683,4815.0,8355.0,3.18,AZ,Black or African-American,11592
7,West Covina,California,39.8,51629.0,56860.0,108489,3800.0,37038.0,3.56,CA,Asian,32716
8,O'Fallon,Missouri,36.0,41762.0,43270.0,85032,5783.0,3269.0,2.77,MO,Hispanic or Latino,2583
9,High Point,North Carolina,35.5,51751.0,58077.0,109828,5204.0,16315.0,2.65,NC,Asian,11060


In [31]:
us_cities[us_cities['State Code']=='TX']['City'].unique()

array(['Laredo', 'Flower Mound', 'Corpus Christi', 'Bryan', 'Killeen',
       'El Paso', 'Richardson', 'Waco', 'Garland', 'Wichita Falls',
       'Amarillo', 'Fort Worth', 'Beaumont', 'Grand Prairie', 'Mesquite',
       'Edinburg', 'Victoria', 'Abilene', 'Houston', 'New Braunfels',
       'McKinney', 'Mission', 'McAllen', 'Harlingen', 'Missouri City',
       'Temple', 'Atascocita', 'Frisco', 'College Station', 'Lewisville',
       'Dallas', 'Austin', 'Odessa', 'Round Rock', 'Denton', 'Pearland',
       'Carrollton', 'Cedar Park', 'San Angelo', 'North Richland Hills',
       'Brownsville', 'Tyler', 'San Antonio', 'Plano', 'Midland',
       'The Woodlands', 'Longview', 'Lubbock', 'Conroe', 'Allen',
       'Pasadena', 'Arlington', 'Irving', 'Bay', 'Sugar Land', 'Pharr',
       'League City'], dtype=object)

In [32]:
np.sum(us_cities.City.str.contains('County'))

15

In [33]:
us_cities[us_cities.City.str.contains('County') | us_cities.City.str.contains('county')]

Unnamed: 0,City,State,Median Age,Male Population,Female Population,Total Population,Number of Veterans,Foreign-born,Average Household Size,State Code,Race,Count
73,Lexington-Fayette county,Kentucky,34.4,154032.0,160456.0,314488,16661.0,28613.0,2.34,KY,American Indian and Alaska Native,3187
379,Lexington-Fayette county,Kentucky,34.4,154032.0,160456.0,314488,16661.0,28613.0,2.34,KY,Hispanic or Latino,21766
588,Louisville/Jefferson County metro government,Kentucky,37.5,298451.0,316938.0,615389,39364.0,37875.0,2.45,KY,White,456451
627,Louisville/Jefferson County metro government,Kentucky,37.5,298451.0,316938.0,615389,39364.0,37875.0,2.45,KY,Black or African-American,151256
671,Lexington-Fayette county,Kentucky,34.4,154032.0,160456.0,314488,16661.0,28613.0,2.34,KY,Asian,14066
1078,Athens-Clarke County unified government,Georgia,26.5,57415.0,65148.0,122563,3953.0,12868.0,2.44,GA,Hispanic or Latino,13159
1194,Augusta-Richmond County consolidated government,Georgia,33.7,94662.0,101917.0,196579,19085.0,7915.0,2.67,GA,American Indian and Alaska Native,1667
1249,Lexington-Fayette county,Kentucky,34.4,154032.0,160456.0,314488,16661.0,28613.0,2.34,KY,White,249339
1278,Augusta-Richmond County consolidated government,Georgia,33.7,94662.0,101917.0,196579,19085.0,7915.0,2.67,GA,White,77940
1598,Athens-Clarke County unified government,Georgia,26.5,57415.0,65148.0,122563,3953.0,12868.0,2.44,GA,White,79931


In [34]:
fname = '../../data2/GlobalLandTemperaturesByCity.csv'
df = pd.read_csv(fname)

In [35]:
df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [36]:
us_temperatures = df[df.Country=='United States']

In [37]:
us_temperatures.dt = pd.to_datetime(us_temperatures.dt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [38]:
us_temperatures.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 687289 entries, 47555 to 8439246
Data columns (total 7 columns):
dt                               687289 non-null datetime64[ns]
AverageTemperature               661524 non-null float64
AverageTemperatureUncertainty    661524 non-null float64
City                             687289 non-null object
Country                          687289 non-null object
Latitude                         687289 non-null object
Longitude                        687289 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 41.9+ MB


In [39]:
us_temperatures.sort_values(by='dt', ascending=False)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
8439246,2013-09-01,17.408,1.048,Yonkers,United States,40.99N,74.56W
7666413,2013-09-01,18.450,1.057,Toledo,United States,40.99N,83.08W
2645560,2013-09-01,27.986,1.334,Grand Prairie,United States,32.95N,96.70W
6583163,2013-09-01,21.348,0.908,Salinas,United States,36.17N,121.33W
4786153,2013-09-01,27.681,1.245,Metairie,United States,29.74N,90.46W
1356347,2013-09-01,27.786,1.206,Cape Coral,United States,26.52N,82.39W
6318079,2013-09-01,19.211,0.874,Reno,United States,39.38N,120.69W
3828652,2013-09-01,27.363,0.979,Killeen,United States,31.35N,98.01W
1982817,2013-09-01,17.503,1.130,Detroit,United States,42.59N,82.91W
2375136,2013-09-01,19.016,1.093,Fort Wayne,United States,40.99N,85.21W
