In [1]:
# all imports
import re
import os
import pandas as pd
import numpy as np
import configparser

### Read paths from the config file

In [2]:
# Load the project configuration. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it actually parsed, so an empty
# result means 'config.cfg' was not readable from the working directory.
config = configparser.ConfigParser()
if not config.read('config.cfg'):
    raise FileNotFoundError("config.cfg could not be read from the working directory")

# input locations, all taken from the [DATA] section
i94_label_path = config.get('DATA', 'LABEL_DESCRIPTION')
us_cities_path = config.get('DATA', 'US_CITIES')
temp_path = config.get('DATA', 'TEMPERATURE')
# NOTE(review): the option is spelled 'AIPORTS' (not 'AIRPORTS'); it has to
# match the key used in config.cfg, so it is left unchanged here.
airports_path = config.get('DATA', 'AIPORTS')

# directory the CSV files produced by this notebook are written to
output_dir = config.get('OUTPUT', 'OUTPUT_DIR')

### Create empty dictionaries for country codes (`citi_code`), ports, and states

In [3]:
# Column-oriented accumulators, one per lookup table: every key is a column
# name and every value a list that is filled while the labels file is parsed.
citi_code = dict(id=[], country=[])

port = dict(code=[], city=[], state_code=[])

state = dict(code=[], name=[])

### Regex patterns

In [4]:
# All patterns are written as raw strings: sequences such as \s, \d and \.
# are regex escapes, not Python string escapes, and a non-raw literal
# containing them triggers an "invalid escape sequence" warning.

# citi_code pattern string (verbose syntax: whitespace and '#' comments inside
# the pattern are ignored by the regex engine)
pattern_str = r"""
                \s*         # optional leading whitespace
                ([\d]+)     # group 1: one or more digits
                \s+         # one or more whitespace characters
                =           # literal '='
                \s+         # one or more whitespace characters
                '(.*)'      # group 2: everything between the outer quotes
              """

# citi code regex pattern
pattern = re.compile(pattern_str, re.VERBOSE)

# port regex pattern
# group 1: quoted port code; groups 2 and 3: the quoted description split at
# its last comma (the first part is greedy)
port_pattern = re.compile(r"\s*'(\S+)'\s+=\s+'([\S\s#]+),\s*([\S\s#]+)'")

# state regex pattern
# group 1: alphanumeric code, group 2: name (letters, dots and whitespace);
# note that no whitespace is allowed around '=' here
state_pattern = re.compile(r"\s*'([a-zA-Z0-9]+)'='([a-zA-Z\.\s]+)'")

### Read the labels file and fill dictionaries

In [5]:
# The labels file is parsed purely by position: each lookup table occupies a
# fixed, hard-coded range of 0-based line numbers. The three ranges do not
# overlap, so at most one branch applies to a line.
with open(i94_label_path, 'r') as fp:
    for i, line in enumerate(fp):
        if 8 < i < 245:
            # country-code section: every line is expected to match
            match = pattern.search(line)
            if match is None:
                raise ValueError(
                    "line {} of the labels file does not match the country-code pattern: {!r}"
                    .format(i, line))
            citi_code['id'].append(match.group(1))
            citi_code['country'].append(match.group(2))
        elif 301 < i < 893:
            # port section: lines that do not have the "'code' = 'city, state'"
            # shape are kept as an all-None placeholder row (best effort)
            match = port_pattern.search(line)
            if match is None:
                port['code'].append(None)
                port['city'].append(None)
                port['state_code'].append(None)
            else:
                port['code'].append(match.group(1))
                port['city'].append(match.group(2))
                port['state_code'].append(match.group(3))
        elif 980 < i < 1036:
            # state section: every line is expected to match
            match = state_pattern.search(line)
            if match is None:
                raise ValueError(
                    "line {} of the labels file does not match the state pattern: {!r}"
                    .format(i, line))
            state['code'].append(match.group(1))
            state['name'].append(match.group(2))

### Create dataframes from dictionaries

In [6]:
# citi code dataframe
# country lookup table, indexed by the parsed 'id' column
citi_code_df = pd.DataFrame(citi_code).set_index('id')

citi_code_df.head()

Unnamed: 0_level_0,country
id,Unnamed: 1_level_1
582,"MEXICO Air Sea, and Not Reported (I-94, no lan..."
236,AFGHANISTAN
101,ALBANIA
316,ALGERIA
102,ANDORRA


In [7]:
# airport dataframe
# Port lookup table indexed by port code: the captured state part is stripped
# of surrounding whitespace, and rows whose city and state are both missing
# (the placeholders for unparsable lines) are removed.
port_df = (
    pd.DataFrame(port)
    .set_index('code')
    .assign(state_code=lambda ports: ports['state_code'].str.strip())
    .dropna(how='all')
)
port_df.head()

Unnamed: 0_level_0,city,state_code
code,Unnamed: 1_level_1,Unnamed: 2_level_1
ALC,ALCAN,AK
ANC,ANCHORAGE,AK
BAR,BAKER AAF - BAKER ISLAND,AK
DAC,DALTONS CACHE,AK
PIZ,DEW STATION PT LAY DEW,AK


In [8]:
# State strings that carry extra text after the state; for these only the
# first two characters are kept.
# NOTE(review): 'WASHINGTON #INTL' is in the list too, so it is cut to 'WA'.
values = [
    'AR (BPS)', 'CA (BPS)', 'CO #ARPT', 'FL #ARPT',
    'LA (BPS)', 'ME (BPS)', 'MT (BPS)', 'NM (BPS)',
    'SC #ARPT', 'TX (BPS)', 'VA #ARPT', 'VT (I-91)',
    'VT (RT. 5)', 'VT (BP - SECTOR HQ)', 'WASHINGTON #INTL', 'WA (BPS)',
]

# clean state_code
raw_state = port_df.state_code
is_annotated = raw_state.isin(values)
is_two_letter = raw_state.str.len() == 2

# annotated entry -> first two characters; plain two-character code -> kept;
# anything else -> NaN
plain_or_missing = np.where(is_two_letter, raw_state, np.nan)
temp = np.where(is_annotated, raw_state.str[:2], plain_or_missing)

# 'MX' is treated as missing as well
us_state_codes = np.where(temp == 'MX', np.nan, temp)
port_df['state_code'] = us_state_codes

# only ports with both a city and a usable state code remain
port_df = port_df.dropna(how='any')

In [9]:
# states dataframe
# state lookup table, indexed by the parsed 'code' column
states = pd.DataFrame(state).set_index('code')
states.head()

Unnamed: 0_level_0,name
code,Unnamed: 1_level_1
AL,ALABAMA
AK,ALASKA
AZ,ARIZONA
AR,ARKANSAS
CA,CALIFORNIA


In [10]:
# visa category dataframe
# Visa category lookup: the numeric id is the index, the label is the only
# column.
visa_category = pd.DataFrame(
    {'category': ['Business', 'Pleasure', 'Student']},
    index=pd.Index([1, 2, 3], name='id'),
)
visa_category

Unnamed: 0_level_0,category
id,Unnamed: 1_level_1
1,Business
2,Pleasure
3,Student


In [11]:
# travel mode dataframe

# Travel mode lookup: the numeric id is the index, the label is the only
# column.
# NOTE(review): 'land' is lower-case while the other labels are capitalised;
# kept as-is because the value ends up in travel_mode.csv.
travel_mode = pd.DataFrame(
    {'mode': ['Air', 'Sea', 'land', 'Not reported']},
    index=pd.Index([1, 2, 3, 9], name='id'),
)

travel_mode

Unnamed: 0_level_0,mode
id,Unnamed: 1_level_1
1,Air
2,Sea
3,land
9,Not reported


### Save all dataframes

In [12]:
# output paths
# target files of the five lookup tables, all located in output_dir
(citi_code_path,
 port_path,
 states_path,
 visa_category_path,
 travel_mode_path) = (
    os.path.join(output_dir, csv_name)
    for csv_name in (
        'country_code.csv',
        'port_immigration.csv',
        'state_code.csv',
        'visa_category.csv',
        'travel_mode.csv',
    )
)

In [13]:
# Write each lookup table to its CSV file; the index (id / code) is written
# as the first column.
for lookup_frame, csv_path in (
    (citi_code_df, citi_code_path),
    (port_df, port_path),
    (states, states_path),
    (visa_category, visa_category_path),
    (travel_mode, travel_mode_path),
):
    lookup_frame.to_csv(csv_path)

### Create US airports dataframe

In [14]:
# raw airport records, read from the path configured as airports_path
airports = pd.read_csv(airports_path)

In [15]:
# preview the first rows of the raw airport data
airports.head()

Unnamed: 0,ident,type,name,elevation_ft,continent,iso_country,iso_region,municipality,gps_code,iata_code,local_code,coordinates
0,00A,heliport,Total Rf Heliport,11.0,,US,US-PA,Bensalem,00A,,00A,"-74.93360137939453, 40.07080078125"
1,00AA,small_airport,Aero B Ranch Airport,3435.0,,US,US-KS,Leoti,00AA,,00AA,"-101.473911, 38.704022"
2,00AK,small_airport,Lowell Field,450.0,,US,US-AK,Anchor Point,00AK,,00AK,"-151.695999146, 59.94919968"
3,00AL,small_airport,Epps Airpark,820.0,,US,US-AL,Harvest,00AL,,00AL,"-86.77030181884766, 34.86479949951172"
4,00AR,closed,Newport Hospital & Clinic Heliport,237.0,,US,US-AR,Newport,,,,"-91.254898, 35.6087"


In [16]:
# Build the table of US international airports and link each airport to an
# I-94 port code.

# US rows whose type is one of the three regular airport sizes
in_us = airports['iso_country'].str.lower() == 'us'
is_airport = airports['type'].isin(['small_airport', 'medium_airport', 'large_airport'])
us_airports = airports[in_us & is_airport]

# "International" in the name marks an international airport. na=False treats
# a missing name as "no match"; without it a NaN in the mask makes the boolean
# selection fail. Airports without a municipality cannot be matched to a port
# city and are dropped as well.
is_international = us_airports['name'].str.contains('International', na=False)
has_municipality = us_airports['municipality'].notnull()
us_intl_airports = us_airports[is_international & has_municipality]

print(us_intl_airports.shape)

# 'coordinates' holds "<longitude>, <latitude>". Converting to float also
# removes the blank after the comma, which a plain split leaves in front of
# the latitude value.
long_lat = us_intl_airports['coordinates'].str.split(',', expand=True).astype(float)
long_lat.columns = ['longitude', 'latitude']

us_intl_final = (
    pd.concat([us_intl_airports, long_lat], axis=1)
    .drop('coordinates', axis=1)
    .assign(
        # lower-case so the name compares equal to the lower-cased port city
        municipality=lambda frame: frame['municipality'].str.lower(),
        # state part of iso_region, e.g. 'US-PA' -> 'PA'
        airport_state=lambda frame: frame['iso_region'].str.split('-', n=1).str[1],
    )
)

# port codes as written to port_path, with lower-cased city names
# (us_city_code is reused by later cells)
us_city_code = pd.read_csv(port_path)
us_city_code['city'] = us_city_code['city'].str.lower()

# Match on city AND state: a city name alone is not unique across states and
# would attach a port code to same-named airports' cities in other states.
us_intl_final = (
    us_city_code
    .merge(us_intl_final,
           left_on=['city', 'state_code'],
           right_on=['municipality', 'airport_state'])
    [['ident', 'code', 'city', 'state_code',
      'type', 'name', 'elevation_ft', 'gps_code',
      'iata_code', 'local_code', 'latitude',
      'longitude']]
    .rename(columns={'code': 'city_code', 'ident': 'airport_id'})
)

(224, 12)


In [17]:
# write the airport table without the row index
# NOTE(review): the file name is spelled 'interantional'; left unchanged in
# case other code reads the file under this name - confirm before renaming.
us_intl_path = os.path.join(output_dir, 'us_interantional_airport_codes.csv')
us_intl_final.to_csv(us_intl_path, index=False)

### Create us_states dataframe

In [18]:
# city demographics; the file is semicolon-separated
us_cities = pd.read_csv(us_cities_path, sep=';')

In [19]:
# list the available columns before selecting a subset
us_cities.columns

Index(['City', 'State', 'Median Age', 'Male Population', 'Female Population',
       'Total Population', 'Number of Veterans', 'Foreign-born',
       'Average Household Size', 'State Code', 'Race', 'Count'],
      dtype='object')

In [20]:
# columns kept from the demographics file, in output order
# ('Median Age' and 'Male Population' are not carried over)
columns = [
    'City',
    'State Code',
    'State',
    'Total Population',
    'Female Population',
    'Number of Veterans',
    'Foreign-born',
    'Average Household Size',
    'Race',
    'Count',
]

In [21]:
# keep only the selected columns, in the order given by `columns`
us_cities = us_cities[columns]

In [22]:
# estimated household count per row: total population divided by the average
# household size, rounded to a whole number
us_cities['num_households'] = np.round(us_cities['Total Population']/us_cities['Average Household Size'])

In [23]:
# preview, now including num_households
us_cities.head()

Unnamed: 0,City,State Code,State,Total Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Race,Count,num_households
0,Silver Spring,MD,Maryland,82463,41862.0,1562.0,30908.0,2.6,Hispanic or Latino,25924,31717.0
1,Quincy,MA,Massachusetts,93629,49500.0,4147.0,32935.0,2.39,White,58723,39175.0
2,Hoover,AL,Alabama,84839,46799.0,4819.0,8229.0,2.58,Asian,4759,32883.0
3,Rancho Cucamonga,CA,California,175232,87105.0,5821.0,33878.0,3.18,Black or African-American,24437,55104.0
4,Newark,NJ,New Jersey,281913,143873.0,5829.0,86253.0,2.73,White,76402,103265.0


In [24]:
# Mean race count per (state, race) over the rows of us_cities, pivoted so
# that every race becomes a column and every state a row.
# 'Count' is selected BEFORE aggregating: calling .mean() on the whole group
# also tries to average the text columns (City, State), which pandas 2.x
# rejects with a TypeError, and it computes means nobody uses.
us_states_race = (
    us_cities
    .groupby(['State Code', 'Race'])['Count']
    .mean()
    .unstack()
)

In [25]:
# preview the per-state mean counts (one column per race)
us_states_race.head()

Race,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
State Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AK,36339.0,36825.0,23107.0,27261.0,212696.0
AL,1347.333333,4109.857143,74438.285714,5616.142857,71274.285714
AR,1563.5,4412.4,24934.666667,12968.833333,64122.166667
AZ,8106.75,14323.9375,18513.875,94259.8125,224475.6875
CA,3087.584615,33409.779412,15051.536765,71944.992701,108796.562044


In [26]:
# divide every row by its own total, turning the per-race means of a state
# into shares of that state's row sum
us_states_race = us_states_race.div(us_states_race.sum(axis=1), axis=0)

In [27]:
# One row per (City, State Code) - the first occurrence is kept - with the
# city name and the per-race columns removed; this is the input for the
# state-level sums.
us_states = (
    us_cities
    .drop_duplicates(['City', 'State Code'])
    .drop(['Race', 'City', 'Count'], axis=1)
)

In [28]:
# preview the de-duplicated city rows before aggregation
us_states.head()

Unnamed: 0,State Code,State,Total Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,num_households
0,MD,Maryland,82463,41862.0,1562.0,30908.0,2.6,31717.0
1,MA,Massachusetts,93629,49500.0,4147.0,32935.0,2.39,39175.0
2,AL,Alabama,84839,46799.0,4819.0,8229.0,2.58,32883.0
3,CA,California,175232,87105.0,5821.0,33878.0,3.18,55104.0
4,NJ,New Jersey,281913,143873.0,5829.0,86253.0,2.73,103265.0


In [29]:
# Sum the city-level figures per state. 'Average Household Size' is not part
# of the aggregation and therefore drops out of the result.
summed_columns = [
    'Total Population',
    'Female Population',
    'Number of Veterans',
    'Foreign-born',
    'num_households',
]
us_states = (
    us_states
    .groupby(['State Code', 'State'])
    .agg({column: 'sum' for column in summed_columns})
)

In [30]:
# state-level average household size: summed population divided by summed
# household count, rounded to two decimals
us_states['avg_households'] = np.round(us_states['Total Population']/us_states['num_households'], 2)

In [31]:
# add the per-race share columns; us_states is indexed by (State Code, State)
# and us_states_race by State Code
us_states = us_states.join(us_states_race)

In [32]:
# write the state table; the (State Code, State) index is written as the
# leading columns
us_states_path = os.path.join(output_dir, 'us_states.csv')
us_states.to_csv(us_states_path)

### Create us_cities dataframe

In [33]:
# us_cities still has one row per city and race at this point
us_cities.head()

Unnamed: 0,City,State Code,State,Total Population,Female Population,Number of Veterans,Foreign-born,Average Household Size,Race,Count,num_households
0,Silver Spring,MD,Maryland,82463,41862.0,1562.0,30908.0,2.6,Hispanic or Latino,25924,31717.0
1,Quincy,MA,Massachusetts,93629,49500.0,4147.0,32935.0,2.39,White,58723,39175.0
2,Hoover,AL,Alabama,84839,46799.0,4819.0,8229.0,2.58,Asian,4759,32883.0
3,Rancho Cucamonga,CA,California,175232,87105.0,5821.0,33878.0,3.18,Black or African-American,24437,55104.0
4,Newark,NJ,New Jersey,281913,143873.0,5829.0,86253.0,2.73,White,76402,103265.0


In [34]:
# race counts in long form: one row per city, state and race
us_cities_race = us_cities[['City', 'State Code', 'Race', 'Count']]

In [35]:
# pivot the Race level into columns: one row per (City, State Code), one
# column per race underneath the 'Count' column level
us_cities_race = us_cities_race.set_index(['City', 'State Code', 'Race']).unstack(-1)

In [36]:
# drop the 'Count' column level; a race that has no row for a city is
# recorded as a count of 0
us_cities_race = us_cities_race['Count'].fillna(0)

In [37]:
# preview the wide race-count table
us_cities_race.head()

Unnamed: 0_level_0,Race,American Indian and Alaska Native,Asian,Black or African-American,Hispanic or Latino,White
City,State Code,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Abilene,TX,1813.0,2929.0,14449.0,33222.0,95487.0
Akron,OH,1845.0,9033.0,66551.0,3684.0,129192.0
Alafaya,FL,0.0,10336.0,6577.0,34897.0,63666.0
Alameda,CA,1329.0,27984.0,7364.0,8265.0,44232.0
Albany,GA,445.0,650.0,53440.0,1783.0,17160.0


In [38]:
# keep the first row per (City, State Code)
# presumably the city-level columns are identical across a city's race rows -
# TODO confirm
us_cities = us_cities.drop_duplicates(['City', 'State Code'])

In [39]:
# index by (City, State Code) and remove the long-form race columns, which
# are replaced by the wide race table in the next step
us_cities = us_cities.set_index(['City', 'State Code']).drop(['Race', 'Count'], axis=1)

In [40]:
# add one count column per race, matched on the (City, State Code) index
us_cities = us_cities.join(us_cities_race)

In [41]:
# convert each race count into a fraction of the city's total population
race_columns = [
    'American Indian and Alaska Native',
    'Asian',
    'Black or African-American',
    'Hispanic or Latino',
    'White',
]
us_cities[race_columns] = us_cities[race_columns].div(us_cities['Total Population'], axis=0)

In [62]:
# move City and State Code from the index back into regular columns
us_cities = us_cities.reset_index()

In [63]:
# lower-case the city names so they compare equal to us_city_code.city,
# which is lower-cased as well
us_cities.City = us_cities.City.str.lower()

In [68]:
# Attach the port code to every city that has one, matching on both city name
# and state code; the duplicate key columns from the right-hand side are
# dropped afterwards.
us_cities = (
    us_city_code
    .merge(us_cities,
           left_on=['city', 'state_code'],
           right_on=['City', 'State Code'])
    .drop(['City', 'State Code'], axis=1)
)

In [42]:
# write the city table
# NOTE(review): unlike the airport and temperature exports, this call does not
# pass index=False, so the row index left by the merge is written as an
# unnamed first column - confirm whether consumers expect that column.
us_cities_path = os.path.join(output_dir, 'us_cities.csv')
us_cities.to_csv(us_cities_path)

### Temperature dataframe

In [43]:
# load the temperature file from the configured path (about 8.6 million rows
# according to the info() output below)
fname = temp_path
df = pd.read_csv(fname)

In [44]:
# column dtypes and memory usage right after loading
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
dt                               object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                             object
Country                          object
Latitude                         object
Longitude                        object
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [45]:
# parse the 'dt' strings into datetime values
df['dt'] = pd.to_datetime(df['dt'])

In [46]:
# confirm that 'dt' is now a datetime column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
dt                               datetime64[ns]
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                             object
Country                          object
Latitude                         object
Longitude                        object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 459.2+ MB


In [47]:
# keep only the rows that belong to the most recent year in the data
record_year = df['dt'].dt.year
temp_df = df[record_year == record_year.max()]

In [48]:
# size and null counts of the latest-year subset
temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31590 entries, 3230 to 8599211
Data columns (total 7 columns):
dt                               31590 non-null datetime64[ns]
AverageTemperature               28520 non-null float64
AverageTemperatureUncertainty    28520 non-null float64
City                             31590 non-null object
Country                          31590 non-null object
Latitude                         31590 non-null object
Longitude                        31590 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 1.9+ MB


In [49]:
# restrict the latest-year rows to the United States
us_temp_df = temp_df[temp_df.Country == 'United States']

In [50]:
# size and null counts of the US subset
us_temp_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2313 entries, 49871 to 8439246
Data columns (total 7 columns):
dt                               2313 non-null datetime64[ns]
AverageTemperature               2312 non-null float64
AverageTemperatureUncertainty    2312 non-null float64
City                             2313 non-null object
Country                          2313 non-null object
Latitude                         2313 non-null object
Longitude                        2313 non-null object
dtypes: datetime64[ns](1), float64(2), object(4)
memory usage: 144.6+ KB


In [51]:
# preview the US temperature rows
us_temp_df.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
49871,2013-01-01,6.32,0.267,Abilene,United States,32.95N,100.53W
49872,2013-02-01,8.116,0.222,Abilene,United States,32.95N,100.53W
49873,2013-03-01,12.503,0.273,Abilene,United States,32.95N,100.53W
49874,2013-04-01,15.753,0.342,Abilene,United States,32.95N,100.53W
49875,2013-05-01,22.545,0.125,Abilene,United States,32.95N,100.53W


In [52]:
# the port-code table the temperature rows are matched against (city names
# are lower-case)
us_city_code.head()

Unnamed: 0,code,city,state_code
0,ALC,alcan,AK
1,ANC,anchorage,AK
2,BAR,baker aaf - baker island,AK
3,DAC,daltons cache,AK
4,PIZ,dew station pt lay dew,AK


In [53]:
# Lower-case the city names so they compare equal to us_city_code.city.
# assign() returns a new frame instead of writing into us_temp_df, which is a
# filtered slice of temp_df; assigning to the column of that slice is what
# triggered pandas' SettingWithCopyWarning.
us_temp_df = us_temp_df.assign(City=us_temp_df['City'].str.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [54]:
# Attach the port code to every temperature row whose city name equals a port
# city, then keep only the code (renamed to city_code) and the measurements.
# NOTE(review): the match is on city name alone; the temperature columns shown
# above contain no state, so same-named cities cannot be told apart here.
us_temp_df = (
    us_city_code
    .merge(us_temp_df, left_on='city', right_on='City')
    .drop(['Country', 'City', 'city', 'state_code'], axis=1)
    .rename(columns={'code': 'city_code'})
)

In [55]:
# preview the temperature rows keyed by city_code
us_temp_df.head()

Unnamed: 0,city_code,dt,AverageTemperature,AverageTemperatureUncertainty,Latitude,Longitude
0,ANC,2013-01-01,-10.059,0.528,61.88N,151.13W
1,ANC,2013-02-01,-8.327,0.546,61.88N,151.13W
2,ANC,2013-03-01,-8.634,0.555,61.88N,151.13W
3,ANC,2013-04-01,-6.421,0.355,61.88N,151.13W
4,ANC,2013-05-01,3.764,0.523,61.88N,151.13W


In [57]:
# write the temperature table without the row index
# NOTE(review): the file name hard-codes 2013, while the rows are selected as
# "the latest year in the data" (2013 in the preview above) - the name goes
# stale if the input file gains newer data.
us_temp_path = os.path.join(output_dir, 'us_temperature_2013.csv')
us_temp_df.to_csv(us_temp_path, index=False)