# Making a Data Dictionary

### We want to make a data dictionary in pandas to classify, define and give the datatype for each of the features that we'll use during modeling.

In [1]:
import pandas as pd

We'll first set up our columns and rows, and then fill in the data afterwards.

In [2]:
# Attributes recorded for every feature (the frame's column headers).
Columns = ['Category', 'Description', 'Datatype']

# Feature names, used as the row labels of the data dictionary.
Names = [
    'Address', 'AddressAccuracy', 'Heat', 'Heat_roll_28', 'Cool',
    'PrecipTotal', 'PrecipTotal_roll', 'WetBulb',
    'Tmin', 'Tmin_roll', 'Tmax', 'Tmax_roll', 'Tavg',
    'Depart', 'DewPoint', 'DewPoint_roll',
    'Sunrise', 'Sunset',
    'Depth', 'Water1', 'SnowFall',
    'StnPressure', 'SeaLevel',
    'ResultSpeed', 'ResultSpeed_roll', 'ResultDir', 'AvgSpeed',
    'Date', 'Street', 'Trap', 'Lat_int', 'Long_int',
    'Species', 'Block',
]

# Skeleton frame: one row per feature, one column per attribute, no values yet.
data_dict = pd.DataFrame(index=Names, columns=Columns)

We've set `Names` as our index. Let's sort it alphabetically to make the data dictionary easier to read:

In [3]:
# Order the rows alphabetically by feature name. The sorted frame is
# reassigned instead of using inplace=True, which pandas discourages
# (no performance benefit, and it prevents method chaining).
data_dict = data_dict.sort_index()

Now we'll go in and fill all of those rows/columns:

In [4]:
# Fill the 'AvgSpeed' row. A single .loc[row, column] call writes to data_dict
# itself; the chained form .loc[row][column] assigns through an intermediate
# object and is a no-op under pandas Copy-on-Write.
# Datatype is 'float' to agree with the later AvgSpeed cell and the displayed table.
data_dict.loc['AvgSpeed', 'Category'] = 'Weather'
data_dict.loc['AvgSpeed', 'Description'] = 'Daily average wind speed (MPH)'
data_dict.loc['AvgSpeed', 'Datatype'] = 'float'

In [5]:
# Fill the 'Block' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Block', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Block', 'Description'] = 'Aereal description, related to lat/long'
data_dict.loc['Block', 'Datatype'] = 'int'

In [6]:
# Fill the 'Cool' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Cool', 'Category'] = 'Weather'
data_dict.loc['Cool', 'Description'] = 'Departure (in degrees) from 65 degree fahrenheit baseline'
data_dict.loc['Cool', 'Datatype'] = 'int'

In [7]:
# Fill the 'Date' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Date', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Date', 'Description'] = 'Date in format YYYY-MM-DD)'
data_dict.loc['Date', 'Datatype'] = 'datetime'

In [8]:
# Fill the 'Depart' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Depart', 'Category'] = 'Weather'
data_dict.loc['Depart', 'Description'] = 'Temp departure from historical normal'
data_dict.loc['Depart', 'Datatype'] = 'float'

In [9]:
# Fill the 'DewPoint' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['DewPoint', 'Category'] = 'Weather'
data_dict.loc['DewPoint', 'Description'] = 'The temperature at which water condensates. A factor in measuring relative humidity.'
data_dict.loc['DewPoint', 'Datatype'] = 'float'

In [10]:
# Fill the 'DewPoint_roll' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['DewPoint_roll', 'Category'] = 'Weather'
data_dict.loc['DewPoint_roll', 'Description'] = 'Rolling mean of dewpoint column'
data_dict.loc['DewPoint_roll', 'Datatype'] = 'float'

In [11]:
# Fill the 'Heat' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Heat', 'Category'] = 'Weather'
data_dict.loc['Heat', 'Description'] = 'Departure (in degrees) from 65 degree fahrenheit baseline'
data_dict.loc['Heat', 'Datatype'] = 'int'

In [12]:
# Fill the 'Heat_roll_28' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['Heat_roll_28', 'Category'] = 'Weather'
data_dict.loc['Heat_roll_28', 'Description'] = 'Rolling mean of "heat" column'
data_dict.loc['Heat_roll_28', 'Datatype'] = 'int'

In [13]:
# Fill the 'PrecipTotal' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['PrecipTotal', 'Category'] = 'Weather'
data_dict.loc['PrecipTotal', 'Description'] = 'Amout of rainfall (in inches)'
data_dict.loc['PrecipTotal', 'Datatype'] = 'int'

In [14]:
# Fill the 'PrecipTotal_roll' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['PrecipTotal_roll', 'Category'] = 'Weather'
data_dict.loc['PrecipTotal_roll', 'Description'] = 'Rolling average rainfall'
data_dict.loc['PrecipTotal_roll', 'Datatype'] = 'int'

In [15]:
# Fill the 'WetBulb' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['WetBulb', 'Category'] = 'Weather'
data_dict.loc['WetBulb', 'Description'] = 'Temperature recorded via wetbulb thermometer. Takes into account humidity and ambient temp. Similar to heat index.'
data_dict.loc['WetBulb', 'Datatype'] = 'int'

In [16]:
# Fill the 'Tmin' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Tmin', 'Category'] = 'Weather'
data_dict.loc['Tmin', 'Description'] = 'Minimum temperature for a given day (fahrenheit)'
data_dict.loc['Tmin', 'Datatype'] = 'int'

In [17]:
# Fill the 'Tmin_roll' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Tmin_roll', 'Category'] = 'Weather'
data_dict.loc['Tmin_roll', 'Description'] = 'Minimum temperature rolling mean'
data_dict.loc['Tmin_roll', 'Datatype'] = 'int'

In [18]:
# Fill the 'Tmax' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Tmax', 'Category'] = 'Weather'
data_dict.loc['Tmax', 'Description'] = 'Maximum temperature for a given day (fahrenheit)'
data_dict.loc['Tmax', 'Datatype'] = 'int'

In [19]:
# Fill the 'Tmax_roll' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Tmax_roll', 'Category'] = 'Weather'
data_dict.loc['Tmax_roll', 'Description'] = 'Maximum temperature rolling mean'
data_dict.loc['Tmax_roll', 'Datatype'] = 'int'

In [20]:
# Fill the 'Tavg' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Tavg', 'Category'] = 'Weather'
data_dict.loc['Tavg', 'Description'] = 'Daily average temperature (farenheit)'
data_dict.loc['Tavg', 'Datatype'] = 'int'

In [21]:
# Fill the 'Sunrise' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Sunrise', 'Category'] = 'Weather'
data_dict.loc['Sunrise', 'Description'] = 'Time of sunrise (CST). Calculated, not observed.'
data_dict.loc['Sunrise', 'Datatype'] = 'int'

In [22]:
# Fill the 'Sunset' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Sunset', 'Category'] = 'Weather'
data_dict.loc['Sunset', 'Description'] = 'Time of sunset (CST). Calculated, not observed.'
data_dict.loc['Sunset', 'Datatype'] = 'int'

In [23]:
# Fill the 'Depth' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
# NOTE(review): this description is identical to the one given for 'SnowFall' -
# confirm that 'Depth' is not meant to describe something different.
data_dict.loc['Depth', 'Category'] = 'Weather'
data_dict.loc['Depth', 'Description'] = 'Depth of snowfall (in)'
data_dict.loc['Depth', 'Datatype'] = 'float'

In [24]:
# Fill the 'SnowFall' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['SnowFall', 'Category'] = 'Weather'
data_dict.loc['SnowFall', 'Description'] = 'Depth of snowfall (in)'
data_dict.loc['SnowFall', 'Datatype'] = 'float'

In [25]:
# Fill the 'Water1' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Water1', 'Category'] = 'Weather'
data_dict.loc['Water1', 'Description'] = 'Depth of snowmelt (in)'
data_dict.loc['Water1', 'Datatype'] = 'float'

In [26]:
# Fill the 'StnPressure' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['StnPressure', 'Category'] = 'Weather'
data_dict.loc['StnPressure', 'Description'] = 'Daily average atmospheric pressure (inches/HG)'
data_dict.loc['StnPressure', 'Datatype'] = 'float'

In [27]:
# Fill the 'SeaLevel' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['SeaLevel', 'Category'] = 'Weather'
data_dict.loc['SeaLevel', 'Description'] = 'Daily average sea level (feet)'
data_dict.loc['SeaLevel', 'Datatype'] = 'int'

In [28]:
# Fill the 'ResultSpeed' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['ResultSpeed', 'Category'] = 'Weather'
data_dict.loc['ResultSpeed', 'Description'] = 'Resultant wind speed (vector sum of wind and direction)'
data_dict.loc['ResultSpeed', 'Datatype'] = 'float'

In [29]:
# Fill the 'ResultSpeed_roll' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['ResultSpeed_roll', 'Category'] = 'Weather'
data_dict.loc['ResultSpeed_roll', 'Description'] = 'Rolling mean of resultant wind speed'
data_dict.loc['ResultSpeed_roll', 'Datatype'] = 'float'

In [30]:
# Fill the 'ResultDir' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['ResultDir', 'Category'] = 'Weather'
data_dict.loc['ResultDir', 'Description'] = 'Resultant wind direction (whole degress)'
data_dict.loc['ResultDir', 'Datatype'] = 'float'

In [31]:
# Fill the 'AvgSpeed' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
# NOTE: 'AvgSpeed' is also populated in an earlier cell; the values assigned
# here overwrite those, so this cell determines the final row.
data_dict.loc['AvgSpeed', 'Category'] = 'Weather'
data_dict.loc['AvgSpeed', 'Description'] = 'Daily average wind speed (MPH)'
data_dict.loc['AvgSpeed', 'Datatype'] = 'float'

In [32]:
# Fill the 'Street' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
# NOTE(review): 'int' for a street address looks unusual - confirm whether the
# modeling data encodes streets numerically or this should be 'str'.
data_dict.loc['Street', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Street', 'Description'] = 'Street address of mosquito trap'
data_dict.loc['Street', 'Datatype'] = 'int'

In [33]:
# Fill the 'Trap' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Trap', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Trap', 'Description'] = 'Mosquito trap id'
data_dict.loc['Trap', 'Datatype'] = 'str'

In [34]:
# Fill the 'Lat_int' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Lat_int', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Lat_int', 'Description'] = 'Latitude of trap rounded to nearest integer'
data_dict.loc['Lat_int', 'Datatype'] = 'int'

In [35]:
# Fill the 'Long_int' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Long_int', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Long_int', 'Description'] = 'Longitude of trap rounded to nearest integer'
data_dict.loc['Long_int', 'Datatype'] = 'int'

In [36]:
# Fill the 'Species' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Species', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Species', 'Description'] = 'Species of mosquito'
data_dict.loc['Species', 'Datatype'] = 'str'

In [37]:
# Fill the 'Address' row. A single .loc[row, column] call writes to data_dict
# itself; chained .loc[row][column] may assign to a temporary copy instead.
data_dict.loc['Address', 'Category'] = 'Spatial/Temporal'
data_dict.loc['Address', 'Description'] = 'Address of trap'
data_dict.loc['Address', 'Datatype'] = 'str'

In [38]:
# Fill the 'AddressAccuracy' row. A single .loc[row, column] call writes to
# data_dict itself; chained .loc[row][column] may assign to a temporary copy.
data_dict.loc['AddressAccuracy', 'Category'] = 'Spatial/Temporal'
data_dict.loc['AddressAccuracy', 'Description'] = 'Accuracy of trap address'
data_dict.loc['AddressAccuracy', 'Datatype'] = 'str'

Let's take a look at our dataframe and make sure it looks correct:

In [39]:
# Bare last expression: the notebook renders the frame so every row can be
# checked visually for missing or wrong entries.
data_dict

Unnamed: 0,Category,Description,Datatype
Address,Spatial/Temporal,Address of trap,str
AddressAccuracy,Spatial/Temporal,Accuracy of trap address,str
AvgSpeed,Weather,Daily average wind speed (MPH),float
Block,Spatial/Temporal,"Aereal description, related to lat/long",int
Cool,Weather,Departure (in degrees) from 65 degree fahrenhe...,int
Date,Spatial/Temporal,Date in format YYYY-MM-DD),datetime
Depart,Weather,Temp departure from historical normal,float
Depth,Weather,Depth of snowfall (in),float
DewPoint,Weather,The temperature at which water condensates. A ...,float
DewPoint_roll,Weather,Rolling mean of dewpoint column,float


All set! We'll save this as a CSV file so it can be imported into any of our other notebooks whenever we want to look up a particular feature.

In [40]:
# Persist the finished data dictionary so other notebooks can load it.
# The path is relative to the notebook's working directory.
data_dict.to_csv(path_or_buf='../data/data_dict.csv')