In [1]:
import numpy as np
import pandas as pd
import torch

import datetime

In [2]:
weather_df = pd.read_csv('../input/weather.csv')
spray_df = pd.read_csv('../input/spray.csv')
train_df = pd.read_csv('../input/train.csv')

In [3]:
weather_df.drop(['CodeSum', 'Depth', 'Water1', 'SnowFall'],
    axis=1,
    inplace=True)
    
train_df.drop(['Address','Block', 'Street', 'Trap', 'AddressNumberAndStreet','AddressAccuracy'],
    axis=1,
    inplace=True)

In [4]:
def to_float(x):
    if x == "M":
        return None
    return float(x)

def precip(text):
    TRACE = 1e-3
    text = text.strip()
    if text == "M":
        return None
    if text == "-":
        return None
    if text == "T":
        return TRACE
    return float(text)

species_map = {
    "CULEX RESTUANS": "100000",
    "CULEX TERRITANS": "010000",
    "CULEX PIPIENS": "001000",
    "CULEX PIPIENS/RESTUANS": "101000",
    "CULEX ERRATICUS": "000100",
    "CULEX SALINARIUS": "000010",
    "CULEX TARSALIS": "000001",
    "UNSPECIFIED CULEX": "001000",
}

weather_float = ['Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Depart', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 'StnPressure', 'SeaLevel']

weather_precip = ["Heat", "Cool", "Sunrise", "Sunset", "PrecipTotal"]

In [5]:
weather_df["Date"] = pd.to_datetime(weather_df['Date'], format="%Y-%m-%d")

for column in weather_df.columns:
    if column in weather_float:
        weather_df[column] = weather_df[column].apply(to_float)
    elif column in weather_precip:
        weather_df[column] = weather_df[column].apply(precip)

In [6]:
train_df["Date"] = pd.to_datetime(train_df['Date'], format="%Y-%m-%d")
train_df["Species"] = train_df["Species"].map(species_map).astype("int64")
# train_df["Species"] = train_df["Species"].astype("int64")

In [7]:
train_df.head()

Unnamed: 0,Date,Species,Latitude,Longitude,NumMosquitos,WnvPresent
0,2007-05-29,101000,41.95469,-87.800991,1,0
1,2007-05-29,100000,41.95469,-87.800991,1,0
2,2007-05-29,100000,41.994991,-87.769279,1,0
3,2007-05-29,101000,41.974089,-87.824812,1,0
4,2007-05-29,100000,41.974089,-87.824812,4,0


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          10506 non-null  datetime64[ns]
 1   Species       10506 non-null  int64         
 2   Latitude      10506 non-null  float64       
 3   Longitude     10506 non-null  float64       
 4   NumMosquitos  10506 non-null  int64         
 5   WnvPresent    10506 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3)
memory usage: 492.6 KB


In [9]:
weather_df.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83.0,50.0,67.0,14.0,51.0,56.0,0.0,2.0,448.0,1849.0,0.0,29.1,29.82,1.7,27.0,9.2
1,2,2007-05-01,84.0,52.0,68.0,,51.0,57.0,0.0,3.0,,,0.0,29.18,29.82,2.7,25.0,9.6
2,1,2007-05-02,59.0,42.0,51.0,-3.0,42.0,47.0,14.0,0.0,447.0,1850.0,0.0,29.38,30.09,13.0,4.0,13.4
3,2,2007-05-02,60.0,43.0,52.0,,42.0,47.0,13.0,0.0,,,0.0,29.44,30.08,13.3,2.0,13.4
4,1,2007-05-03,66.0,46.0,56.0,2.0,40.0,48.0,9.0,0.0,446.0,1851.0,0.0,29.39,30.12,11.7,7.0,11.9


In [10]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Station      2944 non-null   int64         
 1   Date         2944 non-null   datetime64[ns]
 2   Tmax         2944 non-null   float64       
 3   Tmin         2944 non-null   float64       
 4   Tavg         2933 non-null   float64       
 5   Depart       1472 non-null   float64       
 6   DewPoint     2944 non-null   float64       
 7   WetBulb      2940 non-null   float64       
 8   Heat         2933 non-null   float64       
 9   Cool         2933 non-null   float64       
 10  Sunrise      1472 non-null   float64       
 11  Sunset       1472 non-null   float64       
 12  PrecipTotal  2942 non-null   float64       
 13  StnPressure  2940 non-null   float64       
 14  SeaLevel     2935 non-null   float64       
 15  ResultSpeed  2944 non-null   float64       
 16  Result

In [11]:
def find_station(lat, lon):
    station_1 = np.array([41.995,-87.933])
    station_2 = np.array([41.785, -86.752])
    loc = np.array([lat,lon])

    d1 = np.sqrt((station_1[0] - lat)**2 + (station_1[1] - lon)**2)
    d2 = np.sqrt((station_2[0] - lat)**2 + (station_2[1] - lon)**2)

    if d1 < d2:
        return 1 
    elif d2 < d1:
        return 2


In [12]:
for i in range(10506):
    station = find_station(train_df.iloc[i]["Latitude"],
        train_df.iloc[i]["Longitude"])

    print(f"{i} is closer to station {station}")

0 is closer to station 1
1 is closer to station 1
2 is closer to station 1
3 is closer to station 1
4 is closer to station 1
5 is closer to station 1
6 is closer to station 1
7 is closer to station 1
8 is closer to station 1
9 is closer to station 1
10 is closer to station 1
11 is closer to station 1
12 is closer to station 1
13 is closer to station 1
14 is closer to station 1
15 is closer to station 1
16 is closer to station 1
17 is closer to station 1
18 is closer to station 1
19 is closer to station 1
20 is closer to station 1
21 is closer to station 1
22 is closer to station 1
23 is closer to station 1
24 is closer to station 1
25 is closer to station 1
26 is closer to station 1
27 is closer to station 1
28 is closer to station 1
29 is closer to station 1
30 is closer to station 1
31 is closer to station 1
32 is closer to station 1
33 is closer to station 1
34 is closer to station 1
35 is closer to station 1
36 is closer to station 1
37 is closer to station 1
38 is closer to statio