In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from __future__ import print_function
import numpy as np
import datetime
import csv

In [None]:
from zipfile import ZipFile
from io import TextIOWrapper
import tensorflow as tf

In [None]:
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [None]:
# Directory holding the competition's zipped CSVs; every read in this notebook prepends it.
PREFIX = '/kaggle/input/predict-west-nile-virus/'

In [None]:
# Training table as a DataFrame, read directly from the zipped CSV.
df_train = pd.read_csv(PREFIX + "train.csv.zip", compression='zip')

In [None]:
# Display the training table.
df_train

In [None]:
# Weather table as a DataFrame, read directly from the zipped CSV.
df_weather = pd.read_csv(PREFIX + 'weather.csv.zip', compression='zip')

In [None]:
# Display the weather table.
df_weather

In [None]:
# Peek at the first weather record exactly as csv.DictReader yields it (all values are raw strings).
with ZipFile(PREFIX + 'weather.csv.zip') as zf:
    first_row = next(csv.DictReader(TextIOWrapper(zf.open('weather.csv'))), None)
    if first_row is not None:
        print(first_row)

In [None]:
# Species name -> 6-character string of 0/1 flags; assemble_X converts each character to a float feature.
# Flag positions, left to right: RESTUANS, TERRITANS, PIPIENS, ERRATICUS, SALINARIUS, TARSALIS.
# The combined 'CULEX PIPIENS/RESTUANS' label sets both the RESTUANS and the PIPIENS flag.
species_map = {'CULEX RESTUANS' : "100000",
              'CULEX TERRITANS' : "010000", 
              'CULEX PIPIENS'   : "001000", 
              'CULEX PIPIENS/RESTUANS' : "101000", 
              'CULEX ERRATICUS' : "000100", 
              'CULEX SALINARIUS': "000010", 
              'CULEX TARSALIS' :  "000001",
              'UNSPECIFIED CULEX': "001000"} # Treating unspecified as PIPIENS (http://www.ajtmh.org/content/80/2/268.full)

def date(text):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.date``.

    Raises ``ValueError`` (from ``strptime``) when the text does not match
    that format.
    """
    parsed = datetime.datetime.strptime(text, "%Y-%m-%d")
    return parsed.date()
    
def precip(text):
    """Convert a precipitation field to a float.

    Surrounding whitespace is ignored. ``"M"`` yields ``None``, ``"T"``
    yields the small constant ``TRACE``, and anything else is passed to
    ``float`` (which raises ``ValueError`` for non-numeric text).
    """
    TRACE = 1e-3
    sentinels = {"M": None, "T": TRACE}
    cleaned = text.strip()
    if cleaned in sentinels:
        return sentinels[cleaned]
    return float(cleaned)

In [None]:
def impute_missing_weather_station_values(weather):
    """Fill gaps in one station's readings with the other station's values.

    ``weather`` maps a key to a two-element list ``[station0, station1]`` whose
    entries are dicts of readings (or ``None`` when the station has no record
    at all). The structure is modified in place and nothing is returned.

    For example, if the temperature is unavailable for the first station it is
    taken from the second one. A field that is ``None`` at both stations stays
    ``None``.
    """
    for readings in weather.values():
        # A station with no record at all shares the other station's dict.
        if readings[0] is None:
            readings[0] = readings[1]
        elif readings[1] is None:
            readings[1] = readings[0]
        # Field-level gaps: first patch station 0 from station 1, then the reverse.
        for target, donor in ((readings[0], readings[1]), (readings[1], readings[0])):
            for field in target:
                if target[field] is None:
                    target[field] = donor[field]

In [None]:
def load_weather():
    """Load the zipped weather CSV into a per-date, per-station lookup.

    Returns:
        dict mapping ``datetime.date`` to a two-element list
        ``[station_1_row, station_2_row]``. Each row is the ``csv.DictReader``
        dict for that reading, with the columns listed in ``converters``
        parsed to Python values (``None`` where the file has ``"M"``). All
        other columns stay as raw strings. Gaps are then filled from the other
        station by ``impute_missing_weather_station_values``.

    Raises:
        AssertionError: if a station number other than 1 or 2 appears, or the
            same (date, station) pair is seen twice.
    """
    # Built once up front; the table does not change from row to row.
    converters = {"Date" : date,
                  "Tmax" : float, "Tmin" : float, "Tavg" : float,
                  "DewPoint" : float, "WetBulb" : float,
                  "PrecipTotal" : precip,
                  "Depart" : float,
                  "ResultSpeed" : float, "ResultDir" : float, "AvgSpeed" : float,
                  "StnPressure" : float, "SeaLevel" : float}
    weather = {}
    with ZipFile(PREFIX + 'weather.csv.zip') as zf:
        # The wrapper is closed explicitly, which also closes the archive member.
        with TextIOWrapper(zf.open('weather.csv')) as text:
            for line in csv.DictReader(text):
                for name, converter in converters.items():
                    x = line[name].strip()
                    # "M" is checked here so the plain float converters never see it.
                    line[name] = converter(x) if (x != "M") else None
                # The file numbers stations 1 and 2; list slots are 0 and 1.
                station = int(line["Station"]) - 1
                assert station in [0,1]
                dt = line["Date"]
                if dt not in weather:
                    weather[dt] = [None, None]
                assert weather[dt][station] is None, "duplicate weather reading {0}:{1}".format(dt, station)
                weather[dt][station] = line
    impute_missing_weather_station_values(weather)
    return weather

In [None]:
def _load_zipped_csv(archive_name, member_name, converters):
    """Read one CSV member from a zip under ``PREFIX`` into a list of row dicts.

    Args:
        archive_name: zip file name, relative to ``PREFIX``.
        member_name: name of the CSV file inside the archive.
        converters: dict mapping a column name to a callable applied to that
            column's raw string; columns not listed stay as strings.

    Returns:
        list of ``csv.DictReader`` row dicts, in file order.
    """
    records = []
    with ZipFile(PREFIX + archive_name) as zf:
        with TextIOWrapper(zf.open(member_name)) as text:
            for row in csv.DictReader(text):
                for column, convert in converters.items():
                    row[column] = convert(row[column])
                records.append(row)
    return records


def load_training():
    """Load the training rows with Date, coordinates, NumMosquitos and WnvPresent parsed."""
    return _load_zipped_csv('train.csv.zip', 'train.csv',
                            {"Date" : date,
                             "Latitude" : float, "Longitude" : float,
                             "NumMosquitos" : int, "WnvPresent" : int})


def load_testing():
    """Load the test rows with Date and coordinates parsed; other columns stay as strings."""
    return _load_zipped_csv('test.csv.zip', 'test.csv',
                            {"Date" : date,
                             "Latitude" : float, "Longitude" : float})

In [None]:
def closest_station(lat, longi):
    """Return the index (0 or 1) of the weather station nearest to a point.

    Latitude and longitude are compared as plain rectangular coordinates,
    which the original author considers adequate because Chicago is small.
    """
    station_coords = np.array([[41.995, -87.933],
                               [41.786, -87.752]])
    point = np.array([lat, longi])
    offsets = station_coords - point[None, :]
    squared_distance = np.square(offsets).sum(axis=1)
    return squared_distance.argmin()

In [None]:
def normalize(X, mean=None, std=None):
    """Impute NaNs with the column mean and standardise ``X`` in place.

    Args:
        X: 2-D array of shape (rows, features); modified in place.
        mean: per-column means to use. When ``None`` they are computed from
            ``X`` with ``np.nanmean`` (NaNs ignored).
        std: per-column standard deviations to use. When ``None`` they are
            computed from ``X`` after the NaNs have been filled.

    Returns:
        ``(mean, std)`` — the statistics that were supplied or computed, so
        the same transformation can be applied to another array.

    A column whose standard deviation is 0 (a constant column) is only
    centred, not scaled. Dividing by 0 would otherwise fill it with NaN/inf.
    """
    if mean is None:
        mean = np.nanmean(X, axis=0)
    mean_row = np.asarray(mean)

    # Replace each missing value with the mean of its own column.
    missing = np.isnan(X)
    if missing.any():
        X[missing] = np.broadcast_to(mean_row, X.shape)[missing]

    if std is None:
        std = np.std(X, axis=0)
    std_row = np.asarray(std)

    # Guard the divisor only; the std handed back to the caller is untouched.
    divisor = np.where(std_row == 0, np.ones_like(std_row), std_row)
    X[...] = (X - mean_row) / divisor
    return mean, std

In [None]:
def scaled_count(record):
    """Return how many times a record should be repeated in the data set.

    Records carrying a ``"NumMosquitos"`` value get ``ceil(value / 10)``
    repeats. Records without that key (the test data) get exactly 1.
    """
    SCALE = 10.0
    if "NumMosquitos" in record:
        return int(np.ceil(record["NumMosquitos"] / SCALE))
    # This is test data
    return 1

In [None]:
def assemble_X(base, weather):
    """Build the float32 feature matrix for a list of trap records.

    Args:
        base: iterable of record dicts with ``"Date"`` (a ``datetime.date``),
            ``"Latitude"``, ``"Longitude"`` and ``"Species"``. Training records
            also carry ``"NumMosquitos"``, which ``scaled_count`` reads.
        weather: dict mapping a date to ``[station0_row, station1_row]``, as
            produced by ``load_weather``.

    Returns:
        ``np.float32`` array with one row per repeated record and 53 columns:
        year, month, day, latitude, longitude, then 7 weather observations for
        each of 6 past days, then the 6 species flags. ``None`` weather values
        become NaN.

    Raises:
        KeyError: if ``weather`` has no entry for one of the look-back dates
            or the species is not in ``species_map``.
    """
    lags = (1, 2, 3, 5, 8, 13)
    observations = ("Tmax", "Tmin", "Tavg", "DewPoint", "WetBulb", "PrecipTotal", "Depart")

    rows = []
    for record in base:
        # Named `when` rather than `date` so the module-level date() parser is not shadowed.
        when = record["Date"]
        lat, longi = record["Latitude"], record["Longitude"]
        # The nearest station depends only on the trap location, so look it up once per record.
        station = closest_station(lat, longi)

        features = [when.year, when.month, when.day, lat, longi]
        # Look at a selection of past weather values
        for days_ago in lags:
            reading = weather[when - datetime.timedelta(days=days_ago)][station]
            features.extend(reading[obs] for obs in observations)
        # Specify which mosquitos are present
        features.extend(float(flag) for flag in species_map[record["Species"]])

        # Weight each observation by the number of mosquitos seen. Test data
        # has no NumMosquitos column, so scaled_count returns 1 there. Per the
        # original author's note, this also happens to account for the
        # multiple entries that result from >50 mosquitos on one day.
        rows.extend([features] * scaled_count(record))
    return np.asarray(rows, dtype=np.float32)

In [None]:
def assemble_y(base):
    """Build the label column matching the rows produced by ``assemble_X``.

    Each record's ``"WnvPresent"`` value is repeated ``scaled_count(record)``
    times. The result is an ``np.int32`` array of shape (rows, 1).
    """
    labels = []
    for record in base:
        labels.extend([record["WnvPresent"]] * scaled_count(record))
    return np.asarray(labels, dtype=np.int32).reshape(-1, 1)

In [None]:
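# NOTE: AdjustVariable was written for a different NN library (it relies on get_value/set_value); nothing visible in this notebook calls it.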

class AdjustVariable(object):
    """Callback that moves a variable part of the way towards ``target`` on each call.

    ``variable`` must offer ``get_value()`` and ``set_value()``. Every call
    divides the remaining gap to ``target`` by ``2 ** (1 / half_life)``, so
    the gap halves after ``half_life`` calls.
    """

    def __init__(self, variable, target, half_life=20):
        self.variable, self.target, self.half_life = variable, target, half_life

    def __call__(self, nn, train_history):
        # `nn` and `train_history` are accepted but not used.
        gap = self.variable.get_value() - self.target
        shrink = 2 ** (1.0 / self.half_life)
        gap /= shrink
        self.variable.set_value(np.float32(self.target + gap))

# Loading Train

In [None]:
# Per-date readings for both weather stations, with gaps already filled in by load_weather().
weather = load_weather()

In [None]:
# Show only the first couple of dates; echoing every entry floods the cell output.
list(weather.items())[:2]

In [None]:
# Training records as a list of dicts, one per CSV row, with the typed columns already parsed.
training = load_training()

In [None]:
# Feature matrix; each record is repeated scaled_count(record) times.
X = assemble_X(training, weather)
# normalize() rewrites X in place; the returned statistics are reused for the test set later.
mean, std = normalize(X)
# Labels repeated with the same scaled_count rule, so rows of y line up with rows of X.
y = assemble_y(training)

In [None]:
# Show only the first few records; echoing the whole list floods the cell output.
training[:3]

In [None]:
# Shape of a single feature vector; the model's first layer uses this as its input_shape.
X[0].shape

In [None]:
# Two 200-unit ReLU layers, each followed by 40% dropout, then a single sigmoid
# unit that outputs the probability for the binary WnvPresent label.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(200, activation='relu', input_shape=X[0].shape))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(200, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Plain SGD: no momentum, no Nesterov acceleration.
opt = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.0, nesterov=False, name="SGD")
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Hold out 20% for validation. A fixed seed makes the split repeatable, and
# stratifying keeps both classes in the hold-out set (roc_auc_score raises when
# only one class is present).
# NOTE(review): assemble_X repeats records by scaled_count, so copies of one
# record can land on both sides of this split — treat the validation AUC as optimistic.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y.ravel()
)

In [None]:
# Report loss/accuracy on the hold-out split after every epoch. The History
# object is kept in a variable so the cell does not echo its repr.
history = model.fit(x=X_train, y=y_train, epochs=60,
                    validation_data=(X_valid, y_valid))

In [None]:
# Sequential.predict_proba was removed from Keras (TF 2.6). With a sigmoid output
# layer, predict() already returns the class-1 probability. Show a few rows only.
model.predict(X_train)[:5]

In [None]:
# ROC AUC on both splits. predict() replaces the removed predict_proba(), and
# labels and scores are flattened to 1-D before scoring.
train_scores = model.predict(X_train).ravel()
valid_scores = model.predict(X_valid).ravel()

print("train:", metrics.roc_auc_score(y_train.ravel(), train_scores))

print("valid:", metrics.roc_auc_score(y_valid.ravel(), valid_scores))

# Loading Test set

In [None]:
# Load the test records here so this cell runs on a fresh kernel; it previously
# used `testing` and `predictions` before any cell had defined them.
testing = load_testing()
len(testing)

In [None]:
testing = load_testing()
X_testing = assemble_X(testing, weather)
# Reuse the training-set statistics so test features are on the same scale.
normalize(X_testing, mean, std)
# predict() replaces the removed Sequential.predict_proba(); column 0 is the sigmoid output.
predictions = model.predict(X_testing)[:, 0]
# Test records have no NumMosquitos, so scaled_count is 1 and there is one prediction per record.
assert len(predictions) == len(testing), "expected one prediction per test record"

# Build the frame column by column so WnvPresent stays numeric. Routing the values
# through one mixed str/float NumPy array would turn the probabilities into strings.
df_out = pd.DataFrame({"Id": [row["Id"] for row in testing],
                       "WnvPresent": predictions})

df_out.to_csv('west_nile_v5_pd.csv', index=False)

In [None]:
# The previous cell already built df_out and wrote the CSV; just preview the result here.
df_out.head()