# Imports

In [57]:
import pickle
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Data

In [77]:
df_test = pd.read_csv('../input/test.csv')
df_train = pd.read_csv('../input/train.csv')
df_weather = pd.read_csv('../riordan/data/weather_cleaned.csv')

# Datetime

In [78]:
df_test['Date'] = pd.to_datetime(df_test['Date'])
df_test.set_index('Date',inplace=True)

df_train['Date'] = pd.to_datetime(df_train['Date'])
df_train.set_index('Date',inplace=True)

df_weather['Date'] = pd.to_datetime(df_weather['Date'])
df_weather.set_index('Date',inplace=True)

# Rolling Weather

In [79]:
def random_roll(weather_cols):
    for w in weather_cols:
        days = np.random.choice(range(3,30),1)[0]
        df_weather[w+'_roll_'+str(days)] = df_weather[w].rolling(days).mean()

In [80]:
random_roll(['ResultSpeed','PrecipTotal','DewPoint','AvgSpeed','Heat','Tmax','Tmin'])

In [81]:
# df_weather['ResultSpeed_21'] = df_weather['ResultSpeed'].rolling(21).mean()
# df_weather['PrecipTotal_15'] = df_weather['PrecipTotal'].rolling(15).sum()
# df_weather['DewPoint_16'] = df_weather['DewPoint'].rolling(16).mean()
# df_weather['AvgSpeed_19'] = df_weather['AvgSpeed'].rolling(19).mean()
# df_weather['Heat_28'] = df_weather['Heat'].rolling(28).mean()
# df_weather['Tmax_4'] = df_weather['Tmax'].rolling(4).mean()
# df_weather['Tmin_8'] = df_weather['Tmin'].rolling(8).mean()

# Round Lat & Long

In [82]:
df_train['Lat_int'] = df_train['Latitude'].apply(int)
df_train['Long_int'] = df_train['Longitude'].apply(int)
df_test['Lat_int'] = df_test['Latitude'].apply(int)
df_test['Long_int'] = df_test['Longitude'].apply(int)

# Merge

In [83]:
df_train = pd.merge(left=df_train,right=df_weather,on='Date')
df_test = pd.merge(left=df_test,right=df_weather,on='Date')

# Drops

In [84]:
df_train.drop(['Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed','Address', 'Block','Latitude','Longitude',
         'AddressNumberAndStreet','AddressAccuracy','NumMosquitos'],axis=1,inplace=True)

df_test.drop(['Id','Station', 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'Heat',
       'Cool', 'PrecipTotal', 'StnPressure', 'SeaLevel', 'ResultSpeed',
       'ResultDir', 'AvgSpeed','Address', 'Block','Latitude','Longitude',
         'AddressNumberAndStreet','AddressAccuracy'],axis=1,inplace=True)

# Binary Species

In [85]:
no_wn = ['CULEX ERRATICUS','CULEX SALINARIUS','CULEX TARSALIS','CULEX TERRITANS']
yes_wn = ['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS']

df_train['Species'] = df_train['Species'].map(lambda x: 1 if x in yes_wn else 0)
df_test['Species'] = df_test['Species'].map(lambda x: 1 if x in yes_wn else 0)

# Label Encoder

In [86]:
from sklearn import preprocessing

# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
# lbl.fit(list(df['Species'].values) + list(df1['Species'].values))
# df['Species'] = lbl.transform(df['Species'].values)
# df1['Species'] = lbl.transform(df1['Species'].values)

lbl.fit(list(df_train['Street'].values) + list(df_test['Street'].values))
df_train['Street'] = lbl.transform(df_train['Street'].values)
df_test['Street'] = lbl.transform(df_test['Street'].values)

lbl.fit(list(df_train['Trap'].values) + list(df_test['Trap'].values))
df_train['Trap'] = lbl.transform(df_train['Trap'].values)
df_test['Trap'] = lbl.transform(df_test['Trap'].values)

# Final Drops

In [87]:
# df_train.drop(['Species','Lat_int', 'Long_int'],axis=1,inplace=True)
# df_test.drop(['Species','Lat_int', 'Long_int'],axis=1,inplace=True)

# Model

In [72]:
X = df_train.drop('WnvPresent',axis=1)
y = df_train['WnvPresent']

In [91]:
ss = StandardScaler()
pca = PCA(random_state=3,n_components=1)
rf = RandomForestClassifier(n_estimators=1000,n_jobs=3)

In [92]:
X_train_ss = ss.fit_transform(X_train)
X_test_ss = ss.transform(df_test)

X_train_pca = pca.fit_transform(X_train_ss)
X_test_pca = pca.transform(X_test_ss)

rf.fit(X_train_pca,y_train);

test_preds = rf.predict_proba(X_test_pca)

In [93]:
submit = pd.read_csv('../input/sampleSubmission.csv')
submit['WnvPresent'] = 1-test_preds
submit.to_csv('lbl_rm.csv',index=False)

In [76]:
features = pd.DataFrame({'Feature':X.columns,'Weight':rf.feature_importances_})
features

ValueError: arrays must all be same length

In [None]:
#features.to_csv('0.71717.csv',index=False)