In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dengai-predicting-disease-spread/dengue_features_test.csv
/kaggle/input/dengai-predicting-disease-spread/dengue_features_train.csv
/kaggle/input/dengai-predicting-disease-spread/submission_format.csv
/kaggle/input/dengai-predicting-disease-spread/dengue_labels_train.csv


In [2]:
X = pd.read_csv('../input/dengai-predicting-disease-spread/dengue_features_train.csv')
y = pd.read_csv('../input/dengai-predicting-disease-spread/dengue_labels_train.csv')
X_test = pd.read_csv('../input/dengai-predicting-disease-spread/dengue_features_test.csv')
if ('week_start_date' in X.columns): 
    X = X.drop(['week_start_date'], axis=1) 
    X_test = X_test.drop(['week_start_date'], axis=1)

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

    
num_cols = [col for col in X.columns if X[col].dtypes in ['int64', 'float64']]
cat_cols = [col for col in X.columns if X[col].dtypes in ['object']]
num_transformer = Pipeline(steps=[('standardscaler', StandardScaler()), 
                                  ('impute', SimpleImputer(strategy='most_frequent'))])
cat_transformer = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')), 
                                  ('onehot', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers = [('num_transformer', num_transformer, num_cols), 
                                                 ('cat_transformer', cat_transformer, cat_cols)])
xgb = XGBRegressor(n_estimators = 1000, 
                     learning_rate=0.01, 
                     max_depth=5, 
                     subsample=0.8,
                     colsample_bytree=1, 
                     gamma=1)
rf = RandomForestRegressor(n_estimators=100)
pipeline = Pipeline(steps=[('preprocessor', preprocessor), 
                           ('model', rf)])
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

scores = -1*cross_val_score(pipeline, X, y['total_cases'], 
                            cv=5, 
                            scoring='neg_mean_absolute_error')
print(scores.mean())

19.44691305371181


In [4]:
pipeline.fit(X, y['total_cases'])
preds = pipeline.predict(X_test)
preds_int = preds.round(0).astype(int)
output = pd.DataFrame({'city': X_test['city'], 
                       'year': X_test['year'],
                       'weekofyear': X_test['weekofyear'], 
                       'total_cases': preds_int})
output.to_csv('submission.csv', index=False)