<a href="https://colab.research.google.com/github/ravijp/WiDS/blob/main/WiDS_Setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[?25l[K     |█████▋                          | 10 kB 24.5 MB/s eta 0:00:01[K     |███████████▏                    | 20 kB 28.1 MB/s eta 0:00:01[K     |████████████████▊               | 30 kB 32.4 MB/s eta 0:00:01[K     |██████████████████████▎         | 40 kB 36.4 MB/s eta 0:00:01[K     |███████████████████████████▉    | 51 kB 40.1 MB/s eta 0:00:01[K     |████████████████████████████████| 58 kB 5.4 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=17afc809188cf0a43a75041f342b776f38eb47a20ca1f31a1d9dd9f22bc3277c
  Stored in directory: /root/.cache/pip/wheels/62/d6/58/5853130f941e75b2177d281eb7e44b4a98ed46dd155f556dc5
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12
    U

# Setup Colab for kaggle

In [2]:
import json
from pathlib import Path
import shutil
import subprocess
import sys
import os
from google.colab import data_table
data_table.enable_dataframe_formatter()
# # data_table.disable_dataframe_formatter()

INPUT_FOLDER = Path("/kaggle/input/widsdatathon2022/")
OUTPUT_FOLDER = Path("/kaggle/output/")
WORK_FOLDER = Path("/kaggle/working/")

def install_package(package_name):
    subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])

def dump_dataset_metadata(user_name, dataset_name, folder_path):
    with open(Path(folder_path) / "dataset-metadata.json", "w") as f:
        json.dump({
            "title": dataset_name,
            "id": f"{user_name}/{dataset_name}",
            "licenses": [{ "name": "CC0-1.0" }]
        }, f, indent=4)

def is_running_in_colab(check_env=True):
    if not check_env:
        return True
    running_in_colab = "google.colab" in str(get_ipython())
    print(f"Running in Colab: {running_in_colab}")
    return running_in_colab

def setup_colab_drive_for_kaggle(check_env=True):
    if not is_running_in_colab(check_env):
        return False

    from google.colab import drive
    drive.mount("/content/drive")

    return True  # Is Colab

def setup_colab_secrets_for_kaggle(check_env=True):
    if not is_running_in_colab(check_env):
        return False

    drive_sources_dir = Path("/content/drive/MyDrive/kaggle")

    # Set up kaggle.json to access Kaggle data.
    if (drive_sources_dir / "kaggle.json").exists():
        print(f'kaggle.json file found at {drive_sources_dir / "kaggle.json"}')
        kaggle_config = Path.home() / ".kaggle"
        if kaggle_config.exists():
            shutil.rmtree(kaggle_config)
        kaggle_config.mkdir()
        (kaggle_config / "kaggle.json").symlink_to(drive_sources_dir / "kaggle.json")
        print(f"Content of Kaggle config dir ({kaggle_config}): {list(map(str, kaggle_config.iterdir()))}")

    if (drive_sources_dir / ".env").exists():
        install_package("python-dotenv")
        from dotenv import load_dotenv, dotenv_values
        load_dotenv(dotenv_path=drive_sources_dir / ".env", verbose=True, override=True)
        nonempty_keys = [key for key, val in dotenv_values(drive_sources_dir / '.env').items() if val]
        print(f"Loaded environment variables from .env file: {nonempty_keys}.")

    return True  # Is Colab

def setup_colab_directories_for_kaggle(check_env=True, local_working=False):
    if not is_running_in_colab(check_env):
        return False

    # Only add "working" directory if it was requested to be mapped in Drive, not in local env.
    target_content_dirs = ["input", "output"] + ([] if local_working else ["working"])

    drive_content_dir = Path("/content/drive/MyDrive/kaggle")
    # Make sure directories are present in Drive
    drive_content_dir.mkdir(exist_ok=True)
    for content_dir in target_content_dirs:
        (drive_content_dir / content_dir).mkdir(exist_ok=True)
    print(f"Content of Drive Kaggle data dir ({drive_content_dir}): {list(map(str, drive_content_dir.iterdir()))}")

    kaggle_dir = Path("/kaggle")
    if kaggle_dir.exists():
        shutil.rmtree(kaggle_dir)
    kaggle_dir.mkdir()

    for content_dir in target_content_dirs:
        (kaggle_dir / content_dir).symlink_to(drive_content_dir / content_dir)

    # It was requested not to map working to Drive, so create it locally.
    if local_working:
        (kaggle_dir / "working").mkdir()

    print(f"Content of Kaggle data dir ({kaggle_dir}): {list(map(str, kaggle_dir.iterdir()))}")
    for content_dir in target_content_dirs + (["working"] if local_working else []):
        print(f"Content of Kaggle data subdir ({kaggle_dir / content_dir}): {list(map(str, (kaggle_dir / content_dir).iterdir()))}")

    return True  # Is Colab

def setup_colab_for_kaggle(check_env=True, local_working=False):
    if not is_running_in_colab(check_env):
        return False

    setup_colab_drive_for_kaggle(check_env=False)
    setup_colab_directories_for_kaggle(check_env=False, local_working=local_working)
    setup_colab_secrets_for_kaggle(check_env=False)

    return True  # Is Colab


In [3]:
import traceback
try:
    from google.colab import drive
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/Colab\ Notebooks
    setup_colab_for_kaggle(check_env=False, local_working=True)
except Exception as e:
    tb = traceback.format_exc()
    print("Not in Colab", tb)

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Content of Drive Kaggle data dir (/content/drive/MyDrive/kaggle): ['/content/drive/MyDrive/kaggle/kaggle.json', '/content/drive/MyDrive/kaggle/.ipynb_checkpoints', '/content/drive/MyDrive/kaggle/input', '/content/drive/MyDrive/kaggle/output']
Content of Kaggle data dir (/kaggle): ['/kaggle/working', '/kaggle/input', '/kaggle/output']
Content of Kaggle data subdir (/kaggle/input): ['/kaggle/input/widsdatathon2022', '/kaggle/input/.ipynb_checkpoints']
Content of Kaggle data subdir (/kaggle/output): ['/kaggle/output/.ipynb_checkpoints']
Content of Kaggle data subdir (/kaggle/working): []
kaggle.json file found at /content/drive/MyDrive/kaggle/kaggle.json
Content of Kaggle config dir (/root/.kaggle): ['/root/.kaggle/kaggle.json']


# Unzip Gdrive


# Download Competition files

Location : /kaggle/input/widsdatathon2022 folder. 

In [4]:
!kaggle competitions files widsdatathon2022

name                 size  creationDate         
-------------------  ----  -------------------  
train.csv            26MB  2021-12-16 19:06:07  
sample_solution.csv  95KB  2021-12-16 19:06:07  
test.csv              3MB  2021-12-16 19:06:07  


In [5]:
!kaggle competitions download widsdatathon2022  -p /kaggle/input/widsdatathon2022/

Downloading widsdatathon2022.zip to /kaggle/input/widsdatathon2022
  0% 0.00/1.82M [00:00<?, ?B/s]
100% 1.82M/1.82M [00:00<00:00, 55.1MB/s]


In [12]:
chd = os.getcwd()
os.chdir(INPUT_FOLDER)
!unzip widsdatathon2022.zip
!rm widsdatathon2022.zip

# !unzip test.csv.zip
# !rm test.csv.zip
# !unzip train.csv.zip
# !rm train.csv.zip 
os.chdir(chd)

Archive:  widsdatathon2022.zip
replace sample_solution.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: sample_solution.csv     
  inflating: test.csv                
  inflating: train.csv               
unzip:  cannot find or open test.csv.zip, test.csv.zip.zip or test.csv.zip.ZIP.
rm: cannot remove 'test.csv.zip': No such file or directory
unzip:  cannot find or open train.csv.zip, train.csv.zip.zip or train.csv.zip.ZIP.
rm: cannot remove 'train.csv.zip': No such file or directory


In [13]:
os.chdir("/content/drive/MyDrive/kaggle")
os.chdir(chd)
print(os.getcwd())

/content/drive/MyDrive/kaggle/input/widsdatathon2022


In [14]:
# !git init
# !ls -a
# !rm -rf .git

# Modelling

Kaggle kernel code

## Imports

In [15]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
import lightgbm as lgb
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
pd.options.display.max_columns=100
pd.options.display.max_rows=100


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/widsdatathon2022/sample_solution.csv
/kaggle/input/widsdatathon2022/test.csv
/kaggle/input/widsdatathon2022/train.csv
/kaggle/input/widsdatathon2022/train_edd.csv
/kaggle/input/widsdatathon2022/test_edd.csv


## EDA

In [16]:
df_train=pd.read_csv('/kaggle/input/widsdatathon2022/train.csv')

df_test=pd.read_csv('/kaggle/input/widsdatathon2022/test.csv')
print(len(df_train), len(df_test))
print("NULL Values:>\n")
print(df_train.isnull().mean())
print(df_train['site_eui'].describe())
print("df_train['Year_Factor'].value_counts():> \n", df_train['Year_Factor'].value_counts())
print("df_test['Year_Factor'].value_counts():> \n", df_test['Year_Factor'].value_counts())
print("df_test.isnull().mean():> \n", df_test.isnull().mean())

75757 9705
NULL Values:>

Year_Factor                  0.000000
State_Factor                 0.000000
building_class               0.000000
facility_type                0.000000
floor_area                   0.000000
year_built                   0.024249
energy_star_rating           0.352561
ELEVATION                    0.000000
january_min_temp             0.000000
january_avg_temp             0.000000
january_max_temp             0.000000
february_min_temp            0.000000
february_avg_temp            0.000000
february_max_temp            0.000000
march_min_temp               0.000000
march_avg_temp               0.000000
march_max_temp               0.000000
april_min_temp               0.000000
april_avg_temp               0.000000
april_max_temp               0.000000
may_min_temp                 0.000000
may_avg_temp                 0.000000
may_max_temp                 0.000000
june_min_temp                0.000000
june_avg_temp                0.000000
june_max_temp           

In [None]:
data_table.DataTable(df_train, max_columns=100, max_rows=6)



Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,january_max_temp,february_min_temp,february_avg_temp,february_max_temp,march_min_temp,march_avg_temp,march_max_temp,april_min_temp,april_avg_temp,april_max_temp,may_min_temp,may_avg_temp,may_max_temp,june_min_temp,june_avg_temp,june_max_temp,july_min_temp,july_avg_temp,july_max_temp,august_min_temp,august_avg_temp,august_max_temp,september_min_temp,september_avg_temp,september_max_temp,october_min_temp,october_avg_temp,october_max_temp,november_min_temp,november_avg_temp,november_max_temp,december_min_temp,december_avg_temp,december_max_temp,cooling_degree_days,heating_degree_days,precipitation_inches,snowfall_inches,snowdepth_inches,avg_temp,days_below_30F,days_below_20F,days_below_10F,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
0,1,State_1,Commercial,Grocery_store_or_food_market,61242.0,1942.0,11.0,2.4,36,50.500000,68,35,50.589286,73,40,53.693548,80,41,55.500000,78,46,56.854839,84,50,60.500000,90,52,62.725806,84,52,62.161290,85,52,64.650000,90,47,63.016129,83,43,53.800000,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,1.0,1.0,,248.682615,0
1,1,State_1,Commercial,Warehouse_Distribution_or_Shipping_center,274000.0,1955.0,45.0,1.8,36,50.500000,68,35,50.589286,73,40,53.693548,80,41,55.500000,78,46,56.854839,84,50,60.500000,90,52,62.725806,84,52,62.161290,85,52,64.650000,90,47,63.016129,83,43,53.800000,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,,1.0,12.0,26.500150,1
2,1,State_1,Commercial,Retail_Enclosed_mall,280025.0,1951.0,97.0,1.8,36,50.500000,68,35,50.589286,73,40,53.693548,80,41,55.500000,78,46,56.854839,84,50,60.500000,90,52,62.725806,84,52,62.161290,85,52,64.650000,90,47,63.016129,83,43,53.800000,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,,1.0,12.0,24.693619,2
3,1,State_1,Commercial,Education_Other_classroom,55325.0,1980.0,46.0,1.8,36,50.500000,68,35,50.589286,73,40,53.693548,80,41,55.500000,78,46,56.854839,84,50,60.500000,90,52,62.725806,84,52,62.161290,85,52,64.650000,90,47,63.016129,83,43,53.800000,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,,1.0,12.0,48.406926,3
4,1,State_1,Commercial,Warehouse_Nonrefrigerated,66000.0,1985.0,100.0,2.4,36,50.500000,68,35,50.589286,73,40,53.693548,80,41,55.500000,78,46,56.854839,84,50,60.500000,90,52,62.725806,84,52,62.161290,85,52,64.650000,90,47,63.016129,83,43,53.800000,72,36,49.274194,71,115,2960,16.59,0.0,0,56.972603,0,0,0,0,14,0,0,0,1.0,1.0,1.0,,3.899395,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75752,6,State_11,Commercial,Office_Uncategorized,20410.0,1995.0,8.0,36.6,28,43.451613,56,34,47.672414,64,35,49.354839,71,40,56.266667,89,45,59.193548,88,47,63.566667,91,54,66.951613,87,52,68.612903,91,44,61.383333,79,39,55.403226,68,40,51.283333,71,24,38.532258,51,260,3772,45.03,1.4,1,55.147541,6,0,0,0,25,3,0,0,,,,,132.918411,75752
75753,6,State_11,Residential,5plus_Unit_Building,40489.0,1910.0,98.0,36.6,28,43.451613,56,34,47.672414,64,35,49.354839,71,40,56.266667,89,45,59.193548,88,47,63.566667,91,54,66.951613,87,52,68.612903,91,44,61.383333,79,39,55.403226,68,40,51.283333,71,24,38.532258,51,260,3772,45.03,1.4,1,55.147541,6,0,0,0,25,3,0,0,,,,,39.483672,75753
75754,6,State_11,Commercial,Commercial_Other,28072.0,1917.0,,36.6,26,36.612903,48,30,41.637931,58,29,41.338710,62,34,50.183333,79,40,52.145161,79,41,56.233333,86,48,58.758065,81,48,61.612903,87,41,53.783333,73,37,47.661290,59,34,44.650000,62,15,30.338710,46,55,6218,106.32,36.6,438,47.911202,26,2,0,0,6,0,0,0,,,,,48.404398,75754
75755,6,State_11,Commercial,Commercial_Other,53575.0,2012.0,,36.6,26,36.612903,48,30,41.637931,58,29,41.338710,62,34,50.183333,79,40,52.145161,79,41,56.233333,86,48,58.758065,81,48,61.612903,87,41,53.783333,73,37,47.661290,59,34,44.650000,62,15,30.338710,46,55,6218,106.32,36.6,438,47.911202,26,2,0,0,6,0,0,0,,,,,592.022750,75755


In [None]:
data_table.DataTable(df_train.describe(), max_columns=100, max_rows=6)



Unnamed: 0,Year_Factor,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,january_max_temp,february_min_temp,february_avg_temp,february_max_temp,march_min_temp,march_avg_temp,march_max_temp,april_min_temp,april_avg_temp,april_max_temp,may_min_temp,may_avg_temp,may_max_temp,june_min_temp,june_avg_temp,june_max_temp,july_min_temp,july_avg_temp,july_max_temp,august_min_temp,august_avg_temp,august_max_temp,september_min_temp,september_avg_temp,september_max_temp,october_min_temp,october_avg_temp,october_max_temp,november_min_temp,november_avg_temp,november_max_temp,december_min_temp,december_avg_temp,december_max_temp,cooling_degree_days,heating_degree_days,precipitation_inches,snowfall_inches,snowdepth_inches,avg_temp,days_below_30F,days_below_20F,days_below_10F,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui,id
count,75757.0,75757.0,73920.0,49048.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,75757.0,34675.0,33946.0,34675.0,29961.0,75757.0,75757.0
mean,4.367755,165983.9,1952.306764,61.048605,39.506323,11.432343,34.310468,59.054952,11.720567,35.526837,58.486278,21.606281,44.469292,70.897831,32.037171,53.784863,82.661932,44.706179,63.742547,88.987856,51.131763,71.066384,91.40028,60.571604,76.596879,95.54395,58.515068,75.348749,92.572581,50.791689,69.597545,92.171139,38.210911,58.727038,81.135671,28.630397,48.123133,71.202397,22.736777,41.625569,64.496852,1202.250446,4324.95739,42.430651,29.136379,164.862455,56.176705,48.756511,17.447932,4.886532,0.876764,82.709809,14.058701,0.279539,0.002442,66.552675,62.779974,4.190601,109.142051,82.584693,37878.0
std,1.471441,246875.8,37.053619,28.663683,60.656596,9.381027,6.996108,5.355458,12.577272,8.866697,8.414611,10.004303,6.657142,7.680982,5.577279,2.500473,4.101028,4.200319,2.792919,2.864669,3.415926,2.861681,3.604068,4.177109,4.090275,4.157118,4.441992,3.818895,4.139685,4.290884,3.603773,5.011708,5.093899,2.6413,5.579513,7.153244,4.094207,4.373374,9.014047,5.869386,6.054104,391.56178,824.366727,10.592205,17.810894,175.275869,2.24194,27.587124,14.469435,7.071221,2.894244,25.282913,10.943996,2.252323,0.14214,131.147834,130.308106,6.458789,50.699751,58.255403,21869.306509
min,1.0,943.0,0.0,0.0,-6.4,-19.0,10.806452,42.0,-13.0,13.25,38.0,-9.0,25.854839,53.0,15.0,40.28,62.0,23.0,46.95,64.0,30.0,54.1,67.0,37.0,54.822581,65.0,31.0,56.693548,66.0,26.0,53.6,64.0,18.0,44.693548,59.0,4.0,30.766667,53.0,-16.0,23.790323,42.0,0.0,398.0,0.0,0.0,0.0,44.512329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,12.0,1.001169,0.0
25%,3.0,62379.0,1927.0,40.0,11.9,6.0,29.827586,56.0,2.0,31.625,55.0,13.0,38.096774,62.0,26.0,52.333333,80.0,42.0,62.790323,88.0,50.0,70.983333,89.0,57.0,76.145161,95.0,57.0,74.516129,90.0,47.0,68.2,91.0,35.0,58.048387,78.0,23.0,45.25,69.0,17.0,38.532258,60.0,1128.0,3978.0,39.35,9.6,10.0,55.147541,22.0,5.0,0.0,0.0,72.0,6.0,0.0,0.0,1.0,1.0,1.0,88.0,54.528601,18939.0
50%,5.0,91367.0,1951.0,67.0,25.0,11.0,34.451613,59.0,9.0,34.107143,61.0,25.0,44.516129,71.0,32.0,53.333333,82.0,45.0,62.887097,89.0,52.0,72.25,90.0,63.0,78.677419,96.0,61.0,75.725806,91.0,52.0,69.733333,92.0,38.0,58.758065,80.0,31.0,47.116667,72.0,24.0,40.516129,65.0,1277.0,4337.0,42.17,31.6,101.0,56.835616,50.0,11.0,2.0,0.0,84.0,12.0,0.0,0.0,1.0,1.0,1.0,104.0,75.293716,37878.0
75%,6.0,166000.0,1977.0,85.0,42.7,13.0,37.322581,62.0,20.0,40.87931,62.0,27.0,49.354839,78.0,37.0,54.766667,87.0,49.0,65.096774,91.0,53.0,72.45,94.0,63.0,78.790323,98.0,61.0,78.967742,95.0,54.0,71.8,96.0,40.0,59.629032,85.0,34.0,51.339286,74.0,28.0,43.435484,71.0,1489.0,4670.0,46.32,49.1,377.0,57.215847,66.0,26.0,7.0,0.0,97.0,17.0,0.0,0.0,1.0,1.0,1.0,131.0,97.277534,56817.0
max,6.0,6385382.0,2015.0,100.0,1924.5,49.0,64.758065,91.0,48.0,65.107143,89.0,52.0,69.758065,95.0,52.0,74.5,104.0,58.0,82.112903,112.0,68.0,89.55,119.0,74.0,94.435484,117.0,77.0,94.903226,116.0,65.0,90.116667,111.0,61.0,80.741935,108.0,52.0,67.416667,96.0,44.0,61.790323,86.0,4948.0,7929.0,107.69,127.3,1292.0,77.245205,170.0,93.0,59.0,31.0,260.0,185.0,119.0,16.0,360.0,360.0,23.3,311.0,997.86612,75756.0


In [None]:
df_train.nunique().sort_values().to_csv('/content/uniq.csv')

In [None]:
df_train['days_with_fog'].plot()

KeyboardInterrupt: ignored

Error in callback <function flush_figures at 0x7f6ee43b8050> (for post_execute):


KeyboardInterrupt: ignored

## Baseline lgb model

In [None]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

In [None]:
def entrena_lgb(data,test,features,categorical,target):

    kfold=GroupKFold(n_splits=6)


    i=1

    r=[]
    
    pred_test=np.zeros(len(test))

    importancias=pd.DataFrame()

    importancias['variable']=features
    
    
    cat_ind=[features.index(x) for x in categorical if x in features]
    
    dict_cat={}
    
    categorical_numerical = data[categorical].dropna().select_dtypes(include=np.number).columns.tolist()
    
    categorical_transform=[x for x in categorical if x not in categorical_numerical]
    
    for l in categorical_transform:
        le = preprocessing.LabelEncoder()
        le.fit(list(data[l].dropna())+list(test[l].dropna()))

        dict_cat[l]=le

        data.loc[~data[l].isnull(),l]=le.transform(data.loc[~data[l].isnull(),l])
        test.loc[~test[l].isnull(),l]=le.transform(test.loc[~test[l].isnull(),l])
        
        

    for train_index,test_index in kfold.split(data,data[target],data['Year_Factor']):

        lgb_data_train = lgb.Dataset(data.loc[train_index,features].values,data.loc[train_index,target].values)
        lgb_data_eval = lgb.Dataset(data.loc[test_index,features].values,data.loc[test_index,target].values, reference=lgb_data_train)

        # params = {
        #     'task': 'train',
        #     'boosting_type': 'gbdt',
        #     'objective': 'regression',
        #     'metric': { 'rmse'},
        #     "max_depth":-1,
        #     "num_leaves":32,
        #     'learning_rate': 0.1,
        # "min_child_samples": 100,
        #     'feature_fraction': 0.9,
        #  "bagging_freq":1,
        #     'bagging_fraction': 0.9,
        #     "lambda_l1":10,
        #     "lambda_l2":10,
        #    # "scale_pos_weight":30,
        #     'min_data_per_group':500,

        #     'verbose': 1    
        # }




        # modelo = lgb.train(params,lgb_data_train,num_boost_round=13100,valid_sets=lgb_data_eval,early_stopping_rounds=50,verbose_eval=25,categorical_feature=cat_ind)
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': { 'rmse'},
            #'num_iterations':5000,
            #'max_bin':5395,
            "max_depth":12,
            "num_leaves":52,
            # 'learning_rate': 0.873791759,
            'learning_rate': 0.5,
        #"min_child_samples": 100,
            #'feature_fraction': 0.9,
         #"bagging_freq":1,
            #'bagging_fraction': 0.9,
            #"lambda_l1":10,
            #"lambda_l2":10,
           # "scale_pos_weight":30,
            #'min_data_per_group':500,

            'verbose': 1    
        }

        modelo = lgb.train(params, lgb_data_train, num_boost_round=95100, valid_sets=lgb_data_eval, early_stopping_rounds=100, verbose_eval=25, categorical_feature=cat_ind)

        importancias['gain_'+str(i)]=modelo.feature_importance(importance_type="gain")


        data.loc[test_index,'estimator']=modelo.predict(data.loc[test_index,features].values, num_iteration=modelo.best_iteration)
        
        pred_test=pred_test+modelo.predict(test[features].values, num_iteration=modelo.best_iteration)

        print ("Fold_"+str(i))
        a= (mean_squared_error(data.loc[test_index,target],data.loc[test_index,'estimator']))**0.5
        r.append(a)
        print (a)
        print ("")

        i=i+1
        
    for l in categorical_transform:

            data.loc[~data[l].isnull(),l]=dict_cat[l].inverse_transform(data.loc[~data[l].isnull(),l].astype(int))
            
            test.loc[~test[l].isnull(),l]=dict_cat[l].inverse_transform(test.loc[~test[l].isnull(),l].astype(int))
            
    importancias["gain_avg"]=importancias[["gain_1","gain_2","gain_3","gain_4","gain_5"]].mean(axis=1)
    importancias=importancias.sort_values("gain_avg",ascending=False).reset_index(drop=True)
    
    pred_test=(pred_test/6)
    
    
    oof=(mean_squared_error(data[target],data['estimator']))**0.5
    
    print (oof)
    print ("mean: "+str(np.mean(np.array(r))))
    print ("std: "+str(np.std(np.array(r))))
    
    dict_resultados={}
    
    dict_resultados['importancias']=importancias
    
    dict_resultados['predicciones']=pred_test
    
    
    
    return dict_resultados

In [None]:
no_usar=['site_eui','id']

target='site_eui'

categorical=['Year_Factor','State_Factor','building_class','facility_type']

features=[x for x in df_train.columns if x not in no_usar]



### train 1

In [None]:
dict_resultados=entrena_lgb(data=df_train,test=df_test,features=features,categorical=categorical,target=target)

In [None]:

dict_resultados['importancias']

In [None]:
dict_resultados['importancias']['variable'].tolist()

In [None]:
temp=dict_resultados['importancias']

features_selected=temp['variable'].tolist()[0:4]



### train 2

In [None]:
dict_resultados_2=entrena_lgb(data=df_train,test=df_test,features=features_selected,categorical=categorical,target=target)

In [None]:
dict_resultados_2['importancias']

In [None]:
df_train.groupby([ 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION']).size()

In [None]:
df_train.groupby([ 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION']).size().value_counts()

In [None]:
df_train.groupby([ 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built',  'ELEVATION']).size().value_counts()

In [None]:

features_selected_2=list(features_selected)

features_selected_2.extend(['State_Factor'])

### train 3

In [None]:
dict_resultados_3=entrena_lgb(data=df_train,test=df_test,features=features_selected_2,categorical=categorical,target=target)

In [None]:
features_selected_2

In [None]:


variables=['facility_type',
 'energy_star_rating',
 'year_built',
# 'floor_area',
 'State_Factor']

df_train['combination_variables']=df_train[variables].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

df_test['combination_variables']=df_test[variables].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)

In [None]:
features_selected_3=list(features_selected_2)

features_selected_3.extend(['combination_variables'])

categorical.extend(['combination_variables'])

### train 4

In [None]:
dict_resultados_4=entrena_lgb(data=df_train,test=df_test,features=features_selected_3,categorical=categorical,target=target)

In [None]:
dict_resultados_4['importancias']

In [None]:
# df_test['site_eui']=(dict_resultados_2['predicciones'].copy()+dict_resultados_3['predicciones'].copy())/2
df_test['site_eui']=(dict_resultados_4['predicciones'].copy()+ dict_resultados_2['predicciones'].copy()+dict_resultados_3['predicciones'].copy())/3

In [None]:
df_test[['id','site_eui']]

## Submission file

In [None]:
df_test[['id','site_eui']].to_csv(f'{WORK_FOLDER}/submission.csv',index=False)

# Submit to Kaggle

In [None]:
# !kaggle competitions submit \
#      widsdatathon2022 \
#     -f {WORK_FOLDER}/submission.csv \
#     -m "First Colab Baseline Attempt"
!kaggle competitions submit \
     widsdatathon2022 \
    -f {WORK_FOLDER}/submission.csv \
    -m "Playing with Existing LGB Solution v1"

100% 230k/230k [00:06<00:00, 37.1kB/s]
Successfully submitted to WiDS Datathon 2022

## Check Score

In [None]:
!kaggle competitions submissions widsdatathon2022

fileName        date                 description                            status    publicScore  privateScore  
--------------  -------------------  -------------------------------------  --------  -----------  ------------  
submission.csv  2022-02-18 12:33:58  Playing with Existing LGB Solution v1  complete  25.236       None          
submission.csv  2022-02-17 15:54:31  First Colab Baseline Attempt           complete  29.001       None          
