# **Installation**

In [11]:
import time
# Wall-clock start of the notebook; the final cell subtracts this from the
# current time to report the total run time.
start_time = time.time()

In [12]:
try:
    import kagglehub
except ImportError:
    !pip install -q kaggle


In [13]:
try:
    import cudf
    import cuml
except ImportError:
    !pip install \
        --extra-index-url=https://pypi.nvidia.com \
        cudf-cu12==24.10.* cuml-cu12==24.10.*


In [14]:
try:
    import category_encoders
except ImportError:
    !pip install category_encoders


# **Imports**

In [15]:
%load_ext cudf.pandas
import pandas as pd
from category_encoders import TargetEncoder

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Show every column when a DataFrame is displayed (None = no column limit).
# Uses the documented top-level pd.set_option rather than going through the
# incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


# Load Kaggle Dataset (downloaded with the Kaggle CLI)

In [16]:
import os
import zipfile

# Define the file and download paths
file_path = '/content/odsc_2024_nvidia_hackathon/train.csv'
download_path = '/content/odsc_2024_nvidia_hackathon'
odsc_2024_nvidia_hackathon_path = download_path

# Execute only if train.csv is not present
if not os.path.exists(file_path):
    # Hardcode Kaggle credentials
    os.environ['KAGGLE_USERNAME'] = 'sushantshelar13'
    os.environ['KAGGLE_KEY'] = 'fcb90c109b271a8001a1e4982bb53dc9'

    # Ensure download directory exists
    os.makedirs(download_path, exist_ok=True)

    # Download the dataset
    # try:
    #     import kagglehub
    # except ImportError:
    !kaggle competitions download -c odsc-2024-nvidia-hackathon -p {download_path}

    # Find and extract the downloaded zip file
    for file in os.listdir(download_path):
        if file.endswith('.zip'):
            with zipfile.ZipFile(os.path.join(download_path, file), 'r') as zip_ref:
                zip_ref.extractall(download_path)

    # Store the path in a variable
    odsc_2024_nvidia_hackathon_path = download_path
    # print('Data source import complete and files extracted.')


# **Load Data** - Uses pandas accelerated on the GPU through the `cudf.pandas` extension

In [17]:
# Columns kept from train.csv: the selected feature columns followed by the
# target column 'y' (split off into train_y in the next cell).
train_columns = [
    'trickortreat', 'kingofhalloween', 'mumming', 'noon', 'costumes',
    'celebrate', 'confectionary', 'predestinated', 'leprechaun', 'headless',
    'jacksantino', 'mystical', 'folklore', 'tarantula', 'paranormal',
    'northernlights', 'quirky', 'ectoplasm', 'jester', 'nightmare',
    'isolation', 'cobwebs', 'sweets', 'duskhour', 'bedford', 'propitiation',
    'ritesofpassage', 'kiddies', 'ghost', 'batman', 'applebobbing',
    'magical', 'h2o', 'xray', 'cackle', 'mummy', 'easter', 'fest',
    'washday', 'wardrobe', 'moonlight', 'theme', 'moon', 'week', 'parades',
    'imprisonment', 'worktime', 'haunt', 'frankenstein', 'october',
    'scoobydoo', 'rip', 'somerset', 'extravaganza', 'fortunetelling',
    'dracula', 'divination', 'kidnap', 'funeral', 'ween', 'nonconformism',
    'mischievous', 'mondays', 'batwings', 'fear', 'daysail', 'krakatoa',
    'egg', 'tshirt', 'night', 'apotropaicmagic', 'periodoftime', 'almanac',
    'dayspring', 'fortnight', 'dungeon', 'flagday', 'leapweek', 'ancestors',
    'lurking', 'sympatheticmagic', 'toys', 'lunacy', 'bewitching',
    'enchanting', 'casket', 'centiday', 'fire', 'interday', 'fogmachine',
    'holi', 'churchyard', 'monsterhunter', 'tabulatable', 'vampire', 'hallo', 'y',
]

# Read the full training file, then keep only the columns above (in that order).
train_csv_path = f"{odsc_2024_nvidia_hackathon_path}/train.csv"
train_x = pd.read_csv(train_csv_path)[train_columns]

# **`Data Analysis`**

# **Feature Engineering**

In [18]:
# Split the target off the features: 'y' becomes train_y and is then removed
# from train_x so that train_x holds feature columns only.
target_column = 'y'
train_y = train_x[target_column]
train_x = train_x.drop(target_column, axis=1)

# **Filling missing numerical values with the mean, and frequency-encoding categorical columns (missing categories get the highest frequency)**

In [19]:
# Identify numerical columns (non-object types)
numerical_data_train = train_x.select_dtypes(include=['number']).columns.tolist()

# Identify categorical columns (object types)
categorical_data_train = train_x.select_dtypes(include=['object']).columns.tolist()


def mean_value(col):
    """Return the mean of column `col` of the training features `train_x`."""
    return train_x[col].mean()


# fill_dict stores what is learned from the training data so the same values
# can be re-applied to the test set later:
#   numerical column   -> its mean (scalar)
#   categorical column -> its value_counts() Series (category -> frequency)
fill_dict = {}

# Every column in numerical_data_train was selected as a 'number' dtype, so no
# further numeric check is needed before taking the mean.
for col in numerical_data_train:
    fill_dict[col] = mean_value(col)

for col in categorical_data_train:
    fill_dict[col] = train_x[col].value_counts()

# Apply the learned values to the training frame.
for col, fill_value in fill_dict.items():
    if col in numerical_data_train:
        # Assign the result back to the frame. `train_x[col].fillna(...,
        # inplace=True)` operates on a temporary Series (chained assignment)
        # and is not guaranteed to update train_x -- under pandas
        # Copy-on-Write it never does.
        train_x[col] = train_x[col].fillna(fill_value)
    elif col in categorical_data_train:
        # Replace each category with its training frequency; values that do
        # not map (missing entries) receive the highest frequency.
        train_x[col] = train_x[col].map(fill_value).fillna(fill_value.max())

# **Build Model**

(n_estimators = 400, max_depth = 5) - chosen after hyperparameter tuning

In [None]:
# XGBoost regressor set up to train on the GPU: histogram tree method on the
# CUDA device, with a fixed seed.
xgb_params = dict(
    n_estimators=400,
    max_depth=5,
    random_state=0,
    tree_method="hist",
    device="cuda",
)
Xgb = XGBRegressor(**xgb_params)
Xgb.fit(train_x, train_y)


# **Prediction on train data**

In [None]:
# In-sample predictions: the model scores the same rows it was fitted on.
pred_train =  Xgb.predict(train_x)

In [None]:

# Root mean squared error of the in-sample predictions against the training
# target (a training-set metric, not a held-out score).
train_mse = mean_squared_error(train_y.to_numpy(), pred_train)
rmse = np.sqrt(train_mse)
print("Train Root Mean Squared Error:", rmse)


# Testing

In [None]:
# Full test file; kept around because the submission cell needs its 'id' column.
test = pd.read_csv(f"{odsc_2024_nvidia_hackathon_path}/test.csv")

# Feature columns fed to the model, in the same order as the training features.
# An explicit .copy() makes test_x an independent frame, so the column
# assignments in the imputation cell neither touch `test` nor trigger
# SettingWithCopyWarning.
test_x = test[['trickortreat', 'kingofhalloween', 'mumming', 'noon', 'costumes',
       'celebrate', 'confectionary', 'predestinated', 'leprechaun', 'headless',
       'jacksantino', 'mystical', 'folklore', 'tarantula', 'paranormal',
       'northernlights', 'quirky', 'ectoplasm', 'jester', 'nightmare',
       'isolation', 'cobwebs', 'sweets', 'duskhour', 'bedford', 'propitiation',
       'ritesofpassage', 'kiddies', 'ghost', 'batman', 'applebobbing',
       'magical', 'h2o', 'xray', 'cackle', 'mummy', 'easter', 'fest',
       'washday', 'wardrobe', 'moonlight', 'theme', 'moon', 'week', 'parades',
       'imprisonment', 'worktime', 'haunt', 'frankenstein', 'october',
       'scoobydoo', 'rip', 'somerset', 'extravaganza', 'fortunetelling',
       'dracula', 'divination', 'kidnap', 'funeral', 'ween', 'nonconformism',
       'mischievous', 'mondays', 'batwings', 'fear', 'daysail', 'krakatoa',
       'egg', 'tshirt', 'night', 'apotropaicmagic', 'periodoftime', 'almanac',
       'dayspring', 'fortnight', 'dungeon', 'flagday', 'leapweek', 'ancestors',
       'lurking', 'sympatheticmagic', 'toys', 'lunacy', 'bewitching',
       'enchanting', 'casket', 'centiday', 'fire', 'interday', 'fogmachine',
       'holi', 'churchyard', 'monsterhunter', 'tabulatable', 'vampire',
       'hallo']].copy()




# numerical_data_train = testx.select_dtypes(include=['number']).columns.tolist()

In [None]:
# Apply the values learned from the training data (fill_dict) to the test
# features: training means for numerical columns, training frequencies for
# categorical columns.
for col, fill_value in fill_dict.items():
    if col in numerical_data_train:
        # Assign the result back to the frame. `test_x[col].fillna(...,
        # inplace=True)` operates on a temporary Series (chained assignment)
        # and is not guaranteed to update test_x -- under pandas
        # Copy-on-Write it never does.
        test_x[col] = test_x[col].fillna(fill_value)
    elif col in categorical_data_train:
        # Replace each category with its training frequency; values that do
        # not map (missing entries or categories absent from the training
        # data) receive the highest training frequency.
        test_x[col] = test_x[col].map(fill_value).fillna(fill_value.max())

In [None]:
# Predictions of the fitted model for the test features; written to the
# submission file in the next section.
y_test_pred= Xgb.predict(test_x)

# **Create Submission file**

In [None]:
# Build the submission frame: integer ids from the test file next to the
# model's predictions. astype is a vectorised cast, unlike apply(int), which
# calls Python's int() once per row.
submission = pd.DataFrame({'id': test['id'].astype('int64'), 'y': y_test_pred})
# Save the submission as CSV without the index column.
submission.to_csv('sample_submission.csv', index=False)

In [None]:
# Report how long the notebook took, measured from start_time set in the first cell.
end_time = time.time()
total_time_taken = end_time - start_time
print(f"Total time taken: {total_time_taken:.2f} seconds")