In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the three competition CSVs from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Parse 'date' into real timestamps, then derive calendar features from it.
# Both frames get identical treatment so their feature columns stay in sync.
for frame in (train, test):
    frame['date'] = pd.to_datetime(frame['date'])
    timestamps = frame['date'].dt
    frame['year'] = timestamps.year
    frame['month'] = timestamps.month
    frame['day'] = timestamps.day

In [None]:
# Dimensions of the training frame as (rows, columns), including the derived date features.
train.shape

(230130, 9)

In [None]:
# Preview the first 10 rows to sanity-check the parsed date and the year/month/day columns.
# Note that the target (num_sold) can be missing on some rows.
train.head(10)

Unnamed: 0,id,date,country,store,product,num_sold,year,month,day
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,,2010,1,1
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0,2010,1,1
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0,2010,1,1
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0,2010,1,1
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0,2010,1,1
5,5,2010-01-01,Canada,Stickers for Less,Holographic Goose,300.0,2010,1,1
6,6,2010-01-01,Canada,Stickers for Less,Kaggle,1837.0,2010,1,1
7,7,2010-01-01,Canada,Stickers for Less,Kaggle Tiers,1659.0,2010,1,1
8,8,2010-01-01,Canada,Stickers for Less,Kerneler,807.0,2010,1,1
9,9,2010-01-01,Canada,Stickers for Less,Kerneler Dark Mode,940.0,2010,1,1


In [None]:
# Separate the prediction target from the feature columns.
# The raw 'date' column is left out; its year/month/day parts remain as features.
target_column = 'num_sold'
y = train[target_column]
X = train.drop(['date', target_column], axis=1)

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Report, split by split, whether any missing values are present.
# Feature matrices are 2-D, so they need a second .any() to collapse to one flag.
for message, features in (("X_train has NaN:", X_train), ("X_val has NaN:", X_val)):
    print(message, features.isna().any().any())

# Targets are 1-D, so a single .any() is enough.
for message, target in (("y_train has NaN:", y_train), ("y_val has NaN:", y_val)):
    print(message, target.isna().any())

X_train has NaN: False
X_val has NaN: False
y_train has NaN: True
y_val has NaN: True


In [None]:
# Rows with a missing target can neither be learned from nor scored.
# Zero-filling them (np.nan_to_num) would train the model on fabricated "0 sold"
# rows and make MAPE divide by (near-)zero targets on the validation split, so
# those rows are dropped instead. Features and targets are filtered with the
# same mask so they stay aligned row for row.
train_has_target = y_train.notna()
val_has_target = y_val.notna()

# .copy() gives independent frames, so later column assignments do not write
# into a slice of X.
X_train = X_train.loc[train_has_target].copy()
y_train = y_train.loc[train_has_target]
X_val = X_val.loc[val_has_target].copy()
y_val = y_val.loc[val_has_target]

# The feature matrices contain no NaN (checked in the previous cell), so they
# need no imputation and remain labelled DataFrames.
assert not y_train.isna().any() and not y_val.isna().any()
assert len(X_train) == len(y_train) and len(X_val) == len(y_val)

In [None]:
# Label-encode the categorical columns of train and test in place.
categorical_cols = ['country', 'store', 'product']

# One encoder per column, fitted on the training values only; kept in a dict so
# later cells can reuse exactly the same label -> integer mapping.
label_encoders = {name: LabelEncoder().fit(train[name]) for name in categorical_cols}

for name, encoder in label_encoders.items():
    train[name] = encoder.transform(train[name])
    # transform() raises if test contains a label never seen in train.
    test[name] = encoder.transform(test[name])

In [None]:
# Apply the already-fitted label encoders to the train/validation feature matrices.
categorical_cols = ['country', 'store', 'product']

# Guarantee X_train is a DataFrame labelled with X's feature names (it may reach
# this cell as a plain array), so the categorical columns can be addressed by name.
X_train = pd.DataFrame(X_train, columns=X.columns)

for col in categorical_cols:
    le = label_encoders[col]
    # Reusing one encoder for both splits keeps the integer codes consistent.
    for split in (X_train, X_val):
        split[col] = le.transform(split[col])

In [None]:
# Base estimator for the grid search; random_state is fixed so runs are repeatable.
rf = RandomForestRegressor(random_state=42)


In [None]:
# Hyperparameter grid for the random forest: 2 x 2 x 1 x 1 = 4 candidate settings.
# Only the tree count and depth vary; the split/leaf minimums are pinned to one value.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20],
    min_samples_split=[2],
    min_samples_leaf=[1],
)


In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation. Candidates are
# ranked by mean absolute error, negated because scikit-learn maximises scores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
)


In [None]:
# Run the search: fits every combination in param_grid under the cross-validation configured above.
grid_search.fit(X_train, y_train)


In [None]:


# Keep the estimator that scored best in the grid search; all later predictions use it.
best_rf = grid_search.best_estimator_


In [None]:
# Build the feature matrix for the test set.
# 'date' is dropped because the model was trained without it. 'num_sold' is also
# dropped if present, so re-running this cell after predictions have been written
# back onto `test` still yields exactly the training feature set.
X_test = test.drop(columns=['date', 'num_sold'], errors='ignore')

# Do NOT label-encode again here. `test` was already encoded in place when the
# encoders were fitted, so these columns hold integer codes. Passing them through
# `s in le.classes_` would compare integers against the original string labels,
# never match, and silently turn every category into -1.
categorical_cols = ['country', 'store', 'product']
assert all(pd.api.types.is_numeric_dtype(X_test[col]) for col in categorical_cols), \
    "test categorical columns are not encoded yet; run the label-encoding cell first"

# Predict with the tuned model.
y_pred = best_rf.predict(X_test)

In [None]:

# NOTE(review): this repeats the prediction made in the previous cell with the same
# model and X_test; y_pred is simply recomputed here.
y_pred = best_rf.predict(X_test)

In [None]:
# Score the tuned model on the held-out validation split.
y_pred_val = best_rf.predict(X_val)

r2 = r2_score(y_true=y_val, y_pred=y_pred_val)
# MAPE is relative to |y_true|, so targets at or near zero inflate it heavily.
mape = mean_absolute_percentage_error(y_true=y_val, y_pred=y_pred_val)

In [None]:
# Show both validation metrics, one per line.
for label, score in (
    ("Mean Absolute Percentage Error (MAPE):", mape),
    ("R² Score:", r2),
):
    print(label, score)

Mean Absolute Percentage Error (MAPE): 2216613486477160.0
R² Score: 0.9772574231543224


In [None]:
# Rebuild the test feature matrix: every column except the raw 'date' and any
# 'num_sold' predictions left on `test` by an earlier run of this cell.
# 'id' stays in, since it is part of the feature set the model was fitted on.
X_test = test.drop(["date", "num_sold"], axis=1, errors="ignore")

# Attach the model's predictions to the test frame.
test["num_sold"] = best_rf.predict(X_test)

# Write only the two columns the submission format expects, without the index.
submission_df = test.loc[:, ["id", "num_sold"]]
submission_df.to_csv("submission.csv", index=False)

print("Submission file 'submission.csv' has been successfully created!")


Submission file 'submission.csv' has been successfully created!


In [None]:
# Preview the first 10 rows of the sample submission to confirm the expected
# output format (an 'id' column and a 'num_sold' column).

sample_submission.head(10)

Unnamed: 0,id,num_sold
0,230130,100
1,230131,100
2,230132,100
3,230133,100
4,230134,100
5,230135,100
6,230136,100
7,230137,100
8,230138,100
9,230139,100
