<a href="https://colab.research.google.com/github/stefisha/JobFairChallenge2022/blob/main/NordeusChallenge2022.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np
import os
import joblib

In [27]:
# Load the dataset
file_path = '/content/drive/MyDrive/JobFairChallenge2022/2. job_fair_retention_prediction_2022_training.csv'
data = pd.read_csv(file_path)

In [28]:
data.head()

Unnamed: 0,date,registration_type,played_t11_before,registration_channel,network_type,device_tier,device_type,device_manufacturer,screen_dpi,device_memory_size_mb,device_model,os_version,registrations,returned
0,2022-06-02,3,0,1,1,3,1,2,320.0,2820.0,Xiaomi Redmi 7,Android OS 10 / API-29,1,1
1,2022-06-02,1,2,1,1,4,1,0,480.0,2740.0,OMIX OMIX X300,Android OS 11 / API-30,1,0
2,2022-06-02,3,0,1,1,3,1,1,315.0,3571.0,samsung SM-M215F,Android OS 11 / API-30,1,0
3,2022-06-02,1,2,2,1,2,1,10,320.0,3774.0,TECNO MOBILE LIMITED TECNO KC3,Android OS 9 / API-28,1,1
4,2022-06-02,1,2,2,1,1,1,1,320.0,889.0,samsung SM-A013G,Android OS 10 / API-29,1,0


In [29]:
# Check for missing values in the dataset
missing_values = data.isnull().sum()

In [30]:
missing_values

date                        0
registration_type           0
played_t11_before           0
registration_channel        0
network_type                0
device_tier                 0
device_type                 0
device_manufacturer         0
screen_dpi               1890
device_memory_size_mb    1890
device_model             1890
os_version               1890
registrations               0
returned                    0
dtype: int64

In [31]:
# Impute missing values
# For numerical columns
for col in ['screen_dpi', 'device_memory_size_mb']:
    median_value = data[col].median()
    data[col].fillna(median_value, inplace=True)

In [32]:
# For categorical columns
for col in ['device_model', 'os_version']:
    data[col].fillna('Unknown', inplace=True)

In [33]:
# Convert 'date' column to datetime
data['date'] = pd.to_datetime(data['date'])

In [34]:
# Advanced Feature Engineering
data['day_of_week'] = data['date'].dt.dayofweek
data['is_weekend'] = data['day_of_week'].isin([5, 6]).astype(int)
data['month'] = data['date'].dt.month
data['week_of_year'] = data['date'].dt.isocalendar().week

In [35]:
# Preparing the features and target variable
X = data.drop(['date', 'returned'], axis=1)
y = data['returned']

In [36]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [37]:
# Identifying categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [38]:
# Creating a ColumnTransformer for transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

In [39]:
# Apply transformations to the training set
X_train_transformed = preprocessor.fit_transform(X_train)

In [40]:
# Apply the same transformations to the validation set
X_val_transformed = preprocessor.transform(X_val)

In [41]:
X_train_transformed_dense = X_train_transformed.toarray()

In [42]:
# Check for NaN values in the transformed training set
if np.isnan(X_train_transformed_dense).any():
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    X_train_transformed = imputer.fit_transform(X_train_transformed_dense)
else:
    X_train_transformed = X_train_transformed_dense

In [43]:
# Apply the same transformations and imputation (if necessary) to the validation set
X_val_transformed_dense = X_val_transformed.toarray()
if np.isnan(X_val_transformed_dense).any():
    X_val_transformed = imputer.transform(X_val_transformed_dense)
else:
    X_val_transformed = X_val_transformed_dense

In [44]:
# # Hyperparameter Tuning for Random Forest with a smaller grid
# param_grid = {
#     'n_estimators': [100, 150],  # Reduced number of estimators
#     'max_depth': [10, 15],       # Shallower trees
#     'min_samples_split': [2, 4],
#     'min_samples_leaf': [1, 2]
# }

In [45]:
# # Set joblib to use a specific directory for temporary files
# joblib_temp_folder = "/path/to/large/disk/space"  # Set this to a path with more space
# os.environ['JOBLIB_TEMP_FOLDER'] = joblib_temp_folder

In [46]:
# # Proceed with the grid search
# rf_regressor = RandomForestRegressor(random_state=42)
# grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
# grid_search.fit(X_train_transformed, y_train)

In [47]:
# # Best parameters
# best_params = grid_search.best_params_
# print("Best parameters:", best_params)

In [48]:
# # Train the final model with best parameters
# final_rf_regressor = RandomForestRegressor(**best_params, random_state=42)
# final_rf_regressor.fit(X_train_transformed, y_train)


In [49]:
rf_regressor = RandomForestRegressor(
    n_estimators=5,       # Reduced number of trees SET BACK TO 50
    max_depth=10,          # Limiting the depth of trees
    min_samples_leaf=4,    # More samples per leaf
    n_jobs=-1,             # Use all available cores
    random_state=42,
    verbose=2              # More detailed updates
)
rf_regressor.fit(X_train_transformed, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


building tree 1 of 5building tree 2 of 5

building tree 3 of 5
building tree 4 of 5
building tree 5 of 5


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.4min finished


In [50]:
# Predict and evaluate
y_val_pred = rf_regressor.predict(X_val_transformed)
rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 0.5215973363629842


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.0s finished


In [51]:
# Filter for the prediction period (dates after 2022-08-31)
prediction_period = data[data['date'] >= '2022-08-31']

In [52]:
# Transform features for prediction
X_prediction_period = prediction_period.drop(['date', 'returned'], axis=1)
X_prediction_transformed = preprocessor.transform(X_prediction_period)


In [53]:
# If imputer was used
if np.isnan(X_prediction_transformed.toarray()).any():
    X_prediction_transformed = imputer.transform(X_prediction_transformed.toarray())


In [55]:
# Predict 'returned' values
predicted_returned = rf_regressor.predict(X_prediction_transformed)


[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed:    0.1s finished


In [56]:
# Calculate Retention Day 1 percentages
predicted_retention_d1 = (predicted_returned / prediction_period['registrations']) * 100


In [57]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'date': prediction_period['date'],
    'retention_d1': np.round(predicted_retention_d1, 4)
})

In [60]:
# Save to CSV
submission_file_path = '/content/drive/MyDrive/JobFairChallenge2022/retention_d1_predictions.csv'  # Adjust the path as necessary
submission_df.to_csv(submission_file_path, index=False, date_format='%Y-%m-%d', float_format='%.4f')


In [61]:
print(f"Submission saved to {submission_file_path}")

Submission saved to /content/drive/MyDrive/JobFairChallenge2022/retention_d1_predictions.csv


In [1]:
# from xgboost import XGBRegressor

# xgb_regressor = XGBRegressor(
#     n_estimators=100,
#     max_depth=6,  # Similar to RF, but often shallower works well
#     learning_rate=0.1,  # Default learning rate
#     verbosity=1,  # Verbose output
#     random_state=42,
#     n_jobs=-1  # Use all available cores
# )

# xgb_regressor.fit(X_train_transformed, y_train, eval_metric='rmse', verbose=True)