# Purpose

To make a really bad (or straight out of the box) model. Quickly identify unsignificant features and have a benchmark to improve

In [323]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [324]:
df = pd.read_csv("../data/clean/data.csv")

In [325]:
NUM_FEATURES = ["year_on_production", "footage_lateral_length", "proppant_volume", "total_number_of_stages", "azimuth", "isip", "porosity", "proppant_fluid_ratio", "pump_rate", "tvd_ft"]

CAT_FEATURES = ["treatment_company", "operator"]

In [326]:
median_imputer = SimpleImputer(
    strategy="mean"
)

In [327]:
df_num_features = pd.DataFrame(median_imputer.fit_transform(df[NUM_FEATURES]), columns=NUM_FEATURES)

In [328]:
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="infrequent_if_exist",
    min_frequency=20
)

In [329]:
one_hot_encoded = one_hot_encoder.fit_transform(df[CAT_FEATURES])

# Extract column names for one-hot encoded features
column_names = one_hot_encoder.get_feature_names_out(CAT_FEATURES)

# Create DataFrame with one-hot encoded features and column names
df_one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=column_names)

In [330]:
df_one_hot_encoded

Unnamed: 0,treatment_company_treatment_company_1,treatment_company_treatment_company_10,treatment_company_treatment_company_11,treatment_company_treatment_company_12,treatment_company_treatment_company_16,treatment_company_treatment_company_2,treatment_company_treatment_company_3,treatment_company_treatment_company_4,treatment_company_treatment_company_5,treatment_company_treatment_company_7,...,operator_operator_20,operator_operator_25,operator_operator_26,operator_operator_4,operator_operator_5,operator_operator_6,operator_operator_7,operator_operator_8,operator_operator_9,operator_infrequent_sklearn
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [331]:
df_production = df["production"]

In [332]:
# df_train = pd.concat([df_num_features, df_one_hot_encoded, df_production], axis=1)
df_train = pd.concat([df_num_features, df_production], axis=1)

In [333]:
df_train.head()

Unnamed: 0,year_on_production,footage_lateral_length,proppant_volume,total_number_of_stages,azimuth,isip,porosity,proppant_fluid_ratio,pump_rate,tvd_ft,production
0,2018.0,11966.0,21568792.0,56.0,-32.279999,4149.0,0.02,1.23,83.0,6443.0,5614.947951
1,2014.0,6890.0,9841307.0,33.0,-19.799999,5776.0,0.17,1.47,102.0,7602.0,2188.836707
2,2018.0,8793.0,17116240.0,62.0,-26.879999,4628.0,0.02,1.67,88.0,5907.0,1450.033022
3,2012.0,4234.0,3749559.0,11.0,-49.099998,4582.0,0.03,0.77,100.0,6538.0,1060.764407
4,2012.0,2972.0,6690705.0,9.0,5.56,4909.0,0.02,1.32,94.0,7024.0,607.530385


In [334]:
X = df_train.drop(["production"], axis=1)
y = df["production"]

In [335]:
df_train.shape

(1000, 11)

In [336]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=200, random_state=42)

In [337]:
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

In [338]:
hyperparameters = {
    "n_estimators": [50, 100, 200, 500],
    "criterion": ["squared_error", "absolute_error", "friedman_mse"],
    "max_depth": [None, 10, 20, 30],
    "max_features": ["sqrt", "log2"]
}

In [339]:
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=hyperparameters,
    n_jobs=-1,
    scoring="neg_mean_absolute_error"
    
)

In [340]:
rf_cv.fit(X_train, y_train)

In [None]:
pd.DataFrame(rf_cv.cv_results_).to_csv("results.csv", index=False)

In [None]:
rf_cv.best_params_

{'criterion': 'friedman_mse',
 'max_depth': 20,
 'max_features': 'log2',
 'n_estimators': 500}

In [None]:
rf_cv.best_score_

-560.7902126500434

In [None]:
winner = rf_cv.best_estimator_

In [None]:
feature_importances = winner.feature_importances_.round(2)

In [None]:
features = X_train.columns

In [None]:
df_features = pd.DataFrame(zip(features, feature_importances), columns=["feature", "Importance"]).sort_values(by="Importance", ascending=False)
df_features[df_features["Importance"]>0]

Unnamed: 0,feature,Importance
2,proppant_volume,0.15
3,total_number_of_stages,0.12
1,footage_lateral_length,0.1
9,tvd_ft,0.09
25,operator_operator_14,0.08
0,year_on_production,0.07
5,isip,0.06
8,pump_rate,0.05
7,proppant_fluid_ratio,0.05
4,azimuth,0.05


In [None]:
df_features

Unnamed: 0,feature,Importance
2,proppant_volume,0.15
3,total_number_of_stages,0.12
1,footage_lateral_length,0.1
9,tvd_ft,0.09
25,operator_operator_14,0.08
0,year_on_production,0.07
5,isip,0.06
8,pump_rate,0.05
7,proppant_fluid_ratio,0.05
4,azimuth,0.05
