# Purpose

To make a really bad (or straight out of the box) model. Quickly identify unsignificant features and have a benchmark to improve

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("../data/clean/data.csv")

In [3]:
df.columns

Index(['treatment_company', 'azimuth', 'md_ft', 'tvd_ft', 'date_on_production',
       'operator', 'footage_lateral_length', 'well_spacing',
       'porpoise_deviation', 'porpoise_count', 'shale_footage',
       'acoustic_impedance', 'log_permeability', 'porosity', 'poisson_ratio',
       'water_saturation', 'toc', 'vcl', 'p_velocity', 's_velocity',
       'youngs_modulus', 'isip', 'breakdown_pressure', 'pump_rate',
       'total_number_of_stages', 'proppant_volume', 'proppant_fluid_ratio',
       'production', 'difference_in_days', 'difference_in_weeks',
       'difference_in_years', 'log_production'],
      dtype='object')

In [4]:
df.head()

Unnamed: 0,treatment_company,azimuth,md_ft,tvd_ft,date_on_production,operator,footage_lateral_length,well_spacing,porpoise_deviation,porpoise_count,...,breakdown_pressure,pump_rate,total_number_of_stages,proppant_volume,proppant_fluid_ratio,production,difference_in_days,difference_in_weeks,difference_in_years,log_production
0,treatment_company_1,-32.279999,19148,6443.0,2018-03-01,operator_1,11966.0,4368.4629,6.33,12,...,,83,56,21568792.0,1.23,5614.947951,2178,311.142857,5.967123,8.633188
1,treatment_company_2,-19.799999,15150,7602.0,2014-07-01,operator_2,6890.0,4714.9922,1.28,4,...,,102,33,9841307.0,1.47,2188.836707,3517,502.428571,9.635616,7.691125
2,treatment_company_3,-26.879999,14950,5907.0,2018-08-01,operator_1,8793.0,798.92096,2.03,6,...,,88,62,17116240.0,1.67,1450.033022,2025,289.285714,5.547945,7.279342
3,treatment_company_4,-49.099998,11098,6538.0,2012-01-01,operator_1,4234.0,,6.0,23,...,,100,11,3749559.0,0.77,1060.764407,4429,632.714286,12.134247,6.966745
4,treatment_company_5,5.56,10549,7024.0,2012-01-01,operator_3,2972.0,2967.563,11.87,9,...,,94,9,6690705.0,1.32,607.530385,4429,632.714286,12.134247,6.409402


In [5]:
NUM_FEATURES = [
    'azimuth', 'md_ft', 'tvd_ft', 'footage_lateral_length', 'well_spacing',
    'porpoise_deviation', 'porpoise_count', 'shale_footage',
    'acoustic_impedance', 'log_permeability', 'porosity', 'poisson_ratio',
    'water_saturation', 'toc', 'vcl', 'p_velocity', 's_velocity',
    'youngs_modulus', 'isip', 'breakdown_pressure', 'pump_rate',
    'total_number_of_stages', 'proppant_volume', 'proppant_fluid_ratio',
    'difference_in_days', 'difference_in_weeks',
    'difference_in_years'
]

CAT_FEATURES = ["treatment_company", "operator"]

In [6]:
median_imputer = SimpleImputer(
    strategy="median"
)

In [7]:
df_num_features = pd.DataFrame(median_imputer.fit_transform(df[NUM_FEATURES]), columns=NUM_FEATURES)

In [10]:
one_hot_encoder = OneHotEncoder(
    sparse_output=False,
    handle_unknown="infrequent_if_exist"
)

In [15]:
one_hot_encoded = one_hot_encoder.fit_transform(df[CAT_FEATURES])

# Extract column names for one-hot encoded features
column_names = one_hot_encoder.get_feature_names_out(CAT_FEATURES)

# Create DataFrame with one-hot encoded features and column names
df_one_hot_encoded = pd.DataFrame(one_hot_encoded, columns=column_names)

In [16]:
df_one_hot_encoded

Unnamed: 0,treatment_company_treatment_company_1,treatment_company_treatment_company_10,treatment_company_treatment_company_11,treatment_company_treatment_company_12,treatment_company_treatment_company_13,treatment_company_treatment_company_14,treatment_company_treatment_company_15,treatment_company_treatment_company_16,treatment_company_treatment_company_17,treatment_company_treatment_company_18,...,operator_operator_33,operator_operator_34,operator_operator_35,operator_operator_36,operator_operator_4,operator_operator_5,operator_operator_6,operator_operator_7,operator_operator_8,operator_operator_9
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
df_production = df["production"]

In [21]:
df_train = pd.concat([df_num_features, df_one_hot_encoded, df_production], axis=1)

In [22]:
df_train.head()

Unnamed: 0,azimuth,md_ft,tvd_ft,footage_lateral_length,well_spacing,porpoise_deviation,porpoise_count,shale_footage,acoustic_impedance,log_permeability,...,operator_operator_34,operator_operator_35,operator_operator_36,operator_operator_4,operator_operator_5,operator_operator_6,operator_operator_7,operator_operator_8,operator_operator_9,production
0,-32.279999,19148.0,6443.0,11966.0,4368.4629,6.33,12.0,1093.0,30123.2,0.68,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5614.947951
1,-19.799999,15150.0,7602.0,6890.0,4714.9922,1.28,4.0,0.0,30951.61,1.85,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2188.836707
2,-26.879999,14950.0,5907.0,8793.0,798.92096,2.03,6.0,3254.0,28900.25,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1450.033022
3,-49.099998,11098.0,6538.0,4234.0,1999.29525,6.0,23.0,7470.0,32826.08,0.73,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1060.764407
4,5.56,10549.0,7024.0,2972.0,2967.563,11.87,9.0,3637.0,26740.05,0.47,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,607.530385


In [26]:
X = df_train.drop(["production"], axis=1)
y = df["production"]

In [27]:
df_train.shape

(1000, 95)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=200, random_state=42)

In [34]:
rf = RandomForestRegressor(n_jobs=-1)

In [35]:
rf.fit(X_train, y_train)

In [36]:
print(rf.score(X_train, y_train))
print(rf.score(X_test, y_test))

0.9495360781408461
0.6872656663898467


In [45]:
df_importances = pd.DataFrame(rf.feature_importances_.round(3), columns=["Importance"])

In [57]:
hyperparameters = {
    "n_estimators": [50, 100, 200, 500],
    "criterion": ["squared_error", "absolute_error", "friedman_mse"],
    "max_depth": [None, 10, 20, 30],
    "max_features": ["sqrt", "log2"]
}

In [58]:
rf_cv = GridSearchCV(
    estimator=rf,
    param_grid=hyperparameters,
    n_jobs=-1
)

In [59]:
rf_cv.fit(X_train, y_train)

In [60]:
rf_cv.cv_results_

{'mean_fit_time': array([0.11815939, 0.18735676, 0.3043808 , 0.9089601 , 0.14314103,
        0.13600407, 0.27381816, 0.74298077, 0.13502727, 0.12854013,
        0.31827364, 0.83995714, 0.08582106, 0.12900047, 0.24426703,
        0.65497518, 0.0843812 , 0.13466783, 0.27249684, 0.80082951,
        0.1085763 , 0.12831626, 0.27009406, 0.71784048, 0.09951096,
        0.12601147, 0.28324685, 0.82839565, 0.13413033, 0.12598939,
        0.26654763, 0.98738837, 0.45580997, 0.91313472, 2.43726659,
        6.71049032, 0.42781553, 1.10484934, 2.0517838 , 5.32912855,
        0.53139181, 1.20757623, 2.20321093, 6.21926064, 0.37604671,
        0.96359129, 1.71919899, 4.48466072, 0.65553551, 1.34724689,
        2.47929921, 6.82443371, 0.43708382, 1.01461434, 2.04252067,
        5.2801651 , 0.56438994, 1.38856878, 2.45000534, 6.88122792,
        0.50279279, 1.04838505, 2.03209786, 4.5798727 , 0.16306586,
        0.35231185, 0.74740276, 1.09774237, 0.08666124, 0.12720065,
        0.25981431, 0.70829968,

In [61]:
rf_cv.best_params_

{'criterion': 'absolute_error',
 'max_depth': None,
 'max_features': 'sqrt',
 'n_estimators': 500}

In [62]:
rf_cv.best_score_

0.6160188134221486

In [64]:
winner = rf_cv.best_estimator_

In [68]:
feature_importances = winner.feature_importances_.round(2)

In [70]:
features = X_train.columns

In [74]:
df_features = pd.DataFrame(zip(features, feature_importances), columns=["feature", "Importance"]).sort_values(by="Importance", ascending=False)
df_features[df_features["Importance"]>0]

Unnamed: 0,feature,Importance
22,proppant_volume,0.07
1,md_ft,0.06
21,total_number_of_stages,0.06
2,tvd_ft,0.05
3,footage_lateral_length,0.05
15,p_velocity,0.04
26,difference_in_years,0.04
25,difference_in_weeks,0.04
24,difference_in_days,0.04
17,youngs_modulus,0.04
