In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): this silences *every* warning category for the whole notebook,
# including any warnings the modelling cells would otherwise surface.
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the prepared feature/target table (path is relative to the notebook)
# and print a structural summary: column names, non-null counts and dtypes.
features_csv = 'Resources/features_and_target.csv'
df = pd.read_csv(features_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19771 entries, 0 to 19770
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   LAT               19771 non-null  float64
 1   LON               19771 non-null  float64
 2   WELL_DEPTH        19771 non-null  float64
 3   UNDER_5           19771 non-null  int64  
 4   CURRENT_STANDARD  19771 non-null  int64  
 5   1942_STANDARD     19771 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 926.9 KB


In [3]:
# Feature matrix: the LAT and LON columns as a 2-D NumPy array (one row per record).
X = df[['LAT', 'LON']].to_numpy()

# Target vectors, one per label column.
# They are kept flat (shape (n_samples,)) because scikit-learn estimators expect
# a 1-D y; a column vector of shape (n_samples, 1) makes `fit` emit a
# DataConversionWarning and forces an internal reshape.
y = df['UNDER_5'].to_numpy()
y_cur = df['CURRENT_STANDARD'].to_numpy()
y_1942 = df['1942_STANDARD'].to_numpy()

In [4]:
# Count how many rows fall in each UNDER_5 class to check the label balance;
# the train/test split later in the notebook stratifies on this label.
df['UNDER_5'].value_counts()

0    15502
1     4269
Name: UNDER_5, dtype: int64

In [5]:
# One train/test split per target. No test_size is passed, so scikit-learn's
# default proportion applies. Every split reuses the same seed but is stratified
# on its own label, so the three partitions are not guaranteed to contain the
# same rows -- always pair a model with the split created for its target.

# UNDER_5 target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

# CURRENT_STANDARD target
X_cur_train, X_cur_test, y_cur_train, y_cur_test = train_test_split(
    X, y_cur, stratify=y_cur, random_state=42
)

# 1942_STANDARD target
X_1942_train, X_1942_test, y_1942_train, y_1942_test = train_test_split(
    X, y_1942, stratify=y_1942, random_state=42
)

In [6]:
# Build and fit one logistic-regression model per target, each on the training
# split that was created for that target. All three share the same solver and seed.

# UNDER_5 model
classifier = LogisticRegression(random_state=42, solver='lbfgs')
classifier.fit(X_train, y_train)

# CURRENT_STANDARD model
classifier_cur = LogisticRegression(random_state=42, solver='lbfgs')
classifier_cur.fit(X_cur_train, y_cur_train)

# 1942_STANDARD model (left as the final expression so the fitted estimator is displayed)
classifier_1942 = LogisticRegression(random_state=42, solver='lbfgs')
classifier_1942.fit(X_1942_train, y_1942_train)

LogisticRegression(random_state=42)

In [7]:
# Predict on each held-out test split with the model that was trained for the
# same target. Each model must be paired with its own split: the splits are
# stratified per target, so mixing models and splits scores the wrong model.
predictions = classifier.predict(X_test)
predictions_cur = classifier_cur.predict(X_cur_test)  # CURRENT_STANDARD model, not the UNDER_5 one
predictions_1942 = classifier_1942.predict(X_1942_test)

In [8]:
# Print the test-set accuracy of each model, in the order:
# UNDER_5, CURRENT_STANDARD, 1942_STANDARD.
for truth, guess in (
    (y_test, predictions),
    (y_cur_test, predictions_cur),
    (y_1942_test, predictions_1942),
):
    print(f'{accuracy_score(truth, guess)}')

0.7841391867287073
0.8792231438397734
0.9805785959943354


In [9]:
# Persist each fitted model to its own pickle file under Resources/Trained_Models/.
# NOTE: pickle files should only ever be loaded from trusted sources.
for model_path, fitted_model in (
    ('Resources/Trained_Models/under5.pkl', classifier),
    ('Resources/Trained_Models/current.pkl', classifier_cur),
    ('Resources/Trained_Models/oldStandard.pkl', classifier_1942),
):
    with open(model_path, 'wb') as f:
        pickle.dump(fitted_model, f)