<b>Import libraries and load datasets</b>

In [184]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [185]:
# Read the fossil dataset from a relative path (the file name really does contain a space).
data = pd.read_csv(filepath_or_buffer='Age _Fossil.csv')
# Bare trailing expression so the notebook renders the loaded frame.
data

Unnamed: 0,uranium_lead_ratio,carbon_14_ratio,radioactive_decay_series,stratigraphic_layer_depth,geological_period,paleomagnetic_data,inclusion_of_other_fossils,isotopic_composition,surrounding_rock_type,stratigraphic_position,fossil_size,fossil_weight,age
0,0.738061,0.487707,0.907884,91.17,Cretaceous,Normal polarity,False,0.915951,Conglomerate,Middle,50.65,432.00,43523
1,0.560096,0.341738,1.121302,165.44,Cambrian,Normal polarity,False,0.803968,Limestone,Top,48.85,353.29,44112
2,0.424773,0.218493,0.103855,218.98,Cambrian,Normal polarity,True,0.792441,Shale,Bottom,37.66,371.33,43480
3,0.349958,0.704649,0.383617,51.09,Permian,Normal polarity,True,0.074636,Limestone,Bottom,39.10,232.84,30228
4,0.886811,0.777494,0.593254,313.72,Devonian,Normal polarity,True,1.646640,Shale,Top,90.84,277.67,67217
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4393,0.049660,0.601750,0.762490,222.54,Jurassic,Reversed polarity,True,2.247495,Sandstone,Bottom,91.69,415.13,26606
4394,0.360085,0.215033,1.002406,276.70,Cretaceous,Reversed polarity,True,1.004584,Conglomerate,Bottom,68.97,121.10,44850
4395,0.464864,0.553313,0.659639,76.77,Devonian,Normal polarity,True,0.721947,Conglomerate,Middle,11.37,288.73,32186
4396,0.803338,0.272392,0.123562,204.82,Neogene,Reversed polarity,True,1.496427,Sandstone,Bottom,132.34,518.31,59888


<b>Exploratory Data Analysis</b>

In [186]:
# Show the (rows, columns) dimensions of the loaded frame.
data.shape

(4398, 13)

In [187]:
# Summary statistics for the numeric columns, transposed so each column becomes a row.
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
uranium_lead_ratio,4398.0,0.486938,0.257132,0.000241,0.294091,0.473715,0.658012,1.53327
carbon_14_ratio,4398.0,0.430116,0.233536,0.000244,0.246269,0.416046,0.590752,1.0
radioactive_decay_series,4398.0,0.532054,0.287929,7.6e-05,0.316775,0.51183,0.723805,1.513325
stratigraphic_layer_depth,4398.0,152.832801,86.272234,0.13,85.39,146.0,211.9325,494.2
isotopic_composition,4398.0,0.938315,0.521643,0.000275,0.533519,0.905684,1.289307,3.071434
fossil_size,4398.0,70.70246,37.352584,0.13,42.4875,68.785,96.44,216.39
fossil_weight,4398.0,326.655177,187.032333,0.62,181.0475,308.26,455.52,1010.09
age,4398.0,40586.656435,15200.700905,4208.0,29766.25,39567.0,50550.5,103079.0


In [188]:
# Summary (count / unique / top / freq) of the object-typed columns, one row per column.
data.describe(include='object').transpose()

Unnamed: 0,count,unique,top,freq
geological_period,4398,11,Cambrian,882
paleomagnetic_data,4398,2,Normal polarity,3160
surrounding_rock_type,4398,4,Sandstone,1497
stratigraphic_position,4398,3,Bottom,2667


In [189]:
# Partition the columns by dtype: object-typed columns count as categorical,
# every other dtype (the boolean flag included) counts as numerical.
Categorical_col = [name for name in data.columns if data[name].dtype == "object"]
Numerical_col = [name for name in data.columns if data[name].dtype != "object"]

print(Categorical_col)
print(Numerical_col)

['geological_period', 'paleomagnetic_data', 'surrounding_rock_type', 'stratigraphic_position']
['uranium_lead_ratio', 'carbon_14_ratio', 'radioactive_decay_series', 'stratigraphic_layer_depth', 'inclusion_of_other_fossils', 'isotopic_composition', 'fossil_size', 'fossil_weight', 'age']


<b>Data Preprocessing</b>

In [190]:
# IQR rule on the target: rows whose age lies outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers and removed.
Q1, Q3 = data['age'].quantile([0.25, 0.75])
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# NOTE(review): this rebinds `data`, so re-running the cell filters the
# already-filtered frame again with freshly computed bounds.
outliers = data['age'].lt(lower_bound) | data['age'].gt(upper_bound)
data = data.loc[~outliers]

In [191]:
# Remove the three features that are not used by the model.
data = data.drop(
    columns=[
        'isotopic_composition',
        'surrounding_rock_type',
        'fossil_weight',
    ]
)

In [192]:
# Separate the regression target (`age`) from the predictor columns.
y = data['age']
X = data.drop('age', axis=1)

In [193]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

In [194]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [195]:
# Column groups, one list per kind of preprocessing they receive.
boolean_features = ['inclusion_of_other_fossils']
categorical_features = [
    'geological_period',
    'paleomagnetic_data',
    'stratigraphic_position',
]
numeric_features = [
    'uranium_lead_ratio',
    'carbon_14_ratio',
    'radioactive_decay_series',
    'stratigraphic_layer_depth',
    'fossil_size',
]

In [196]:
# Per-column preprocessing:
#   - numeric columns are standardized,
#   - categorical columns are one-hot encoded,
#   - the boolean flag is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        # handle_unknown='ignore': a category that never appeared in the training
        # split is encoded as all zeros instead of raising at predict time, so the
        # persisted pipeline stays usable on new data.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('bool', 'passthrough', boolean_features),
    ]
)

In [197]:
# End-to-end model: feature preprocessing followed by a linear regression whose
# target is standardized for fitting (TransformedTargetRegressor maps predictions
# back to the original scale).
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            TransformedTargetRegressor(
                transformer=StandardScaler(),
                regressor=LinearRegression(),
            ),
        ),
    ]
)

In [198]:
# Fit the preprocessing + regression pipeline on the training split.
pipeline.fit(X_train, y_train)

In [199]:
# Predict the target for the held-out test split.
y_pred = pipeline.predict(X_test)

In [200]:
# Score the held-out predictions against the true test targets.
mae_p = mean_absolute_error(y_test, y_pred)
mse_p = mean_squared_error(y_test, y_pred)
r2_p = r2_score(y_test, y_pred)

print("Linear Regression Model")
for label, score in (
    ("Mean Absolute Error:", mae_p),
    ("Mean Squared Error:", mse_p),
    ("R^2 Score:", r2_p),
):
    print(label, score)

# Persist the fitted pipeline (preprocessing and model together) to disk.
joblib.dump(pipeline, 'regression.pkl')
print("Pipeline berhasil disimpan.")

Linear Regression Model
Mean Absolute Error: 1293.5135563166334
Mean Squared Error: 2789622.6155990125
R^2 Score: 0.9858808420667581
Pipeline berhasil disimpan.


In [201]:
# Reload the persisted pipeline to check that it round-trips through disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load pickle files that you produced yourself or otherwise trust.
pipeline = joblib.load('regression.pkl')

# Single-row example holding exactly the features the pipeline was trained on.
# The columns dropped before training (isotopic_composition,
# surrounding_rock_type, fossil_weight) are left out: the pipeline never uses
# them, so supplying them only suggests they influence the prediction.
new_data = pd.DataFrame({
    'uranium_lead_ratio': [0.8],
    'carbon_14_ratio': [0.6],
    'radioactive_decay_series': [0.5],
    'stratigraphic_layer_depth': [150],
    'geological_period': ['Cretaceous'],
    'paleomagnetic_data': ['Normal polarity'],
    'inclusion_of_other_fossils': [True],
    'stratigraphic_position': ['Middle'],
    'fossil_size': [45.5],
})

new_prediction = pipeline.predict(new_data)

print("Prediksi umur untuk data baru:", new_prediction)

Prediksi umur untuk data baru: [51855.81352647]
