In [82]:
import pandas as pd
import os

In [83]:
# Locate the project's `artifacts` directory in a platform-independent way.
# The notebook is expected to run from `<project>/notebook/model`; when it
# does, that trailing sub-path is stripped to get the project root. Otherwise
# the current working directory itself is used as the root.
cwd = os.path.normpath(os.getcwd())
notebook_subdir = os.path.join('notebook', 'model')
if cwd.endswith(os.sep + notebook_subdir):
    # Keep the separator that precedes `notebook/model` out of the tail cut.
    project_root = cwd[: -len(notebook_subdir)]
else:
    project_root = cwd
# The empty last component makes os.path.join append a trailing separator,
# so later cells can keep building paths with `artifacts_path + '<file>'`.
artifacts_path = os.path.join(project_root, 'artifacts', '')


In [84]:
# Load the raw CSV from the artifacts directory into a DataFrame.
source_csv = artifacts_path + 'eco-1990-2022.csv'
data = pd.read_csv(source_csv)

In [85]:
# Replace missing values in each numeric column with that column's mean.
# NOTE(review): the mean is taken over every row of `data` (all entities
# pooled) and before the train/test split — confirm this is intended.
for column in ('emp_pop_ratio', 'gdp_ppp', 'fr_ratio'):
    column_mean = data[column].mean()
    data[column] = data[column].fillna(column_mean)

In [86]:
from sklearn.preprocessing import LabelEncoder

# Fit an encoder on the 'Entity' names, then store the resulting integer
# codes in a new 'Entity_encoded' column.
label_encoder = LabelEncoder()
label_encoder.fit(data['Entity'])
data['Entity_encoded'] = label_encoder.transform(data['Entity'])

In [87]:
# Build a lookup from encoded integer code back to the original entity name.
encoded_ids = label_encoder.transform(label_encoder.classes_)
entity_mapper = {
    code: name for code, name in zip(encoded_ids, label_encoder.classes_)
}
print(entity_mapper)

{np.int64(0): 'Germany', np.int64(1): 'India', np.int64(2): 'United States'}


In [88]:
# Model inputs: the year and the integer-encoded entity.
feature_columns = ['Year', 'Entity_encoded']
# Model outputs: the three indicators predicted jointly.
target_columns = ['gdp_ppp', 'emp_pop_ratio', 'fr_ratio']

X = data[feature_columns]
Y = data[target_columns]

In [89]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split_parts

In [90]:
# Show the first three training target rows as plain text.
print(Y_train.head(3).to_string())

         gdp_ppp  emp_pop_ratio  fr_ratio
49   4024.547309         48.119     2.867
70  11161.593200         55.299     1.988
68  10441.989949         53.800     2.039


In [91]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

# Wrap a linear regression in MultiOutputRegressor, which fits one regressor
# per target column, then fit it on the training split.
base_estimator = LinearRegression()
model = MultiOutputRegressor(base_estimator)
model.fit(X_train, Y_train)

In [92]:
# Predict all targets for the held-out rows.
Y_pred = model.predict(X=X_test)

In [93]:
from sklearn.metrics import mean_squared_error, r2_score

# Labels used when reporting, in the same order as the target columns.
target_names = ['gdp_ppp', 'emp_pop_ratio', 'fr_ratio']

# 'raw_values' returns one score per target instead of a single average.
mse = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')
r2 = r2_score(Y_test, Y_pred, multioutput='raw_values')

# Report both scores for every target, rounded to two decimals.
print("Performance Metrics for Each Target:")
for position, target in enumerate(target_names):
    mse_rounded = round(mse[position], 2)
    r2_rounded = round(r2[position], 2)
    print(f"- {target}: Mean Squared Error = {mse_rounded}, R-squared = {r2_rounded}")

Performance Metrics for Each Target:
- gdp_ppp: Mean Squared Error = 8111373.5, R-squared = 0.8
- emp_pop_ratio: Mean Squared Error = 74.03, R-squared = 0.1
- fr_ratio: Mean Squared Error = 0.41, R-squared = 0.17


In [94]:
# Years to forecast, both bounds inclusive.
first_pred_year = 2023
last_pred_year = 2040
pred_year_range = range(first_pred_year, last_pred_year + 1)

In [95]:
# One input record per (entity, forecast year) pair: entities vary in the
# outer loop, years in the inner loop.
entity_codes = data['Entity_encoded'].unique().tolist()
test_data_prep = []
for entity_code in entity_codes:
    for year in pred_year_range:
        test_data_prep.append({'Year': year, 'Entity_encoded': entity_code})

In [96]:
# Tabulate the forecast inputs; the columns are listed in the same order as
# the keys of each record.
test_data_prep_df = pd.DataFrame(
    test_data_prep,
    columns=['Year', 'Entity_encoded'],
)

In [97]:
# Run the fitted model on the forecast inputs. Each output row holds the
# predictions in target order: gdp_ppp, emp_pop_ratio, fr_ratio.
predicted_values = model.predict(test_data_prep_df)

# Pair every input record with its prediction row and rebuild a readable
# record that carries the entity name rather than its integer code.
pred_data = []
for record, (gdp_hat, emp_hat, fr_hat) in zip(test_data_prep, predicted_values):
    pred_data.append({
        'Entity': entity_mapper[record['Entity_encoded']],
        'Year': record['Year'],
        'emp_pop_ratio': emp_hat,
        'fr_ratio': fr_hat,
        'gdp_ppp': gdp_hat,
    })
pred_data_df = pd.DataFrame(pred_data)

In [98]:
# Stack the forecast rows on top of the rows already in `data`, keeping the
# same five columns in both parts.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both parts keep their own labels, so the result has duplicate index values
# and label-based lookups become ambiguous.
result_df = pd.concat(
    [pred_data_df, data[['Entity', 'Year', 'emp_pop_ratio', 'fr_ratio', 'gdp_ppp']]],
    ignore_index=True,
)

In [99]:
# Write the combined table to the artifacts directory without the index column.
results_csv = artifacts_path + 'results.csv'
result_df.to_csv(results_csv, index=False)