In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv(r"C:\Users\user\ml projects\ppf\notebooks\personal_finance_tracker_dataset.csv", sep=",")
df.head()

Unnamed: 0,date,user_id,monthly_income,monthly_expense_total,savings_rate,budget_goal,financial_scenario,credit_score,debt_to_income_ratio,loan_payment,...,discretionary_spending,essential_spending,income_type,rent_or_mortgage,category,cash_flow_status,financial_advice_score,financial_stress_level,actual_savings,savings_goal_met
0,2019-01-01,1584,3119.58,3212.07,0.38,3676.11,inflation,721.0,0.56,125.77,...,857.55,1910.85,Freelance,1501.65,Investments,Positive,8.3,Low,0.0,0
1,2019-01-31,1045,3262.44,3732.81,0.1,2607.17,inflation,670.0,0.42,454.19,...,534.51,3165.2,Salary,1603.17,Investments,Positive,22.6,Low,0.0,0
2,2019-03-02,1756,2931.2,3335.58,0.15,3004.14,inflation,691.0,0.24,971.82,...,353.67,1504.56,Freelance,1097.82,Healthcare,Positive,58.8,Low,0.0,0
3,2019-04-01,1724,3506.79,2327.59,0.17,3346.97,normal,717.0,0.16,482.76,...,594.08,1450.72,Freelance,1155.64,Groceries,Positive,74.5,Low,1179.2,0
4,2019-05-01,1600,4606.87,2182.58,0.34,2670.09,inflation,795.0,0.25,263.74,...,556.86,1000.0,Salary,1170.86,Utilities,Negative,38.7,High,2424.29,0


In [3]:
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date                    3000 non-null   object 
 1   user_id                 3000 non-null   int64  
 2   monthly_income          3000 non-null   float64
 3   monthly_expense_total   3000 non-null   float64
 4   savings_rate            3000 non-null   float64
 5   budget_goal             3000 non-null   float64
 6   financial_scenario      3000 non-null   object 
 7   credit_score            3000 non-null   float64
 8   debt_to_income_ratio    3000 non-null   float64
 9   loan_payment            3000 non-null   float64
 10  investment_amount       3000 non-null   float64
 11  subscription_services   3000 non-null   int64  
 12  emergency_fund          3000 non-null   float64
 13  transaction_count       3000 non-null   int64  
 14  fraud_flag              3000 non-null   

In [4]:
df = df.drop(columns=['date', 'user_id', 'savings_rate', 'budget_goal', 'debt_to_income_ratio', 'transaction_count', 'fraud_flag', 'discretionary_spending', 'essential_spending', 'financial_advice_score', 'savings_goal_met', 'financial_stress_level', 'category', 'cash_flow_status'])

In [5]:
df.head()

Unnamed: 0,monthly_income,monthly_expense_total,financial_scenario,credit_score,loan_payment,investment_amount,subscription_services,emergency_fund,income_type,rent_or_mortgage,actual_savings
0,3119.58,3212.07,inflation,721.0,125.77,689.22,3,510.58,Freelance,1501.65,0.0
1,3262.44,3732.81,inflation,670.0,454.19,360.34,4,1154.41,Salary,1603.17,0.0
2,2931.2,3335.58,inflation,691.0,971.82,0.0,5,1433.02,Freelance,1097.82,0.0
3,3506.79,2327.59,normal,717.0,482.76,182.06,5,227.37,Freelance,1155.64,1179.2
4,4606.87,2182.58,inflation,795.0,263.74,342.78,9,589.81,Salary,1170.86,2424.29


In [6]:
df.isnull().sum()

monthly_income           0
monthly_expense_total    0
financial_scenario       0
credit_score             0
loan_payment             0
investment_amount        0
subscription_services    0
emergency_fund           0
income_type              0
rent_or_mortgage         0
actual_savings           0
dtype: int64

In [7]:
categorical_cols =['income_type', 'financial_scenario']
target_col = 'actual_savings'
numeric_cols = df.columns.difference(categorical_cols + [target_col]).tolist()

x = df[categorical_cols + numeric_cols]
y = df[target_col]

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression

preprocessor = ColumnTransformer( 
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)
    ]
)

In [9]:
from sklearn.model_selection import train_test_split

pipe = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression())
])

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
pipe.fit(x_train, y_train)
print("Model trained successfully")

print(x.columns)

Model trained successfully
Index(['income_type', 'financial_scenario', 'credit_score', 'emergency_fund',
       'investment_amount', 'loan_payment', 'monthly_expense_total',
       'monthly_income', 'rent_or_mortgage', 'subscription_services'],
      dtype='object')


In [10]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

y_pred = pipe.predict(x_test)

mse = mean_squared_error(y_pred, y_test)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_pred, y_test)
r2 = r2_score(y_pred, y_test)
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')
print(f'R2: {r2:.4f}')

MSE: 128580.1025
RMSE: 358.5807
MAE: 241.4114
R2: 0.8793


In [11]:
from sklearn.linear_model import Ridge, Lasso

models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Linear Regression': LinearRegression()
}

result={}
for name, model in models.items():
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipe.fit(x_train, y_train)
    y_pred = pipe.predict(x_test)
    r2 = r2_score(y_pred, y_test)
    mse = mean_squared_error(y_pred, y_test)
    result[name] = {'R2': r2, 'MSE': mse}

pd.DataFrame(result)

Unnamed: 0,Ridge,Lasso,Linear Regression
R2,0.879248,0.879198,0.879298
MSE,128525.20327,128316.825386,128580.102479


In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'model__max_iter': [1000, 5000, 10000]
}
rf = Pipeline([
        ('preprocessor', preprocessor),
        ('model', Lasso())
    ])

search = GridSearchCV(rf, param_grid, cv=4, scoring='r2', n_jobs=-1)
search.fit(x_train, y_train)
print("Best parameters:" , search.best_params_)
print("BestR2:", search.best_score_)

Best parameters: {'model__alpha': 0.1, 'model__max_iter': 1000}
BestR2: 0.9149220355207806


In [13]:
best_model = search.best_estimator_
best_model.fit(x_train, y_train)
y_pred = best_model.predict(x_test)
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))

0.8791446979324554
128553.63554781461


In [14]:
import joblib

joblib.dump(pipe, r"C:\Users\user\ml projects\ppf\models\savings_model_compressed.pkl", compress=3)
print("Model saved successfully")

Model saved successfully


In [15]:
model = joblib.load(r"C:\Users\user\ml projects\ppf\models\savings_model_compressed.pkl")
print("Model loaded successfully")

Model loaded successfully


In [16]:
data = pd.DataFrame([{
    "monthly_income": 4000,
    "monthly_expense_total": 2500,
    "financial_scenario": 'normal',
    "credit_score": 400,
    "loan_payment": 200,
    "investment_amount": 250,
    "subscription_services": 3,
    "emergency_fund": 0,
    "income_type": 'Salary',
    "rent_or_mortgage": '1200'
}])


In [18]:

prediction = model.predict(data)
print("Predicted Savings: ", prediction[0])


Predicted Savings:  1522.0490796458694
