Importing necessary modules

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score



Loading the dataset

In [7]:
# Read the raw dataset from 'data.csv' (relative to the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')


In [None]:
Defining the features

In [8]:
# Columns used as model inputs (X) and the column the model predicts (y)
features = [
    'Income',
    'Age',
    'Dependents',
    'Occupation',
    'City_Tier',
    'Rent',
    'Loan_Repayment',
    'Insurance',
    'Groceries',
    'Transport',
    'Eating_Out',
    'Entertainment',
    'Utilities',
    'Healthcare',
    'Education',
    'Miscellaneous',
    'Disposable_Income',
    # Potential_Savings_* columns
    'Potential_Savings_Groceries',
    'Potential_Savings_Transport',
    'Potential_Savings_Eating_Out',
    'Potential_Savings_Entertainment',
    'Potential_Savings_Utilities',
    'Potential_Savings_Healthcare',
    'Potential_Savings_Education',
    'Potential_Savings_Miscellaneous',
]

# Name of the column to predict
target = 'Desired_Savings'

# Input frame restricted to the listed features, and the target series
X = data[features]
y = data[target]



In [9]:
# Columns that are one-hot encoded; every other feature is scaled as numeric.
categorical_columns = ['Occupation', 'City_Tier']
# Filter `features` in its declared order instead of taking a set difference:
# iteration order of a set of strings can change between interpreter sessions
# (string hashes are randomized), which would reorder the scaled columns and
# the fitted coefficients on each kernel restart.
numerical_columns = [col for col in features if col not in categorical_columns]

Preprocessing Data for Machine Learning with ColumnTransformer

In [10]:
# Preprocessing: scale the numerical columns and one-hot encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        # handle_unknown='ignore' encodes a category that was not present when
        # fitting as all zeros instead of raising at transform/predict time
        # (e.g. a value that only appears in the test split or in a new sample).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
    ]
)

Creating a Machine Learning Pipeline with Preprocessing and a Model

In [11]:
# Chain the preprocessing step and the regressor into a single estimator;
# the step names are used to look the steps up on the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression()),
])

Splitting the data

In [12]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [13]:
# Fit the preprocessing step and the regression model on the training split
pipeline.fit(X=X_train, y=y_train)

In [14]:
# Generate predictions for the held-out test rows
y_pred = pipeline.predict(X=X_test)

In [15]:
# Score the test-set predictions against the true target values
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(
    f"Mean Squared Error: {mse}",
    f"R² Score: {r2}",
    sep="\n",
)

Mean Squared Error: 5221543.737251726
R² Score: 0.9277198902244779


In [22]:
# One example record to score with the trained pipeline; the keys match the
# names in the `features` list
sample_input = dict(
    Income=50000,
    Age=30,
    Dependents=2,
    Occupation='Student',  # example category value
    City_Tier='Tier_1',
    Rent=15000,
    Loan_Repayment=5000,
    Insurance=2000,
    Groceries=6000,
    Transport=3000,
    Eating_Out=2000,
    Entertainment=1000,
    Utilities=2000,
    Healthcare=1000,
    Education=2000,
    Miscellaneous=1000,
    Disposable_Income=12000,
    Potential_Savings_Groceries=500,
    Potential_Savings_Transport=200,
    Potential_Savings_Eating_Out=300,
    Potential_Savings_Entertainment=200,
    Potential_Savings_Utilities=100,
    Potential_Savings_Healthcare=100,
    Potential_Savings_Education=150,
    Potential_Savings_Miscellaneous=50,
)

In [23]:
# Build a one-row DataFrame from the sample and select the training feature
# columns explicitly, so a missing feature fails here with a KeyError.
sample_df = pd.DataFrame([sample_input])[features]

# Score the row with the fitted pipeline: predict() applies the preprocessing
# step and then the model, so the steps do not need to be invoked by hand.
predicted_savings = pipeline.predict(sample_df)
print(f"Predicted Savings for Next Month: {predicted_savings[0]}")

Predicted Savings for Next Month: 6355.96945229572
