In [9]:
import pandas as pd
import pandas as pd
import plotly.express as px
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Both sheets live in the same workbook, so open and parse the file once.
# Passing a list to `sheet_name` makes `read_excel` return a
# {sheet name: DataFrame} mapping.
EXPENSES_WORKBOOK = "./data/expenses.xlsx"
workbook_sheets = pd.read_excel(EXPENSES_WORKBOOK, sheet_name=["Income", "Expenses"])
df_income = workbook_sheets["Income"]
df_expenses = workbook_sheets["Expenses"]

In [10]:
# Droping some columns that were unnecessary..
col_to_drop = [
    "Transaction amount in transaction currency",
    "Transaction currency",
    "Amount in account currency",
    "Account currency"
]
df_income.drop(columns=col_to_drop, inplace=True)
df_expenses.drop(columns=col_to_drop, inplace=True)


# adding type to each df 
df_income["Type"] = "Income"
df_expenses["Type"] = "Expenses"

In [11]:
# Shorter column names that are easier to read and to reference later on.
column_aliases = {
    'Date and time': 'Date',
    'Amount in default currency': 'Amount',
    'Default currency': 'Currency',
}

# Stack the income rows on top of the expense rows (with a fresh 0..n-1 index)
# and apply the shorter column names in the same step.
df = (
    pd.concat([df_income, df_expenses], ignore_index=True)
    .rename(columns=column_aliases)
)
df

Unnamed: 0,Date,Category,Account,Amount,Currency,Comment,Type
0,"March 31, 2024",Balancing,Main,16.00,CAD,,Income
1,"March 28, 2024",Paycheck,Main,1427.19,CAD,,Income
2,"March 15, 2024",Paycheck,Main,1427.19,CAD,,Income
3,"March 3, 2024",Balancing,Main,6.00,CAD,,Income
4,"March 1, 2024",Paycheck,Main,1427.19,CAD,,Income
...,...,...,...,...,...,...,...
193,"May 5, 2023",Transportation,Main,9.68,CAD,,Expenses
194,"May 5, 2023",Transportation,Main,4.46,CAD,,Expenses
195,"May 5, 2023",Withdrawal,Main,300.00,CAD,,Expenses
196,"May 3, 2023",Groceries,Main,74.52,CAD,Utensils and some snacks,Expenses


In [12]:


# 1. Data cleaning
# Parse the 'Date' column into pandas datetime values.
df['Date'] = pd.to_datetime(df['Date'])

# Note: the 'Comment' column is not dropped in this cell.

# Both figures below share the same canvas size.
figure_size = dict(width=1080, height=700)

# Amount over time, one line per transaction type.
fig = px.line(
    df,
    x='Date',
    y='Amount',
    color='Type',
    title='Income and Expenses Over Time',
)
fig.update_layout(**figure_size)
fig.show()

# Histogram of the expense rows only: 'Category' on x, 'Amount' on y.
expense_rows = df[df['Type'] == 'Expenses']
fig = px.histogram(
    expense_rows,
    x='Category',
    y='Amount',
    title='Distribution of Expenses Across Categories',
)
fig.update_layout(**figure_size)
fig.show()




In [13]:

# Keep only the model inputs and the target; 'Date', 'Currency', 'Comment'
# and 'Type' are not used as features.
df_model = df.drop(['Date', 'Currency', 'Comment', 'Type'], axis=1)

# 'Amount' is the regression target; every remaining column is a feature.
X = df_model.drop('Amount', axis=1)
y = df_model['Amount']

# Hold out 20% of the rows *before* fitting anything, so that both the encoder
# and the model learn from the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode the categorical features. handle_unknown='ignore' encodes a
# category that never appeared in the training rows as all zeros instead of
# raising when the held-out rows are transformed.
categorical_features = ['Category', 'Account']
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough')

X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Encoded view of every row, kept available under its original name.
X_processed = preprocessor.transform(X)

# Modeling
model = LinearRegression()
model.fit(X_train, y_train)

# Predicting on the held-out rows
y_pred = model.predict(X_test)

# Evaluating the model
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# 5. Visualize predictions
# Actual vs. predicted amount for the held-out rows. `df` holds both income
# and expense rows (only the 'Type' column is dropped above, no rows are
# filtered), so the axes are labelled "Amount" rather than "Expenses".
fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'Actual Amount', 'y': 'Predicted Amount'},
                 title='Actual vs. Predicted Amount')
fig.show()



Mean Squared Error: 1239478.6586931827
