In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/journal_synthetic.csv")
df.head()


In [None]:
# Read test data
test_df = pd.read_csv("../data/reconcillation_synthetic.csv")

In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Load CSV files
journal_df = pd.read_csv("../data/journal_synthetic.csv")
recon_df = pd.read_csv("../data/reconcillation_synthetic.csv")

# Filter for UnreportedTax > 0
recon_unreported = recon_df[recon_df['UnreportedTax'] > 0]

# Define common features
features = ['Region', 'County', 'Entity', 'Gross', 'Taxable', 'TaxRate', 'Year', 'Month']

# Combine both datasets for consistent one-hot encoding
combined_df = pd.concat([journal_df[features], recon_unreported[features]], axis=0)
encoded_df = pd.get_dummies(combined_df, drop_first=True)

# Separate encoded datasets
X_encoded = encoded_df[:len(journal_df)]
X_predict = encoded_df[len(journal_df):]

# Encode target variable (City)
le = LabelEncoder()
y = le.fit_transform(journal_df['City'])

# Split training set for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Train Logistic Regression
model = RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred_test = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_test)
report = classification_report(y_test, y_pred_test)

# Predict on reconciliation entries
y_pred_recon = model.predict(X_predict)
predicted_cities = le.inverse_transform(y_pred_recon)

# Attach predictions to the filtered reconciliation data
recon_unreported = recon_unreported.copy()
recon_unreported['Predicted_City'] = predicted_cities

# Save output
recon_unreported.to_csv("predicted_reconciliation_results.csv", index=False)

# Print sample and metrics
print("Sample Predictions:\n", recon_unreported[['Region', 'County', 'Entity', 'UnreportedTax', 'Predicted_City']].head())
print("\nAccuracy on Test Set: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:\n", report)


Sample Predictions:
     Region         County    Entity  UnreportedTax Predicted_City
51   US-CA  San Francisco    DeMart          33.96  San Francisco
141  US-NC           Wake    DeMart          48.22           Cary
247  US-MI        Oakland  PizzaHut         729.51           Troy
273  US-IL         DuPage    DeMart         803.61        Wheaton
306  US-PA      Allegheny  Reliance         604.08    Monroeville

Accuracy on Test Set: 97.20%

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3406
           1       1.00      1.00      1.00      3359
           2       1.00      1.00      1.00      3403
           3       1.00      1.00      1.00      3321
           4       1.00      1.00      1.00      3303
           5       1.00      1.00      1.00      3371
           6       1.00      1.00      1.00      3191
           7       1.00      1.00      1.00      3374
           8       1.00      1.00      1.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
