<a href="https://colab.research.google.com/github/sentongo-web/MASTERS_DEGREE_PROJECT_MSDS/blob/main/New_Data_Generation_With_CTGAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
# Mount Google Drive into the Colab runtime; the dataset is stored on the drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [34]:
# Location of the imports workbook on the mounted Google Drive
file_path = (
    '/content/drive/My Drive/uganda_imports.xlsx'
)

In [35]:
# importing libraries to be used in the project
import pandas as pd
import numpy as np
import copy
import time
import pickle
from collections import defaultdict
from itertools import islice, combinations
from datetime import datetime as dt
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from ctgan import CTGAN

In [36]:
# Load the Excel workbook at file_path into a DataFrame
data = pd.read_excel(io=file_path)

In [37]:
# Preview the first five rows (head() returns five rows by default)
data.head()

Unnamed: 0,HS_Code,Country_of_Origin,Gross_Mass,Net_Mass,Item_Price,CIF_Value,Duties_Taxes,Unit_Price_Local,Valuation_Method,Invoice_Amount_NMU,Invoice_Currency_Code,Added_Costs,Internal_Freight_NMU,Additional_Charges_NMU
0,85441900.0,CN,927.72,920.43,806.92,806.92,1035.37,,,806.92,USD,0.0,0.0,0.0
1,69010000.0,CN,21399.86,20000.0,2366.95,2366.95,380.94,,,2366.95,USD,0.0,0.0,0.0
2,83023000.0,CN,77.82,50.0,308.73,308.73,74.44,,,308.73,USD,0.0,0.0,0.0
3,94015000.0,CN,194.54,150.0,1285.39,1285.39,447.56,257.078,,1285.39,USD,0.0,0.0,0.0
4,94031000.0,CN,194.54,150.0,1344.86,1344.86,468.26,268.972,,1344.86,USD,0.0,0.0,0.0


In [38]:
# Sample conversion rates to UGX, keyed by currency code.
# These are placeholder values; add an entry for every other currency
# that needs converting.
exchange_rates = dict(
    USD=3700,   # sample rate
    EUR=4000,   # sample rate
)

In [39]:
# Names of the columns that hold monetary amounts and are to be converted to UGX
monetary_columns = [
    'Item_Price',
    'CIF_Value',
    'Duties_Taxes',
    'Unit_Price_Local',
    'Invoice_Amount_NMU',
    'Added_Costs',
    'Internal_Freight_NMU',
    'Additional_Charges_NMU',
]

In [40]:
# Convert every monetary column to UGX using each row's invoice currency.
# Invoice_Currency_Code gives the currency of the monetary columns. Codes that
# have no entry in exchange_rates (and missing codes) keep a multiplier of 1,
# i.e. their amounts are left unchanged.
unmapped_codes = [
    code
    for code in data['Invoice_Currency_Code'].dropna().unique()
    if code not in exchange_rates
]
if unmapped_codes:
    # Surface the fallback instead of applying it silently: these rows are
    # NOT converted, so they are only correct if they are already in UGX.
    print(f"No exchange rate for currency codes {unmapped_codes}; their amounts are left unconverted.")

# Look the multiplier up once per row and scale all monetary columns in a single
# vectorized step, rather than a Python-level row-wise apply for each column.
rate_per_row = data['Invoice_Currency_Code'].map(exchange_rates).fillna(1)
data[monetary_columns] = data[monetary_columns].mul(rate_per_row, axis=0)


In [41]:
# Drop the currency code column as all values are now in UGX.
# Reassigning the result (instead of inplace=True) keeps the step a plain
# expression and follows the pandas recommendation to avoid in-place mutation.
data = data.drop(columns=['Invoice_Currency_Code'])

In [42]:
# Preview the first ten rows after the conversion to UGX
data.head(n=10)

Unnamed: 0,HS_Code,Country_of_Origin,Gross_Mass,Net_Mass,Item_Price,CIF_Value,Duties_Taxes,Unit_Price_Local,Valuation_Method,Invoice_Amount_NMU,Added_Costs,Internal_Freight_NMU,Additional_Charges_NMU
0,85441900.0,CN,927.72,920.43,2985604.0,2985604.0,3830869.0,,,2985604.0,0.0,0.0,0.0
1,69010000.0,CN,21399.86,20000.0,8757715.0,8757715.0,1409478.0,,,8757715.0,0.0,0.0,0.0
2,83023000.0,CN,77.82,50.0,1142301.0,1142301.0,275428.0,,,1142301.0,0.0,0.0,0.0
3,94015000.0,CN,194.54,150.0,4755943.0,4755943.0,1655972.0,951188.6,,4755943.0,0.0,0.0,0.0
4,94031000.0,CN,194.54,150.0,4975982.0,4975982.0,1732562.0,995196.4,,4975982.0,0.0,0.0,0.0
5,85181000.0,CN,77.82,50.0,380767.0,380767.0,132497.0,190383.5,,380767.0,0.0,0.0,0.0
6,94031000.0,CN,972.72,900.0,5711542.0,5711542.0,1988713.0,228461.68,,5711542.0,0.0,0.0,0.0
7,85163100.0,CN,155.64,124.0,609242.0,609242.0,342435.0,152310.5,,609242.0,0.0,0.0,0.0
8,39249010.0,CN,77.82,50.0,636918.0,636918.0,153587.0,,,636918.0,0.0,0.0,0.0
9,84141000.0,CN,38.91,25.0,318459.0,318459.0,42698.0,318459.0,,318459.0,0.0,0.0,0.0


In [43]:
# Handle missing values: fill each numeric column with that column's mean.
# The filled Series is assigned back to the frame. Calling
# data[column].fillna(..., inplace=True) is chained in-place assignment, which
# pandas deprecates and which does not update `data` under copy-on-write.
# NOTE(review): every numeric column is mean-imputed, including HS_Code, which
# looks like an identifier rather than a quantity -- confirm this is intended.
for column in data.select_dtypes(include=np.number).columns:
    data[column] = data[column].fillna(data[column].mean())

In [44]:
# Fill missing values in non-numeric columns with the most frequent value (mode).
# A fixed placeholder such as 'Unknown' would be an alternative strategy.
for column in data.select_dtypes(exclude=np.number).columns:
    modes = data[column].mode()
    if modes.empty:
        # The column has no non-missing values, so there is no mode to fill
        # with; leave it untouched instead of failing on modes[0].
        continue
    # Assign back rather than using inplace=True on data[column] (chained
    # in-place assignment is deprecated and is a no-op under copy-on-write).
    data[column] = data[column].fillna(modes.iloc[0])

In [45]:
# Add additional features.
# The random columns are drawn from a seeded generator so that re-running the
# notebook reproduces the same synthetic features.
feature_rng = np.random.default_rng(42)
n_rows = len(data)

# Example calculation: ratio of item price to local unit price.
# NOTE(review): both columns were converted with the same multiplier above, so
# this ratio is not a currency exchange rate despite the name -- confirm the
# intended meaning. A zero Unit_Price_Local yields inf (or NaN for 0/0) here.
data['EXCHANGE_RATE'] = data['Item_Price'] / data['Unit_Price_Local']

data['IMPORTER_TYPE'] = feature_rng.choice(['Individual', 'Company', 'Government'], size=n_rows)
data['PORT_OF_ENTRY'] = feature_rng.choice(['Port A', 'Port B', 'Port C'], size=n_rows)
# Example random clearance time in days; the upper bound is exclusive (1..29)
data['CLEARANCE_TIME'] = feature_rng.integers(1, 30, size=n_rows)
data['TRANSPORT_MODE'] = feature_rng.choice(['Air', 'Sea', 'Land'], size=n_rows)

In [46]:
# Display the frame's column index (includes the derived feature columns)
data.columns

Index(['HS_Code', 'Country_of_Origin', 'Gross_Mass', 'Net_Mass', 'Item_Price',
       'CIF_Value', 'Duties_Taxes', 'Unit_Price_Local', 'Valuation_Method',
       'Invoice_Amount_NMU', 'Added_Costs', 'Internal_Freight_NMU',
       'Additional_Charges_NMU', 'EXCHANGE_RATE', 'IMPORTER_TYPE',
       'PORT_OF_ENTRY', 'CLEARANCE_TIME', 'TRANSPORT_MODE'],
      dtype='object')

In [47]:
# Keep an independent copy of the categorical columns as they are at this point
categorical_columns = ['Country_of_Origin', 'IMPORTER_TYPE', 'PORT_OF_ENTRY', 'TRANSPORT_MODE']
original_categorical = data[categorical_columns].copy()


In [48]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Encode categorical columns for CTGAN: each column is replaced by integer
# labels, and the fitted encoder is kept in label_encoders under the column name.
label_encoders = {}
for column in original_categorical.columns:
    encoder = label_encoders[column] = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])

## Training the CTGAN Model

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split repeatable. (For CTGAN, all of the data is generally used for training.)
train_data, test_data = train_test_split(
    data,
    random_state=42,
    test_size=0.2,
)
