<a href="https://colab.research.google.com/github/sentongo-web/MASTERS_DEGREE_PROJECT_MSDS/blob/main/Training_CT_GAN_on_real_imports_declaration_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Training CTGAN on a real import-declaration dataset (2013–2023)

Further reading about CTGAN: https://pypi.org/project/ctgan/


In [10]:
import pandas as pd
from ctgan import CTGAN
from sklearn.model_selection import train_test_split

In [11]:
# Mount Google Drive into the Colab runtime; the dataset and the data
# dictionary are read from paths under '/content/drive' in the following cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Locations on the mounted Drive of the dataset workbook (file_path) and of
# its accompanying data dictionary (dict_path).
file_path = '/content/drive/My Drive/ScaleDown.xlsx'
dict_path = '/content/drive/My Drive/Data_Dictionary.xlsx'

In [13]:
# Load both workbooks into DataFrames. Only the first sheet of each file is
# read (sheet_name=0 is the pandas default, spelled out here for clarity).
data = pd.read_excel(io=file_path, sheet_name=0)
data_dict = pd.read_excel(io=dict_path, sheet_name=0)

In [14]:
# Summarise the loaded frame: one line per column with its non-null count and
# dtype, which also reveals which columns are mostly empty.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 85 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   INSTANCEID       1048575 non-null  int64  
 1   KEY_ITM_NBR      1048575 non-null  int64  
 2   PCK_NBR          992454 non-null   float64
 3   CONC             1048575 non-null  int64  
 4   TAR_SUP_QTY      113 non-null      float64
 5   PCK_MRK1         983757 non-null   object 
 6   PCK_MRK2         405762 non-null   object 
 7   PCK_TYP_COD      757517 non-null   object 
 8   PCK_TYP_NAM      990697 non-null   object 
 9   TAR_HSC_NB1      1047822 non-null  float64
 10  TAR_PRC_EXT      992460 non-null   float64
 11  TAR_PRC_NAT      992443 non-null   float64
 12  TAR_QUO          6174 non-null     object 
 13  TAR_PRI          697458 non-null   float64
 14  TAR_VMT          7 non-null        object 
 15  TAR_AIC          0 non-null        float64
 16  GDS_ORG_CTY      1

Shape of the original dataset

In [15]:
# (rows, columns) of the raw dataset, before any column selection.
data.shape

(1048575, 85)

In [17]:
import numpy as np

# Derive UNIT_PRICE_LOCAL as VIT_INV_AMT_NMU divided by TAR_SUP_QTY.
# Rows where TAR_SUP_QTY is missing or zero are masked to NaN so that a
# division by zero never leaks an inf into the new column.
data['UNIT_PRICE_LOCAL'] = (
    data['VIT_INV_AMT_NMU']
    .div(data['TAR_SUP_QTY'])
    .where(data['TAR_SUP_QTY'].notna() & data['TAR_SUP_QTY'].ne(0))
)

In [18]:
# Raw columns retained for the rest of the notebook, in output order.
columns_to_keep = [
    'TAR_HSC_NB1',  # HS (harmonized system) commodity code
    'GDS_ORG_CTY',  # country-of-origin code
    'VIT_WGT_GRS',  # gross mass, packaging included
    'VIT_WGT_NET',  # net mass, packaging excluded
    'TAR_PRI',  # item price
    'VIT_CIF',  # CIF (cost, insurance and freight) value, local currency
    'TAX_AMT',  # duties and taxes charged on the item
    'UNIT_PRICE_LOCAL',  # derived unit price, local currency
    'TAR_VMT',  # valuation method
    'VIT_INV_AMT_NMU',  # invoice amount, national monetary units
    'VIT_INV_CUR_COD',  # invoice currency code
    'VIT_CST',  # added costs, national currency
    'VIT_IFR_AMT_NMU',  # internal freight, national currency
    'VIT_OTC_AMT_NMU',  # additional charges, national monetary units
]

# Work on an independent copy so later edits never touch the raw frame.
data_processed = data.loc[:, columns_to_keep].copy()

In [19]:
# Take an independent copy of the retained columns.
# NOTE(review): this repeats the selection made at the end of the previous
# cell; it is harmless, but one of the two could be dropped.
data_processed = data.loc[:, columns_to_keep].copy()

In [20]:
# Map the raw column codes to readable names. The keys are the same columns
# that were selected into data_processed.
columns_rename = {
    'TAR_HSC_NB1': 'HS_Code',
    'GDS_ORG_CTY': 'Country_of_Origin',
    'VIT_WGT_GRS': 'Gross_Mass',
    'VIT_WGT_NET': 'Net_Mass',
    'TAR_PRI': 'Item_Price',
    'VIT_CIF': 'CIF_Value',
    'TAX_AMT': 'Duties_Taxes',
    'UNIT_PRICE_LOCAL': 'Unit_Price_Local',
    'TAR_VMT': 'Valuation_Method',
    'VIT_INV_AMT_NMU': 'Invoice_Amount_NMU',
    'VIT_INV_CUR_COD': 'Invoice_Currency_Code',
    'VIT_CST': 'Added_Costs',
    'VIT_IFR_AMT_NMU': 'Internal_Freight_NMU',
    'VIT_OTC_AMT_NMU': 'Additional_Charges_NMU'
}

# Rebind the name instead of renaming in place: the result is the same frame
# with new labels, and the cell no longer mutates an object across cells.
# Re-running is still safe because keys that are no longer present are ignored.
data_processed = data_processed.rename(columns=columns_rename)

# The Excel export is intentionally not repeated here: the dedicated export
# cell further down writes this same frame to 'uganda_imports.xlsx', so
# exporting here as well only serialised the ~1M-row workbook twice.

In [21]:
# Preview the first five rows to check the selected and renamed columns.
data_processed.head()

Unnamed: 0,HS_Code,Country_of_Origin,Gross_Mass,Net_Mass,Item_Price,CIF_Value,Duties_Taxes,Unit_Price_Local,Valuation_Method,Invoice_Amount_NMU,Invoice_Currency_Code,Added_Costs,Internal_Freight_NMU,Additional_Charges_NMU
0,85441900.0,CN,927.72,920.43,806.92,806.92,1035.37,,,806.92,USD,0.0,0.0,0.0
1,69010000.0,CN,21399.86,20000.0,2366.95,2366.95,380.94,,,2366.95,USD,0.0,0.0,0.0
2,83023000.0,CN,77.82,50.0,308.73,308.73,74.44,,,308.73,USD,0.0,0.0,0.0
3,94015000.0,CN,194.54,150.0,1285.39,1285.39,447.56,257.078,,1285.39,USD,0.0,0.0,0.0
4,94031000.0,CN,194.54,150.0,1344.86,1344.86,468.26,268.972,,1344.86,USD,0.0,0.0,0.0


In [22]:
# Write the processed frame to an .xlsx workbook in the current working
# directory; index=False keeps the row index out of the sheet.
# NOTE(review): data.shape reported 1,048,575 rows, which together with the
# header row is presumably right at Excel's per-sheet row limit — confirm
# before exporting a frame with more rows.
output_file = 'uganda_imports.xlsx'
data_processed.to_excel(output_file, index=False)

In [23]:
import os
from google.colab import files

# Confirm the workbook was written by listing the working directory.
print("Files in the current directory:")
print(os.listdir('.'))

# Hand the exported workbook to the browser as a download.
files.download(output_file)

Files in the current directory:
['.config', 'uganda_imports.xlsx', 'drive', 'sample_data']


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>