# eFarmersHub Data Analysis
eFarmersHub data are stored in `gds_database` for data visualization. There are 8 tables:
1. Income generating tables: Sale, Machine Rent & Advisory
2. Expenditure tables: Purchase, Processing, Expense
3. User table: This table stores all the user data

The script uses `SQLAlchemy` as the database toolkit for CRUD operations, while `pandas` is used for data manipulation.

In [20]:
# Import Modules
# data manipulation and analysis
import pandas as pd
import numpy as np

# database toolkit
from sqlalchemy import create_engine, MetaData, inspect, Table, Column, Integer, String, Date, Numeric, extract
from sqlalchemy.engine.url import URL
from sqlalchemy.sql import select

# read env file
from dotenv import load_dotenv
import os

# path handling
from pathlib import Path

# logging
import logging

In [21]:
# Load the key/value pairs from the local .env file into the process environment.
dotenv_path = Path(".") / ".env"
load_dotenv(dotenv_path=dotenv_path)

# Database connection settings; each one is None when the variable is not set.
# NOTE(review): USERNAME is also a common OS-level variable; confirm the value
# picked up here is the one from .env and not the logged-in user's name.
USERNAME, PASSWORD, HOST, PORT, DATABASE = (
    os.getenv(setting_name)
    for setting_name in ("USERNAME", "PASSWORD", "HOST", "PORT", "DATABASE")
)

### 1. Read Sale Table
Sale data are stored in `gds_sale_transactions` table. For financial analysis such as revenue and profit, `net_amount` and `cogs_amount` are considered.

In [22]:
def extract_sale(engine):
    """
    read gds_sale_transactions table from sql database and returns df

    The query keeps only rows whose country_name is 'Bangladesh' and whose
    transaction_date falls in 2022, de-duplicated with SELECT DISTINCT.
    Any error raised while connecting or querying is logged to ./log and then
    re-raised, so a failed read is never mistaken for an empty result.

    :param engine: SQLAlchemy engine object
    :return df: sale dataframe
    :raises Exception: the original connection/query error, after it is logged
    """

    try:
        with engine.connect() as conn:
            query = """
                SELECT country_name, parent_name, user_region, user_type, user_name, user_id, customer_id,
                    customer_name, customer_mobile, market_type, business_category, transaction_date, transaction_id,
                    currency_exchange_rate, net_amount, cogs_amount, version
                FROM (
                    SELECT distinct *
                    FROM gds_database.gds_sale_transactions
                    WHERE country_name = 'Bangladesh'
                    AND year(transaction_date) = 2022
                ) sale;
                """
            return pd.read_sql(query, conn)
    except Exception as e:
        logging.basicConfig(filename="./log", filemode="a", format="%(asctime)s - %(levelname)s - %(message)s",
            level=logging.ERROR)
        logging.error(e)
        # there is no dataframe to return at this point: surface the real error
        # instead of failing later on an unbound local variable
        raise

In [23]:
def transform_sale(df):
    """
    transform sale dataframe and returns df

    Steps: drop exact duplicate rows, normalise column types, sum net_amount
    and cogs_amount per transaction/version, derive revenue and profit (local
    currency and USD), then keep only the highest version of every
    transaction_id.

    The input dataframe is not modified; every step works on a new frame.

    :param df: actual sale dataframe, with the columns selected by extract_sale
    :return df: transformed dataframe with revenue, revenue_usd, profit,
        profit_usd and transaction_category columns added
    """
    group_keys = ["country_name", "parent_name", "user_region", "user_type", "user_name", "user_id", "customer_id",
        "customer_name", "customer_mobile", "market_type", "business_category", "transaction_date", "transaction_id",
        "version"]

    # drop duplicates (returns a new frame, so the caller's dataframe stays untouched)
    df = df.drop_duplicates(ignore_index=True)

    # convert transaction_date to datetime
    df = df.assign(transaction_date=pd.to_datetime(df["transaction_date"], format="%Y/%m/%d"))

    # identifiers become strings, monetary columns become floats
    df = df.astype({
        "user_id": str,
        "customer_id": str,
        "customer_mobile": str,
        "transaction_id": str,
        "net_amount": float,
        "cogs_amount": float,
        "currency_exchange_rate": float,
    })

    # group by transaction for revenue; dropna=False keeps transactions that have
    # a missing value in one of the grouping columns instead of silently losing them
    df = df.groupby(group_keys, dropna=False) \
        .agg(cogs_amount=("cogs_amount", "sum"),
            net_amount=("net_amount", "sum"),
            currency_exchange_rate=("currency_exchange_rate", "mean")).reset_index()

    # profit & loss
    df = df.rename(columns={"net_amount": "revenue"})
    df["revenue_usd"] = (df["revenue"] / df["currency_exchange_rate"]).round(4)
    df["profit"] = df["revenue"] - df["cogs_amount"]
    df["profit_usd"] = (df["profit"] / df["currency_exchange_rate"]).round(4)

    # add transaction_category column
    df["transaction_category"] = "Sale"

    # keep the latest version only: the version must be the deciding sort key within
    # a transaction_id, otherwise a change in country/parent/user between versions
    # could let an older version sort last and be kept
    df = df.sort_values(["transaction_id", "version"]) \
            .drop_duplicates(subset=["transaction_id"], keep="last")

    # final ordering of the output rows
    df = df.sort_values(["country_name", "parent_name", "user_id", "transaction_id", "version"])

    return df

### 2. Machine Rent
Machine Rent data are stored in `gds_machine_rent_transactions` table. For financial analysis such as revenue and profit, `net_amount` is considered.

**Note:** Depreciation is not being considered at the moment.

In [24]:
def extract_machine_rent(engine):
    """
    read gds_machine_rent_transactions table from sql database and returns df

    The query keeps only rows whose country_name is 'Bangladesh' and whose
    transaction_date falls in 2022, de-duplicated with SELECT DISTINCT.
    Any error raised while connecting or querying is logged to ./log and then
    re-raised, so a failed read is never mistaken for an empty result.

    :param engine: SQLAlchemy engine object
    :return df: machine rent dataframe
    :raises Exception: the original connection/query error, after it is logged
    """

    try:
        with engine.connect() as conn:
            query = """
                SELECT country_name, parent_name, user_region, user_type, user_name, user_id, customer_id, customer_name,
                    customer_mobile, business_category, transaction_date, transaction_id, currency_exchange_rate,
                    net_amount, version
                FROM (
                    SELECT distinct *
                    FROM gds_database.gds_machine_rent_transactions
                    WHERE country_name = 'Bangladesh'
                    AND year(transaction_date) = 2022
                ) machine_rent;
                """
            return pd.read_sql(query, conn)
    except Exception as e:
        logging.basicConfig(filename="./log", filemode="a", format="%(asctime)s - %(levelname)s - %(message)s",
            level=logging.ERROR)
        logging.error(e)
        # there is no dataframe to return at this point: surface the real error
        # instead of failing later on an unbound local variable
        raise

In [25]:
def transform_machine_rent(df):
    """
    transform machine_rent dataframe and returns df

    Steps: drop exact duplicate rows, normalise column types, sum net_amount
    per transaction/version, derive revenue and profit (local currency and
    USD), then keep only the highest version of every transaction_id.
    Profit equals revenue here because no cost column is subtracted.

    The input dataframe is not modified; every step works on a new frame.

    :param df: actual machine_rent dataframe, with the columns selected by
        extract_machine_rent
    :return df: transformed dataframe with revenue, revenue_usd, profit,
        profit_usd, market_type and transaction_category columns added
    """
    group_keys = ["country_name", "parent_name", "user_region", "user_type", "user_name", "user_id", "customer_id",
        "customer_name", "customer_mobile", "business_category", "transaction_date", "transaction_id",
        "version"]

    # drop duplicates (returns a new frame, so the caller's dataframe stays untouched)
    df = df.drop_duplicates(ignore_index=True)

    # convert transaction_date to datetime
    df = df.assign(transaction_date=pd.to_datetime(df["transaction_date"], format="%Y/%m/%d"))

    # identifiers become strings, monetary columns become floats; the exchange rate
    # is cast as well (as transform_sale does) so non-float values such as Decimal
    # can be averaged and divided
    df = df.astype({
        "user_id": str,
        "customer_id": str,
        "customer_mobile": str,
        "transaction_id": str,
        "net_amount": float,
        "currency_exchange_rate": float,
    })

    # group by transaction for revenue; dropna=False keeps transactions that have
    # a missing value in one of the grouping columns instead of silently losing them
    df = df.groupby(group_keys, dropna=False) \
        .agg(net_amount=("net_amount", "sum"),
            currency_exchange_rate=("currency_exchange_rate", "mean")).reset_index()

    # profit & loss
    df = df.rename(columns={"net_amount": "revenue"})
    df["revenue_usd"] = (df["revenue"] / df["currency_exchange_rate"]).round(4)
    df["profit"] = df["revenue"]
    df["profit_usd"] = (df["profit"] / df["currency_exchange_rate"]).round(4)

    # add market_type and transaction_category columns
    df["market_type"] = "Farmer"
    df["transaction_category"] = "Machinery Rental"

    # keep the latest version only: the version must be the deciding sort key within
    # a transaction_id, otherwise a change in country/parent/user between versions
    # could let an older version sort last and be kept
    df = df.sort_values(["transaction_id", "version"]) \
            .drop_duplicates(subset=["transaction_id"], keep="last")

    # final ordering of the output rows
    df = df.sort_values(["country_name", "parent_name", "user_id", "transaction_id", "version"])

    return df

### 3. Advisory Service
Advisory data are stored in `gds_advisory_transactions` table. For financial analysis such as revenue and profit, `amount` is considered.

In [26]:
def extract_advisory(engine):
    """
    read gds_advisory_transactions table from sql database and returns df

    The query keeps only rows whose country_name is 'Bangladesh' and whose
    transaction_date falls in 2022, de-duplicated with SELECT DISTINCT.
    Any error raised while connecting or querying is logged to ./log and then
    re-raised, so a failed read is never mistaken for an empty result.

    :param engine: SQLAlchemy engine object
    :return df: advisory dataframe
    :raises Exception: the original connection/query error, after it is logged
    """
    try:
        with engine.connect() as conn:
            # the subquery alias below is only a label inside the SQL statement
            query = """
                SELECT country_name, parent_name, user_region, user_type, user_name, user_id, customer_id, customer_name,
                    customer_mobile, business_categories, transaction_date, transaction_id, currency_exchange_rate,
                    amount, version
                FROM (
                    SELECT distinct *
                    FROM gds_database.gds_advisory_transactions
                    WHERE country_name = 'Bangladesh'
                    AND year(transaction_date) = 2022
                ) machine_rent;
                """
            return pd.read_sql(query, conn)
    except Exception as e:
        logging.basicConfig(filename="./log", filemode="a", format="%(asctime)s - %(levelname)s - %(message)s", level=logging.ERROR)
        logging.error(e)
        # there is no dataframe to return at this point: surface the real error
        # instead of failing later on an unbound local variable
        raise

In [27]:
def transform_advisory(df):
    """
    transform advisory dataframe and returns df

    Steps: drop exact duplicate rows, normalise column types, sum amount per
    transaction/version, derive revenue and profit (local currency and USD),
    rename business_categories to business_category, then keep only the
    highest version of every transaction_id.
    Profit equals revenue here because no cost column is subtracted.

    The input dataframe is not modified; every step works on a new frame.

    :param df: actual advisory dataframe, with the columns selected by
        extract_advisory
    :return df: transformed dataframe with revenue, revenue_usd, profit,
        profit_usd, market_type and transaction_category columns added
    """
    group_keys = ["country_name", "parent_name", "user_region", "user_type", "user_name", "user_id", "customer_id",
        "customer_name", "customer_mobile", "business_categories", "transaction_date", "transaction_id",
        "version"]

    # drop duplicates (returns a new frame, so the caller's dataframe stays untouched)
    df = df.drop_duplicates(ignore_index=True)

    # convert transaction_date to datetime
    df = df.assign(transaction_date=pd.to_datetime(df["transaction_date"], format="%Y/%m/%d"))

    # identifiers become strings, monetary columns become floats; the exchange rate
    # is cast as well (as transform_sale does) so non-float values such as Decimal
    # can be averaged and divided
    df = df.astype({
        "user_id": str,
        "customer_id": str,
        "customer_mobile": str,
        "transaction_id": str,
        "amount": float,
        "currency_exchange_rate": float,
    })

    # group by transaction for revenue; dropna=False keeps transactions that have
    # a missing value in one of the grouping columns instead of silently losing them
    df = df.groupby(group_keys, dropna=False) \
        .agg(amount=("amount", "sum"),
            currency_exchange_rate=("currency_exchange_rate", "mean")).reset_index()

    # profit & loss
    df = df.rename(columns={"amount": "revenue",
        "business_categories": "business_category"})
    df["revenue_usd"] = (df["revenue"] / df["currency_exchange_rate"]).round(4)
    df["profit"] = df["revenue"]
    df["profit_usd"] = (df["profit"] / df["currency_exchange_rate"]).round(4)

    # add market_type and transaction_category columns
    df["market_type"] = "Farmer"
    df["transaction_category"] = "Advisory"

    # keep the latest version only: the version must be the deciding sort key within
    # a transaction_id, otherwise a change in country/parent/user between versions
    # could let an older version sort last and be kept
    df = df.sort_values(["transaction_id", "version"]) \
            .drop_duplicates(subset=["transaction_id"], keep="last")

    # final ordering of the output rows
    df = df.sort_values(["country_name", "parent_name", "user_id", "transaction_id", "version"])

    return df

In [28]:
if __name__ == "__main__":
    # Build the connection URL for the pymysql driver from the settings read
    # out of the environment, then create the engine every extract step uses.
    connection_settings = {
        "username": USERNAME,
        "password": PASSWORD,
        "host": HOST,
        "port": PORT,
        "database": DATABASE,
    }
    connect_url = URL.create("mysql+pymysql", **connection_settings)
    engine = create_engine(connect_url)

    # debug
    # with engine.connect() as conn:
    #     inspector = inspect(engine)
    #     table_names = inspector.get_table_names()
    #     print(table_names)

    # Each income source is read from the database and transformed right away.
    sale = transform_sale(extract_sale(engine))
    machine_rent = transform_machine_rent(extract_machine_rent(engine))
    advisory = transform_advisory(extract_advisory(engine))

    # Stack the three income sources into one frame and export it as CSV.
    income_frames = [sale, machine_rent, advisory]
    df = pd.concat(income_frames, sort=False, ignore_index=True)
    df.to_csv("income.csv", index=False)