In [None]:
import pandas as pd
import numpy as np


def load_customer():
    """Load the customer table and prepare it for joining with transactions.

    Reads ``customers.csv`` from the working directory, gives the revenue,
    ownership and currency columns a ``CUSTOMER_`` prefix, converts the
    country names to the codes used in the geo csv and drops the redundant
    ``REV_CURRENT_YEAR`` column.

    Returns:
        pandas.DataFrame: the cleaned customer table.

    Raises:
        KeyError: if the ``COUNTRY`` or ``REV_CURRENT_YEAR`` column is missing.
    """
    # read the customers from the storage
    customers = pd.read_csv("customers.csv")

    # rename the column names with the CUSTOMER prefix
    # (the ".1"/".2" suffixes presumably come from pandas de-duplicating a
    # repeated REV_CURRENT_YEAR header -- verify against the raw file)
    prefixed_names = {
        "REV_CURRENT_YEAR.1": "CUSTOMER_REV_CURRENT_YEAR",
        "REV_CURRENT_YEAR.2": "CUSTOMER_REV_PAST_YEAR",
        "OWNERSHIP": "CUSTOMER_OWNERSHIP",
        "CURRENCY": "CUSTOMER_CURRENCY"
    }
    customers = customers.rename(columns=prefixed_names)

    # map the country to the same format as in the geo csv
    country_codes = {'Switzerland': 'CH', 'France': 'FR'}
    translated = customers['COUNTRY'].map(country_codes)
    # Series.map yields NaN for every value missing from the dict; fall back
    # to the original value so unknown or already-coded countries are kept
    # instead of being silently blanked (NaN keys would also match each other
    # in a later merge).
    customers['COUNTRY'] = translated.fillna(customers['COUNTRY'])

    # this column carries no information because it is the same as
    # CUSTOMER_REV_CURRENT_YEAR
    customers = customers.drop(columns=['REV_CURRENT_YEAR'])
    return customers


def load_geo():
    """Load the geo table with its sales columns prefixed by ``GEO_``.

    Reads ``geo.csv`` from the working directory and renames
    ``SALES_OFFICE`` / ``SALES_BRANCH`` to ``GEO_SALES_OFFICE`` /
    ``GEO_SALES_BRANCH``; all other columns are returned unchanged.

    Returns:
        pandas.DataFrame: the geo table with the renamed columns.
    """
    geo_prefixed = {
        "SALES_OFFICE": "GEO_SALES_OFFICE",
        "SALES_BRANCH": "GEO_SALES_BRANCH",
    }
    raw_geo = pd.read_csv('geo.csv')
    return raw_geo.rename(columns=geo_prefixed)


def load_transaction():
    """Load the transaction table and clean the columns used downstream.

    Reads ``transactions.csv`` from the working directory and

    * converts ``CUSTOMER`` to a numeric id (stray double quotes are removed;
      anything that is not a number, e.g. ``#NV``, becomes NaN),
    * reduces ``END_CUSTOMER`` to a ``'Yes'`` / ``'No'`` flag,
    * encodes ``OFFER_STATUS`` as 1 for a won offer and 0 otherwise.

    Returns:
        pandas.DataFrame: the cleaned transaction table.

    Raises:
        KeyError: if ``CUSTOMER``, ``END_CUSTOMER`` or ``OFFER_STATUS`` is
            missing from the file.
    """
    transactions = pd.read_csv("transactions.csv")

    # Cast to text before using the .str accessor: on a numeric column .str
    # raises AttributeError, and on a mixed-type object column it returns NaN
    # for every non-string element, which would silently discard valid ids.
    customer_text = transactions['CUSTOMER'].astype(str).str.replace('"', '', regex=False)
    # errors='coerce' turns '#NV' (and any other non-numeric text) into NaN,
    # so no separate replacement step is needed.
    transactions['CUSTOMER'] = pd.to_numeric(customer_text, errors='coerce')

    # the end customer has many null values, so it now only indicates whether
    # the transaction has an end customer: missing -> 'No', any value other
    # than 'No'/'Yes' (e.g. a customer name) -> 'Yes'
    end_customer = transactions['END_CUSTOMER'].fillna('No')
    transactions['END_CUSTOMER'] = end_customer.where(end_customer.isin(['No', 'Yes']), 'Yes')

    # map the offer status to a 0/1 flag; compare case-insensitively and
    # ignore surrounding whitespace so every spelling of WIN/WON counts as won
    status = transactions['OFFER_STATUS'].astype(str).str.strip().str.upper()
    transactions['OFFER_STATUS'] = status.isin(['WIN', 'WON']).astype('int64')
    return transactions


def load_data():
    """Build the combined table of transactions, geo and customer data.

    The cleaned transactions are left-joined with the geo table on
    ``SALES_LOCATION`` and the result is left-joined with the customer table
    on ``CUSTOMER`` and ``COUNTRY``. Both joins are left joins, so no
    transaction row is dropped when a match is missing.

    Returns:
        pandas.DataFrame: transactions enriched with geo and customer columns.
    """
    # first enrichment step: attach the geo columns to every transaction
    with_geo = load_transaction().merge(load_geo(), how='left', on='SALES_LOCATION')
    # second step: attach the customer columns via customer id and country
    return with_geo.merge(load_customer(), how='left', on=['CUSTOMER', "COUNTRY"])

In [None]:
# Load the geo table and preview its first rows.
geo = load_geo()
geo.head()

In [None]:
# Load the cleaned customer table and preview its first rows.
customer = load_customer()
customer.head()

In [None]:
# Load the cleaned transaction table and preview its first rows.
transactions = load_transaction()
transactions.head()

In [None]:
# Build the merged transactions + geo + customer table and preview its first rows.
df = load_data()
df.head()

In [None]:
def feature_customer(customers):
    # 1 Pound Sterling = 1.19 Euro
    # 1 US Dollar = 0.88 Euro
    # 1 Chinese Yuan = 0.14 Euro
    to_Euro = {'Pound Sterling': 1.19, 'US Dollar': 0.88, 'Chinese Yuan': 0.14, 'Euro': 1}
    customers["CURRENCY_FACTOR"] = customers["CURRENCY"].map(to_Euro)
    customers["REV_CURRENT_YEAR"] = customers["REV_CURRENT_YEAR"] * customers["CURRENCY_FACTOR"]
    customers["REV_PAST_YEAR"] = customers["REV_PAST_YEAR"] * customers["CURRENCY_FACTOR"]
    customers.drop(columns=["CURRENCY_FACTOR"], inplace=True)
    # The delta between the revenue of the last and the current year
    customers["REV_INCREASE"] = customers["REV_CURRENT_YEAR"] - customers["REV_PAST_YEAR"]
    customers['CREATION_YEAR'] = pd.to_datetime(customers['CREATION_YEAR'])
    customers['TIME_DELTA'] = 2022 - customers['CREATION_YEAR'].apply(lambda x: x.year)
    customers.drop(columns=['CREATION_YEAR'], inplace=True)