In [None]:
import numpy as np 
import pandas as pd
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for path in input_files:
    print(path)

Columns

['Category', 'City', 'Country', 'Customer ID', 'Customer Name', 'Market',
       'Order Date', 'Order ID', 'Order Priority', 'Product ID',
       'Product Name', 'Region', 'Row ID', 'Segment', 'Ship Date', 'Ship Mode',
       'State', 'Sub-Category', 'Discount', 'Number of Records', 'Profit',
       'Quantity', 'Sales', 'Shipping Cost']


In [None]:
# Load the order-level dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/amazing-marketing-data-set/Dataset.csv')

# Force 'Sales' to numeric; values that cannot be parsed become NaN instead of raising.
data['Sales'] = pd.to_numeric(data['Sales'], errors='coerce')

# 'Order Date' is text in DD-MM-YYYY form (the same format the later cells parse with
# strptime). Parse it with that explicit format rather than relying on implicit,
# month-first-biased string inference, then keep only the calendar year.
data['Year'] = pd.to_datetime(data['Order Date'], format='%d-%m-%Y').dt.year

In [None]:
from datetime import datetime

# Split each day's customers into "new" (first order ever seen) and "old" (seen on an
# earlier date), then roll the new-customer counts up to calendar months.

known_clients = []     # customers in order of first appearance
seen_clients = set()   # same customers, kept as a set so membership checks are O(1)

dates_new_clients = {}  # order date -> customers whose first order is on that date
dates_old_clients = {}  # order date -> customers already seen on an earlier date

# Order dates are 'DD-MM-YYYY' strings, so sort by the parsed date, not lexicographically.
dates = sorted(data['Order Date'].unique(), key=lambda x: datetime.strptime(x, '%d-%m-%Y'))
dates_clients = data[['Order Date', 'Customer ID', 'Profit']].groupby('Order Date')

# One de-duplicated customer list per order date.
unique_clients_by_date = dates_clients['Customer ID'].apply(lambda x: list(np.unique(x)))

for date in dates:
    for client in unique_clients_by_date[date]:
        if client in seen_clients:
            dates_old_clients.setdefault(date, []).append(client)
        else:
            seen_clients.add(client)
            known_clients.append(client)
            dates_new_clients.setdefault(date, []).append(client)

# Number of first-time customers per order date.
dates_count_new_clients = {
    date: len(clients) for date, clients in dates_new_clients.items()
}

# Number of first-time customers per month. The last seven characters of a
# 'DD-MM-YYYY' string are its 'MM-YYYY' part; insertion order follows the
# chronologically sorted dates above.
month_new_clients = {}
for date, value in dates_count_new_clients.items():
    month = date[-7:]
    month_new_clients[month] = month_new_clients.get(month, 0) + value

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the per-month new-customer counts held in month_new_clients.
x = list(month_new_clients)
y = list(month_new_clients.values())

fig, ax = plt.subplots(figsize=(20, 9))
ax.bar(x, y)
ax.set_xlabel('Date')
ax.set_ylabel('New clients')
plt.show()

# Echo the raw monthly counts below the figure.
print(month_new_clients.values())

In [None]:
# Latest order date per customer, sorted chronologically, plus the year of that date.
#
# 'Order Date' is stored as 'DD-MM-YYYY' text. Taking 'max' (or sorting) on the raw
# strings compares them lexicographically, e.g. '31-01-2011' > '01-12-2014', so the
# dates are parsed first. As a result the 'Order Date' column of ds_new_clients holds
# real timestamps rather than strings.
#
# NOTE(review): the name suggests "new clients", which would normally be keyed on the
# FIRST order date ('min'); the original aggregation 'max' is kept here - confirm intent.
customer_orders = pd.DataFrame({
    'Customer ID': data['Customer ID'],
    'Order Date': pd.to_datetime(data['Order Date'], format='%d-%m-%Y'),
})
ds_new_clients = (
    customer_orders
    .groupby(['Customer ID'], as_index=False)
    .agg({"Order Date": 'max'})
    .sort_values("Order Date")
)
ds_new_clients['Year'] = ds_new_clients['Order Date'].dt.year

In [None]:
# Count how many rows of ds_new_clients (one per customer) fall into each year.
ds_new_clients.groupby('Year').agg({"Customer ID": len})

In [None]:
def cagr(start_value, end_value, num_periods):
    """Compound growth rate between the first and last of ``num_periods`` observations.

    Computes ``(end_value / start_value) ** (1 / (num_periods - 1)) - 1``.
    ``num_periods`` counts data points, so the number of compounding intervals
    is ``num_periods - 1``.

    Parameters
    ----------
    start_value : float
        Value of the first observation.
    end_value : float
        Value of the last observation.
    num_periods : int
        Number of observations spanned, including both endpoints.

    Returns
    -------
    float
        The per-interval growth rate, or NaN when the rate is undefined:
        fewer than two observations, a zero or negative ``start_value``,
        or a negative ``end_value``.
    """
    intervals = num_periods - 1
    # Without this guard a zero start value or a single observation raises
    # ZeroDivisionError, and a negative ratio raised to a fractional power
    # silently yields a complex number. A negative start with a negative end
    # gives a real but meaningless rate, so it is treated as undefined too.
    if intervals < 1 or start_value <= 0 or end_value < 0:
        return float('nan')
    return (end_value / start_value) ** (1 / intervals) - 1

In [None]:
# Profit CAGR per country, computed over the yearly profit totals of the full dataset.
countries = data['Country'].unique()

# Sum profit per (country, year), then collapse each country's yearly totals into
# a single list ordered by year.
yearly_profit = data.groupby(['Country', 'Year']).agg(
    profit=pd.NamedAgg(column='Profit', aggfunc='sum')
)
countries_profit_by_yrs = yearly_profit.groupby(['Country']).apply(
    lambda grp: list(grp.profit)
)

countries_cagr = {}

for country in countries:
    profits = countries_profit_by_yrs[country]
    periods = len(profits)
    # Countries with three or fewer yearly totals are left out of the ranking.
    if periods <= 3:
        continue
    countries_cagr[country] = cagr(profits[0], profits[-1], periods)

# One row per ranked country, highest growth first.
df = pd.DataFrame(
    {'country': list(countries_cagr), 'cagr': list(countries_cagr.values())}
).sort_values(by='cagr', ascending=False)

df

In [None]:
# Names of the ten countries with the lowest CAGR, lowest first.
ranked_ascending = df.sort_values(by='cagr', ascending=True)
bottom10countries = ranked_ascending['country'].head(10).tolist()
bottom10countries

In [None]:
# Same per-country profit CAGR ranking, recomputed on `data` with its last 5129 rows removed.
# NOTE(review): 5129 is an unexplained row count - confirm which records this slice
# is meant to exclude.
new_ds = data[:-5129]

countries = new_ds['Country'].unique()

# Yearly profit totals per country, gathered into one year-ordered list per country.
profit_per_country_year = new_ds.groupby(['Country', 'Year']).agg(
    profit=pd.NamedAgg(column='Profit', aggfunc='sum')
)
countries_profit_by_yrs = profit_per_country_year.groupby(['Country']).apply(
    lambda frame: list(frame.profit)
)

# CAGR from the first to the last yearly total, only for countries that have
# more than three yearly totals.
countries_cagr = {
    country: cagr(totals[0], totals[-1], len(totals))
    for country, totals in ((name, countries_profit_by_yrs[name]) for name in countries)
    if len(totals) > 3
}

# One row per ranked country, highest growth first.
df = pd.DataFrame({
    'country': list(countries_cagr),
    'cagr': list(countries_cagr.values()),
}).sort_values(by='cagr', ascending=False)

df