In [1]:
import datetime
import os

import numpy as np
import pandas as pd
import psycopg
from dateutil.relativedelta import relativedelta

##### 1. Определим глобальные переменные

In [2]:
# Name of the PostgreSQL table that the data-loading query selects from.
TABLE_NAME = "users_churn"

##### 2. Заберем данные из базы данных и сформируем `dataframe`

In [3]:
# Credentials are read from environment variables so no secret is stored in the
# notebook; os.getenv yields None for any variable that is not set.
postgres_credentials = {
    param: os.getenv(env_name)
    for param, env_name in (
        ("host", "POSTGRES_HOST"),
        ("port", "POSTGRES_PORT"),
        ("dbname", "POSTGRES_DBNAME"),
        ("user", "POSTGRES_USER"),
        ("password", "POSTGRES_PASSWORD"),
    )
}

# Fixed connection options first, then the credentials: a single kwargs dict
# that is later unpacked into psycopg.connect.
connection = {
    "sslmode": "verify-full",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

In [4]:
# Enter the connection and its cursor in a single `with`; the cursor context
# exits first, then the connection context.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    # The first item of each cursor.description entry is the column name.
    columns = [column_meta[0] for column_meta in cur.description]
    data = cur.fetchall()

# Build the frame from the fetched rows, labelled with the query's column names.
df = pd.DataFrame(data=data, columns=columns)

##### 2. Посмотрим, какие методы можно использовать для генерации новых признаков:
- lambda
- itertuples
- vectorize

In [5]:
def calculate_days_diff(row):
    """Return the ``.days`` of the gap between now and ``row['begin_date']``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'begin_date'``
            entry that can be subtracted from ``datetime.datetime.now()``.

    Returns:
        The ``days`` attribute of ``now - begin_date``.
    """
    elapsed = datetime.datetime.now() - row['begin_date']
    return elapsed.days

def calculate_diff_years(row):
    """Return the ``years`` component of the gap between now and ``row['begin_date']``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'begin_date'`` entry.

    Returns:
        The ``years`` attribute of ``relativedelta(now, begin_date)``.
    """
    gap = relativedelta(datetime.datetime.now(), row['begin_date'])
    return gap.years

def calculate_diff_months(row):
    """Return the ``months`` component of the gap between now and ``row['begin_date']``.

    This is the months part that sits alongside ``years`` in the relativedelta,
    not the total number of months elapsed.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``'begin_date'`` entry.

    Returns:
        The ``months`` attribute of ``relativedelta(now, begin_date)``.
    """
    gap = relativedelta(datetime.datetime.now(), row['begin_date'])
    return gap.months

In [6]:
def calculate_days_diff_itertools(row):
    """Return the ``.days`` of the gap between now and ``row.begin_date``.

    Args:
        row: Object exposing a ``begin_date`` attribute, such as a row produced
            by ``DataFrame.itertuples()``.

    Returns:
        The ``days`` attribute of ``now - row.begin_date``.
    """
    elapsed = datetime.datetime.now() - row.begin_date
    return elapsed.days

def calculate_diff_years_itertools(row):
    """Return the ``years`` component of the gap between now and ``row.begin_date``.

    Args:
        row: Object exposing a ``begin_date`` attribute, such as a row produced
            by ``DataFrame.itertuples()``.

    Returns:
        The ``years`` attribute of ``relativedelta(now, row.begin_date)``.
    """
    current_date = datetime.datetime.now()
    begin_date = row.begin_date
    # relativedelta only computes a difference when it receives the two
    # datetimes as separate arguments. Passing a single timedelta
    # (current_date - begin_date) leaves every component at its default of 0.
    return relativedelta(current_date, begin_date).years

def calculate_diff_months_itertools(row):
    """Return the ``months`` component of the gap between now and ``row.begin_date``.

    This is the months part that sits alongside ``years`` in the relativedelta,
    not the total number of months elapsed.

    Args:
        row: Object exposing a ``begin_date`` attribute, such as a row produced
            by ``DataFrame.itertuples()``.

    Returns:
        The ``months`` attribute of ``relativedelta(now, row.begin_date)``.
    """
    current_date = datetime.datetime.now()
    begin_date = row.begin_date
    # relativedelta only computes a difference when it receives the two
    # datetimes as separate arguments. Passing a single timedelta
    # (current_date - begin_date) leaves every component at its default of 0.
    return relativedelta(current_date, begin_date).months

In [7]:
def calculate_days_diff_numpy(current_day, row):
    """Return the ``.days`` attribute of ``current_day - row``.

    Args:
        current_day: Reference moment; the minuend of the subtraction.
        row: A single date value subtracted from ``current_day``.

    Returns:
        The ``days`` attribute of the resulting difference.
    """
    return (current_day - row).days

def calculate_diff_years_numpy(current_day, row):
    """Return the elapsed days floor-divided by 365.

    Args:
        current_day: Reference moment; the minuend of the subtraction.
        row: A single date value subtracted from ``current_day``.

    Returns:
        ``(current_day - row).days // 365``.
    """
    elapsed_days = (current_day - row).days
    return elapsed_days // 365

def calculate_diff_months_numpy(current_day, row):
    """Return the elapsed days floor-divided by 30.

    Args:
        current_day: Reference moment; the minuend of the subtraction.
        row: A single date value subtracted from ``current_day``.

    Returns:
        ``(current_day - row).days // 30``.
    """
    elapsed_days = (current_day - row).days
    return elapsed_days // 30

In [14]:
# Display the dimensions of the loaded frame as (rows, columns).
df.shape

(7043, 25)

##### 2.1 Lambda


In [8]:
%%time

# Row-wise apply (axis=1): each helper is invoked once per row through a lambda,
# and its results are stored as a new column.
for feature_name, feature_func in (
    ('days_diff', calculate_days_diff),
    ('diff_years', calculate_diff_years),
    ('diff_months', calculate_diff_months),
):
    df[feature_name] = df.apply(lambda row: feature_func(row), axis=1)

CPU times: user 278 ms, sys: 5.56 ms, total: 283 ms
Wall time: 281 ms


##### 2.3 Itertuples

In [9]:
%%time


# One full pass over DataFrame.itertuples() per feature; each row object is
# handed to the attribute-based helper and the results become a new column.
for feature_name, feature_func in (
    ('days_diff', calculate_days_diff_itertools),
    ('diff_years', calculate_diff_years_itertools),
    ('diff_months', calculate_diff_months_itertools),
):
    df[feature_name] = list(map(feature_func, df.itertuples()))

CPU times: user 157 ms, sys: 3.79 ms, total: 161 ms
Wall time: 158 ms


##### 2.4.1 Vectorize using pandas

In [10]:
%%time


# A single reference timestamp shared by all three features.
current_date = datetime.datetime.now()

# Column-wise subtraction gives one timedelta Series that every feature reuses.
elapsed = current_date - df['begin_date']

df['days_diff'] = elapsed.dt.days
# Floor division by a fixed-length Timedelta: 365-day "years" and 30-day "months".
df['diff_years'] = elapsed // pd.Timedelta(days=365)
df['diff_months'] = elapsed // pd.Timedelta(days=30)

CPU times: user 2.45 ms, sys: 1.44 ms, total: 3.89 ms
Wall time: 2.69 ms


##### 2.4 Vectorize using numpy

In [16]:
%%time


# Reference moment as a NumPy datetime64 scalar.
current_day = np.datetime64(datetime.datetime.now())

# Wrap each scalar helper with np.vectorize; otypes fixes the output dtype to int64.
calculate_days_diff_v, calculate_diff_years_v, calculate_diff_months_v = (
    np.vectorize(scalar_func, otypes=[np.int64])
    for scalar_func in (
        calculate_days_diff_numpy,
        calculate_diff_years_numpy,
        calculate_diff_months_numpy,
    )
)

# Apply every wrapped helper to the whole begin_date column.
for feature_name, vectorized_func in (
    ('days_diff', calculate_days_diff_v),
    ('diff_years', calculate_diff_years_v),
    ('diff_months', calculate_diff_months_v),
):
    df[feature_name] = vectorized_func(current_day, df["begin_date"])

CPU times: user 113 ms, sys: 3.02 ms, total: 116 ms
Wall time: 116 ms


#### 3. Random data (without case)

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Generate random dates between start_date and the current moment
N_ROWS = 1_000_000  # number of rows in the synthetic benchmark frame

start_date = datetime(2022, 1, 1)
end_date = datetime.now()

# Whole days in the window, computed once rather than once per generated row.
span_days = (end_date - start_date).days

# Draw all day offsets (each in [0, span_days)) in one vectorised call, then
# shift start_date by them; the result is a DatetimeIndex of N_ROWS dates.
day_offsets = np.random.randint(span_days, size=N_ROWS)
random_dates = pd.Timestamp(start_date) + pd.to_timedelta(day_offsets, unit="D")

# Build the DataFrame
data = {
    'random_dates': random_dates,
    'random_numbers': np.random.rand(N_ROWS),
}

df = pd.DataFrame(data)
# Filled in by the benchmark cell: method name -> elapsed time.
methods_time = {}

In [3]:
# Time each approach to computing the day difference and record elapsed seconds.
# Every helper receives the frame and returns the computed day differences.


def _days_via_np_vectorize(frame):
    """Day difference through ``np.vectorize`` wrapped around a scalar function."""

    def np_diff_date_days(current_day, row):
        return (current_day - row).days

    current_date_numpy = np.datetime64(datetime.now())
    np_diff_date_days_vectorize = np.vectorize(np_diff_date_days, otypes=[np.int64])
    return np_diff_date_days_vectorize(current_date_numpy, frame["random_dates"])


def _days_via_for_idx(frame):
    """Day difference through a ``range`` loop with a per-element column lookup."""
    current_date = datetime.now()
    result = []
    for idx in range(frame.shape[0]):
        result.append((current_date - frame['random_dates'][idx]).days)
    return result


def _days_via_for_row(frame):
    """Day difference through ``DataFrame.iterrows()``."""
    current_date = datetime.now()
    result = []
    for _, row in frame.iterrows():
        result.append((current_date - row['random_dates']).days)
    return result


def _days_via_for_itertuples(frame):
    """Day difference through ``DataFrame.itertuples()``."""
    current_date = datetime.now()
    result = []
    for row in frame.itertuples():
        result.append((current_date - row.random_dates).days)
    return result


def _days_via_native_pandas(frame):
    """Day difference through whole-column subtraction and the ``.dt`` accessor."""
    current_date = datetime.now()
    return (current_date - frame['random_dates']).dt.days


def _days_via_apply(frame):
    """Day difference through ``Series.apply`` with a lambda."""
    current_date = datetime.now()
    return frame['random_dates'].apply(lambda x: (current_date - x).days)


# Method label -> implementation; dict order is the order the bars are plotted in.
DAY_DIFF_BENCHMARKS = {
    'np_vectorize': _days_via_np_vectorize,
    'for_idx': _days_via_for_idx,
    'for_row': _days_via_for_row,
    'for_itertuples': _days_via_for_itertuples,
    'native_pandas': _days_via_native_pandas,
    'apply': _days_via_apply,
}

for method, run_benchmark in DAY_DIFF_BENCHMARKS.items():
    start_time = datetime.now()
    temp = run_benchmark(df)
    # timedelta.seconds is only the whole-seconds component, so any method
    # finishing in under a second would be stored as 0. total_seconds() keeps
    # the full duration, fractional part included.
    methods_time[method] = (datetime.now() - start_time).total_seconds()

In [8]:
import matplotlib.pyplot as plt


# Split the timing dictionary into bar labels and bar heights
keys = list(methods_time)
values = [methods_time[key] for key in keys]

# Draw the bar chart on an explicit Axes object
fig, ax = plt.subplots()
ax.bar(keys, values)
ax.set_xlabel('Метод')
# Tilt the method names so the longer labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('Время, секунды')
ax.set_title('Время расчета разницы в днях для двух дат')
plt.show()

In [22]:
import math
from datetime import datetime
import numpy as np
import pandas as pd


# Single-column frame holding one million samples from np.random.rand
df = pd.DataFrame({'random_numbers': np.random.rand(1_000_000)})
# Collects one timing per method label
methods_time = dict()


# Time each approach to computing the natural logarithm of the column and
# record the elapsed time in microseconds. Every helper receives the frame and
# returns the computed logarithms.


def _log_via_np_vectorize(frame):
    """Logarithm through ``np.vectorize`` wrapped around the scalar ``math.log``."""
    vectorized_log = np.vectorize(math.log, otypes=[np.float64])
    return vectorized_log(frame['random_numbers'])


def _log_via_for_idx(frame):
    """Logarithm through a ``range`` loop with a per-element ``.iloc`` lookup."""
    result = []
    for idx in range(frame.shape[0]):
        result.append(math.log(frame['random_numbers'].iloc[idx]))
    return result


def _log_via_for_row(frame):
    """Logarithm through ``DataFrame.iterrows()``."""
    result = []
    for _, row in frame.iterrows():
        result.append(math.log(row['random_numbers']))
    return result


def _log_via_for_itertuples(frame):
    """Logarithm through ``DataFrame.itertuples()``."""
    result = []
    for row in frame.itertuples():
        result.append(math.log(row.random_numbers))
    return result


def _log_via_native_pandas(frame):
    """Logarithm through one whole-column ``np.log`` call on the Series."""
    return np.log(frame['random_numbers'])


def _log_via_apply(frame):
    """Logarithm through ``Series.apply`` with a lambda."""
    return frame['random_numbers'].apply(lambda x: math.log(x))


# Method label -> implementation; dict order is the order the bars are plotted in.
# Each label now runs the technique it names: 'np_vectorize' previously ran
# plain np.log, and 'native_pandas' was a duplicate of 'apply'.
LOG_BENCHMARKS = {
    'np_vectorize': _log_via_np_vectorize,
    'for_idx': _log_via_for_idx,
    'for_row': _log_via_for_row,
    'for_itertuples': _log_via_for_itertuples,
    'native_pandas': _log_via_native_pandas,
    'apply': _log_via_apply,
}

for method, run_benchmark in LOG_BENCHMARKS.items():
    start_time = datetime.now()
    temp = run_benchmark(df)
    # timedelta.microseconds is only the sub-second component (0..999999), so
    # any run longer than a second would lose its whole seconds. Convert the
    # full duration to microseconds instead.
    methods_time[method] = (datetime.now() - start_time).total_seconds() * 1_000_000

In [28]:
import matplotlib.pyplot as plt


# Split the timing dictionary into bar labels and bar heights
keys = list(methods_time)
values = [methods_time[key] for key in keys]

# Draw the bar chart on an explicit Axes object
fig, ax = plt.subplots()
ax.bar(keys, values)
ax.set_xlabel('Метод')
# Tilt the method names so the longer labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_ylabel('Время, microseconds')
ax.set_title('Время вычисления логарифма')
plt.show()