In [None]:
import pandas as pd
import undetected_chromedriver as uc
import sqlite3
import schedule
import time
import os
import subprocess
import psutil
import datetime
import msoffcrypto
from io import BytesIO
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def load_and_preprocess(db_path=r"Y:\Melanox\stations\Rovner\KRIOT_V4_1_.xlsm", sheet_name="Closed"):
    """Load the closed-ticket sheet, clean it, print diagnostics and export it.

    Steps, in order:
      1. Read ``sheet_name`` from the workbook at ``db_path``.
      2. Normalise column names (strip, lower-case, spaces -> underscores).
      3. Parse the three timestamp columns; unparseable values become NaT.
      4. Drop rows whose duration columns are not ``H:MM:SS`` / ``HH:MM:SS``.
      5. Drop rows whose comment is in the unwanted list (case-insensitive).
      6. Normalise technician name spellings.
      7. Force ``technician_sr_time`` to 30 minutes when the opening and
         closing technician differ.
      8. Print per-technician statistics, long-ticket totals and the
         "return events" summary.
      9. Export the frame to CSV (twice: before and after converting the
         durations to minutes) and to a SQLite table.

    Parameters
    ----------
    db_path : str
        Path of the Excel workbook to read.
    sheet_name : str
        Name of the worksheet holding the closed tickets.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame in its exported form: durations as float minutes
        and timestamps as ``'%Y-%m-%d %H:%M'`` strings.

    Raises
    ------
    Whatever ``pd.read_excel`` / ``DataFrame.to_csv`` raise on I/O problems,
    and ``KeyError`` when an expected column is missing. A failure of the
    SQLite export is reported on stdout and does not raise.
    """
    # --- Load ---------------------------------------------------------------
    df = pd.read_excel(db_path, sheet_name=sheet_name)

    print(df.head())
    print("----------")
    print("Columns:", df.columns)
    print("----------")
    print("Data Types:\n", df.dtypes)
    print("----------")
    print("Summary:\n", df.describe())

    # --- Normalise column names ---------------------------------------------
    df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

    print("----------")
    print("Updated Columns:", df.columns)

    # --- Parse timestamps (invalid values are coerced to NaT) ---------------
    for ts_col in ('current_time', 'start_time', 'end_time'):
        df[ts_col] = pd.to_datetime(df[ts_col], format='%Y-%m-%d %H:%M', errors='coerce')

    # --- Keep only rows with well-formed durations --------------------------
    duration_pattern = r'^\d{1,2}:\d{2}:\d{2}$'
    valid_durations = (
        df['technician_sr_time'].astype(str).str.match(duration_pattern, na=True)
        & df['ticket_lifetime'].astype(str).str.match(duration_pattern, na=True)
    )
    print("Invalid rows found:", df[~valid_durations].head(10))

    # Building a fresh frame (instead of an in-place reset on a slice) keeps
    # the later column assignments from writing into a view of the old frame.
    df = df[valid_durations].reset_index(drop=True)
    print("Cleaned dataset size:", df.shape)

    # Convert the same string form that was validated above, so values that
    # are not plain strings are parsed consistently with the check.
    df['technician_sr_time'] = pd.to_timedelta(df['technician_sr_time'].astype(str))
    df['ticket_lifetime'] = pd.to_timedelta(df['ticket_lifetime'].astype(str))

    print(df[['technician_sr_time', 'ticket_lifetime', 'current_time', 'start_time', 'end_time']].head())
    print(df.dtypes)

    # --- Drop tickets with unwanted comments --------------------------------
    unwanted_comments = ['duplicate', 'duplicates', 'dublicate', 'another shift replaced tb3905, station work properly', 'not needed', 'not relevant', 'no relevant', 'not actual', 'no actual', 'eror', 'error', 'Closed by S. Chebaniuk due to programm error', 'Test ticket', 'Test']
    # The column is lower-cased before matching, so the reference values must
    # be lower-cased too; otherwise the capitalised entries can never match.
    unwanted_lower = {comment.lower() for comment in unwanted_comments}
    df['comment'] = df['comment'].astype(str).str.lower()
    df = df[~df['comment'].isin(unwanted_lower)].reset_index(drop=True)
    print("----------")
    print("Rows after removing unwanted comments:", df.shape)

    # --- Normalise technician names -----------------------------------------
    # Done before any comparison or grouping on technician names so that a
    # misspelling is not treated as a different person.
    print("----------")
    print("Unique Open Technician IDs:", df['open_technician'].unique())
    print("----------")
    print("Unique Close Technician IDs:", df['close_technician'].unique())

    corrections = {
        "Alexey Bondorenko": "Alexey Bondarenko",
        "Alexei Bondorenko": "Alexey Bondarenko",
        "Gershon": "Gershon Yastrebov",
        "Gershen Yastrebov": "Gershon Yastrebov",
        "Andrey ": "Andrey Sobolev",
        "Josef Kon": "Joseph Kohn",
    }
    df['open_technician'] = df['open_technician'].replace(corrections)
    df['close_technician'] = df['close_technician'].replace(corrections)
    print("----------")
    print("Updated Open Technician IDs:", df['open_technician'].unique())
    print("----------")
    print("Updated Close Technician IDs:", df['close_technician'].unique())

    # --- Handover correction ------------------------------------------------
    # Tickets opened and closed by different technicians get a fixed 30 min.
    correction_time = pd.Timedelta(minutes=30)
    df.loc[df['open_technician'] != df['close_technician'], 'technician_sr_time'] = correction_time

    # --- Long tickets and per-technician statistics -------------------------
    long_ticket_threshold = pd.Timedelta(hours=1, minutes=30)
    long_tickets_df = df[df['technician_sr_time'] > long_ticket_threshold]
    print("-----------")
    print(long_tickets_df.head())

    technician_stats = df.groupby('close_technician').agg(
        ticket_qty=('cause_of_failure', 'count'),
        avg_ticket_time=('technician_sr_time', 'mean')
    )
    long_ticket_stats = long_tickets_df.groupby('close_technician').agg(
        long_tickets_qty=('cause_of_failure', 'count')
    )

    # Both frames are indexed by close_technician, so an index join suffices.
    technician_stats = technician_stats.join(long_ticket_stats, how='left')
    # Only the long-ticket count can be missing after the left join; fill just
    # that column so the timedelta column is never touched by a numeric fill.
    technician_stats['long_tickets_qty'] = technician_stats['long_tickets_qty'].fillna(0)

    technician_stats['avg_ticket_time'] = technician_stats['avg_ticket_time'].dt.total_seconds() / 60
    technician_stats['long_ticket_percentage'] = (technician_stats['long_tickets_qty'] / technician_stats['ticket_qty']) * 100

    print("----------")
    print(technician_stats.head())

    total_tickets = df.shape[0]
    total_long_tickets = long_tickets_df.shape[0]

    # Guard against an empty frame: plain int division would raise.
    if total_tickets:
        long_ticket_overall_percentage = (total_long_tickets / total_tickets) * 100
    else:
        long_ticket_overall_percentage = 0.0
    print("----------")
    print("Total Tickets:", total_tickets)
    print("Total Long Tickets (>1:30):", total_long_tickets)
    print("Overall Percentage of Long Tickets:", long_ticket_overall_percentage)

    # --- Fill missing categorical values ------------------------------------
    df[['pl', 'sfg/top/retest']] = df[['pl', 'sfg/top/retest']].fillna("None")
    print("----------")
    print(df[['pl', 'sfg/top/retest']].isna().sum())

    # --- Return-event detection ---------------------------------------------
    start_date = pd.to_datetime("2025-04-25")
    end_date = pd.to_datetime(datetime.datetime.today().date())
    start_time = datetime.time(0, 0)
    end_time = datetime.time(23, 59)

    # Filtered: by date + time. The comparison is done on datetime64 /
    # timedelta64 values so that NaT timestamps simply compare False instead
    # of breaking an object-level date/time comparison.
    ticket_day = df['current_time'].dt.normalize()
    time_of_day = df['current_time'] - ticket_day
    start_offset = pd.Timedelta(hours=start_time.hour, minutes=start_time.minute, seconds=start_time.second)
    end_offset = pd.Timedelta(hours=end_time.hour, minutes=end_time.minute, seconds=end_time.second)
    in_window = (
        (ticket_day >= start_date)
        & (ticket_day <= end_date)
        & (time_of_day >= start_offset)
        & (time_of_day <= end_offset)
    )
    filtered_df = df[in_window].copy()

    filtered_df['date'] = filtered_df['current_time'].dt.date

    group_cols = ['date', 'close_technician', 'station_name:', 'sfg/top/retest', 'pl', 'cause_of_failure']
    grouped = filtered_df.groupby(group_cols).size().reset_index(name='count')

    # A combination seen more than once on the same day is a "return event".
    return_events = grouped[grouped['count'] > 1].sort_values(by=['date', 'close_technician'])

    print("\n=== Technician Return Events Summary (With Timestamps) ===")
    if return_events.empty:
        print("No return visits detected within the specified time range.")
    else:
        print(return_events.to_string(index=False))
        print("===========================================================")

    # --- Export 1: durations still as timedeltas ----------------------------
    # The relative-path file is written again further down, after conversion.
    df.to_csv("tableau_cleaned_data.csv", index=False)
    df.to_csv(r"C:\Users\migerovn\Desktop\tableau_cleaned_data.csv", index=False)

    # --- Convert durations to minutes and timestamps to strings -------------
    df['technician_sr_time'] = df['technician_sr_time'].dt.total_seconds() / 60
    df['ticket_lifetime'] = df['ticket_lifetime'].dt.total_seconds() / 60
    df['current_time'] = df['current_time'].dt.strftime('%Y-%m-%d %H:%M')
    df['start_time'] = df['start_time'].dt.strftime('%Y-%m-%d %H:%M')
    df['end_time'] = df['end_time'].dt.strftime('%Y-%m-%d %H:%M')
    # NOTE(review): assumes non-null `delay` values expose total_seconds()
    # (i.e. are timedelta-like) — confirm against the workbook.
    df['delay'] = df['delay'].apply(lambda x: x.total_seconds() / 60 if pd.notnull(x) else None)
    print("----------")
    print(df[['delay']].head(10))
    print(df.dtypes)

    # --- Export 2: converted frame ------------------------------------------
    df.to_csv("tableau_cleaned_data.csv", index=False)
    df.to_csv(r"C:\Users\migerovn\Desktop\tableau_cleaned_data_1.csv", index=False)
    print("----------")
    print("Dataset successfully exported as CSV!")

    # --- Export 3: SQLite ---------------------------------------------------
    try:
        conn = sqlite3.connect("tableau_data.db")
        try:
            df.to_sql("tableau_dataset", conn, if_exists="replace", index=False)
        finally:
            # Close the connection even when the write fails.
            conn.close()
        print("----------")
        print("Dataset successfully exported to SQLite dataset!")
    except Exception as e:
        print("----------")
        print(f"ERROR: Failed to export to SQLite - {e}")

    return df

In [None]:
def run_etl():
    """Run one ETL pass and hand back its result.

    Calls ``load_and_preprocess()`` and returns whatever it returns. Any
    exception raised by the pass is reported on stdout and turned into a
    ``None`` return, so that a single failed pass does not propagate out of
    the scheduled job and stop the scheduler loop.

    Returns
    -------
    The value returned by ``load_and_preprocess()``, or ``None`` if the pass
    raised.
    """
    print("Running ETL process...")
    try:
        result = load_and_preprocess()
    except Exception as e:
        print("----------")
        print(f"ERROR: ETL process failed - {e}")
        return None
    print("ETL process completed.")
    return result

In [None]:
# Register run_etl with the `schedule` library as a job repeating every 12 hours.
# NOTE(review): nothing in this file calls run_etl() directly, so the first ETL
# pass presumably only happens once the first interval has elapsed — confirm.
schedule.every(12).hours.do(run_etl)

In [None]:
# Scheduler loop: runs forever, polling once a minute for due jobs.
# There is no exit condition, so this blocks everything that follows it.
while True:
    schedule.run_pending()  # execute any registered job whose time has come
    time.sleep(60)  # wait 60 seconds before checking again