In [78]:
import pandas as pd
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta


### all data generated in this section:
df_support_tickects = pd.read_csv("data/0_sim_support_tickets.csv")
df_support_m = pd.read_csv("data/0_sim_support_tickets_monthly_features.csv")
df_support_m_agg = pd.read_csv("data/0_sim_support_tickets_monthly_features_aggs.csv")
df_support_stats = pd.read_csv("data/0_sim_support_tickects_yearly_fuzzy.csv")
df_support_score = pd.read_csv("data/1_score_support.csv" )

df_support_tickects.shape, df_support_m.shape,df_support_m_agg.shape, df_support_stats.shape, df_support_score.shape

((1639, 9), (9891, 12), (6907, 26), (440, 5), (6907, 52))

In [80]:
df_support_m['status'].unique()

array(['Open', 'Resolved', 'Pending', 'In Progress', 'Closed'],
      dtype=object)

In [82]:
# Extracting the column names
columns = df_support_m.columns.tolist()

# Create a DataFrame for the columns
columns_df = pd.DataFrame(columns, columns=['Column Names'])
columns_df

Unnamed: 0,Column Names
0,FOM
1,ticket_id
2,account_id
3,contract_id
4,ticket_creation_date
5,ticket_close_date
6,response_time
7,resolution_time
8,priority
9,ticket_subject


In [86]:
### data for dashboard
import pandas as pd
import numpy as np

# -----------------------------
# 1. Prepare datetime fields and FOM
# -----------------------------
df_support_m['ticket_creation_date'] = pd.to_datetime(df_support_m['ticket_creation_date'])
df_support_m['ticket_close_date'] = pd.to_datetime(df_support_m['ticket_close_date'])
df_support_m['FOM'] = df_support_m['ticket_creation_date'].dt.to_period('M').dt.to_timestamp()

# -----------------------------
# 2. Create flags
# -----------------------------
df_support_m['is_repeated_issue'] = (df_support_m['num_previous_tickets_same_subject'] > 0).astype(int)
df_support_m['is_closed'] = df_support_m['status'].isin(['Resolved', 'Closed']).astype(int)
df_support_m['is_open'] = (df_support_m['status'] == 'Open').astype(int)

# -----------------------------
# 3. Monthly aggregations
# -----------------------------
monthly_agg = df_support_m.groupby('FOM').agg(
    total_tickets=('ticket_id', 'count'),
    total_closed=('is_closed', 'sum'),
    total_open=('is_open', 'sum'),
    total_repeated_issues=('is_repeated_issue', 'sum'),
    median_response_time=('response_time', 'median'),
    max_response_time=('response_time', 'max'),
    median_resolution_time=('resolution_time', 'median'),
    max_resolution_time=('resolution_time', 'max'),
    total_resolution_time=('resolution_time', 'sum'),
    total_high_priority=('priority', lambda x: (x == 'High').sum())
)

# -----------------------------
# 4. Rate calculations (safe division)
# -----------------------------
monthly_agg['close_ratio'] = np.where(
    monthly_agg['total_tickets'] != 0,
    (monthly_agg['total_closed'] / monthly_agg['total_tickets']) *100,
    0
)

monthly_agg['open_ratio'] = np.where(
    monthly_agg['total_tickets'] != 0,
    (monthly_agg['total_open'] / monthly_agg['total_tickets']) *100,
    0
)

monthly_agg['escalation_rate'] = np.where(
    monthly_agg['total_tickets'] != 0,
    (monthly_agg['total_high_priority'] / monthly_agg['total_tickets']) * 100,
    0
)

monthly_agg['repeat_contact_rate'] = np.where(
    monthly_agg['total_tickets'] != 0,
    (monthly_agg['total_repeated_issues'] / monthly_agg['total_tickets']) * 100,
    0
)

# -----------------------------
# 5. Month-over-Month % change (safe)
# -----------------------------
monthly_agg['close_rate_mom_pct'] = (
    monthly_agg['close_ratio'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['open_rate_mom_pct'] = (
    monthly_agg['open_ratio'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['ticket_volume_change_pct'] = (
    monthly_agg['total_tickets'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['median_response_time_change_pct'] = (
    monthly_agg['median_response_time'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['max_response_time_change_pct'] = (
    monthly_agg['max_response_time'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['median_resolution_time_change_pct'] = (
    monthly_agg['median_resolution_time'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['max_resolution_time_change_pct'] = (
    monthly_agg['max_resolution_time'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['total_resolution_time_change_pct'] = (
    monthly_agg['total_resolution_time'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['escalation_rate_change_pct'] = (
    monthly_agg['escalation_rate'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

monthly_agg['repeat_contact_rate_change_pct'] = (
    monthly_agg['repeat_contact_rate'].pct_change().replace([np.inf, -np.inf], 0).fillna(0)
)

# -----------------------------
# 6. Ticket subject and status breakdowns
# -----------------------------
top_subjects = df_support_m.groupby(['FOM', 'ticket_subject']).size().unstack(fill_value=0)
top_subjects.columns = [f"subject_{col.replace(' ', '_').lower()}" for col in top_subjects.columns]

ticket_status = df_support_m.groupby(['FOM', 'status']).size().unstack(fill_value=0)
ticket_status.columns = [f"status_{col.replace(' ', '_').lower()}" for col in ticket_status.columns]

# -----------------------------
# 7. Combine support behavior features
# -----------------------------
combined_df = monthly_agg.join(top_subjects, how='left')
combined_df = combined_df.join(ticket_status, how='left')
combined_df['cumulative_total_tickets'] = combined_df['total_tickets'].cumsum()
combined_df = combined_df.reset_index()

# -----------------------------
# 8. Support score features (df_support_score is assumed to exist)
# -----------------------------
df_support_score['FOM'] = pd.to_datetime(df_support_score['FOM'])

# Percentiles
score_percentiles = df_support_score.groupby('FOM')['support_score'].quantile([0.5, 0.9, 1.0]).unstack()
score_percentiles.columns = ['support_score_p50', 'support_score_p90', 'support_score_p100']

# Label counts and percentages
label_counts = df_support_score.groupby(['FOM', 'support_score_label']).size().unstack(fill_value=0)
label_counts.columns = [f"label_count_{col.lower()}" for col in label_counts.columns]

label_percentages = label_counts.div(label_counts.sum(axis=1), axis=0) * 100
label_percentages.columns = [f"label_pct_{col.split('_')[-1]}" for col in label_percentages.columns]

# Combine all support score features
support_score_features = score_percentiles.join(label_counts).join(label_percentages)

# -----------------------------
# 9. Merge with final dataframe
# -----------------------------
combined_df['FOM'] = pd.to_datetime(combined_df['FOM'])
support_score_features.index = pd.to_datetime(support_score_features.index)

final_df = combined_df.merge(support_score_features, on='FOM', how='left')

# -----------------------------
# 10. Final touches
# -----------------------------
final_df = final_df.round(2)

# Output shape as confirmation
final_df.shape


(81, 44)

In [88]:
# Replace inf with 'no baseline' and NaN with 0 for clean dashboard output
final_df = final_df.replace([np.inf, -np.inf], "no baseline").fillna(0)


In [92]:
final_df.to_csv("data/5_support_dashboard.csv", index=False)

In [76]:
final_df.tail()

Unnamed: 0,FOM,total_tickets,total_closed,total_open,total_repeated_issues,median_response_time,max_response_time,median_resolution_time,max_resolution_time,total_resolution_time,...,cumulative_total_tickets,support_score_p50,support_score_p90,support_score_p100,label_count_healthy,label_count_normal,label_count_risky,label_pct_healthy,label_pct_normal,label_pct_risky
76,2026-06-01,1,0,0,1,1.0,1,33.0,33,33,...,9866,0.25,0.48,0.75,18,5,0,78.26,21.74,0.0
77,2026-07-01,16,7,0,8,2.0,4,74.0,85,939,...,9882,0.25,0.38,0.5,12,5,0,70.59,29.41,0.0
78,2026-08-01,6,1,0,5,16.0,17,72.0,83,432,...,9888,0.25,0.38,0.5,12,5,0,70.59,29.41,0.0
79,2026-09-01,1,1,0,0,7.0,7,28.0,28,28,...,9889,0.25,0.36,0.62,7,5,0,58.33,41.67,0.0
80,2026-10-01,2,1,0,2,4.0,4,23.0,23,46,...,9891,0.31,0.66,0.75,5,3,0,62.5,37.5,0.0


In [52]:
df_support_m['status'].unique()

array(['Open', 'Resolved', 'Pending', 'In Progress', 'Closed'],
      dtype=object)

In [328]:
########################### raw suppoer data simulation

In [590]:
# Get the current working directory
%pwd

'C:\\Users\\samir\\OneDrive\\Desktop\\Keeper\\v0'

In [614]:
import pandas as pd
import numpy as np

def compute_monthly_features(contract_df, monthly_tickets_df):
    # Convert date columns to datetime
    contract_df["contract_start_date"] = pd.to_datetime(contract_df["contract_start_date"])
    contract_df["contract_end_date"] = pd.to_datetime(contract_df["contract_end_date"])
    monthly_tickets_df["FOM"] = pd.to_datetime(monthly_tickets_df["FOM"])
    monthly_tickets_df["ticket_creation_date"] = pd.to_datetime(monthly_tickets_df["ticket_creation_date"])
    monthly_tickets_df["ticket_close_date"] = pd.to_datetime(monthly_tickets_df["ticket_close_date"])

    # Merge contract data to bring in 'ischr' and 'isup'
    monthly_tickets_df = monthly_tickets_df.merge(
        contract_df[["account_id", "contract_id", "ischr", "isup"]],
        on=["account_id", "contract_id"],
        how="left"
    )

    # Create lookup dictionaries for contract start and end dates
    contract_start_lookup = contract_df.set_index(["account_id", "contract_id"])["contract_start_date"].to_dict()
    contract_end_lookup = contract_df.set_index(["account_id", "contract_id"])["contract_end_date"].to_dict()

    # Compute MOC (Month of Contract)
    monthly_tickets_df["MOC"] = monthly_tickets_df.apply(
        lambda row: ((row["FOM"] - contract_start_lookup.get((row["account_id"], row["contract_id"]), pd.NaT)).days // 30) + 1,
        axis=1
    )

    # Compute POC (Percentage of Contract Duration)
    monthly_tickets_df["contract_duration"] = monthly_tickets_df.apply(
        lambda row: ((contract_end_lookup.get((row["account_id"], row["contract_id"]), pd.NaT) -
                     contract_start_lookup.get((row["account_id"], row["contract_id"]), pd.NaT)).days // 30),
        axis=1
    )
    monthly_tickets_df["POC"] = monthly_tickets_df["MOC"] / monthly_tickets_df["contract_duration"]

    # Ensure unique ticket-level aggregation
    unique_tickets_df = monthly_tickets_df.drop_duplicates(subset=["account_id", "contract_id", "MOC", "POC", "FOM", "ticket_id"])

    # Compute aggregated features at contract-month level
    grouped = unique_tickets_df.groupby(["account_id", "contract_id", "MOC", "POC", "FOM"]).agg(
        num_created_tickets=("ticket_id", "count"),
        num_closed_tickets=("ticket_close_date", lambda x: x.notna().sum()),
        median_resolution_time=("resolution_time", lambda x: int(x.median()) if not x.isna().all() else 0),
        max_resolution_time=("resolution_time", "max"),
        median_response_time=("response_time", lambda x: int(x.median()) if not x.isna().all() else 0),
        max_response_time=("response_time", "max"),
        num_high_critical_tickets=("priority", lambda x: sum(x.isin(["High", "Critical"]))),
        num_open_tickets=("ticket_close_date", lambda x: x.isna().sum()),
        ischr=("ischr", "first"),
        isup=("isup", "first")
    ).reset_index()

    # Fix the issue where previous topic counts exceed unique tickets
    unique_subjects_df = unique_tickets_df.groupby(["account_id", "contract_id", "MOC", "POC", "FOM", "ticket_subject"]).size().reset_index(name="subject_count")
    
    # Aggregate median/max num_previous_tickets_same_subject per contract-month
    topic_counts = unique_subjects_df.groupby(["account_id", "contract_id", "MOC", "POC", "FOM"]).agg(
        median_num_previous_subject=("subject_count", lambda x: int(x.median())),
        max_num_previous_subject=("subject_count", "max")
    ).reset_index()

    # Merge the corrected topic counts back
    grouped = grouped.merge(topic_counts, on=["account_id", "contract_id", "MOC", "POC", "FOM"], how="left")

    # Month-over-Month (MoM) changes
    for col in [
        "num_created_tickets", "num_closed_tickets", "median_resolution_time", 
        "max_resolution_time", "median_response_time", "max_response_time", 
        "num_high_critical_tickets", "num_open_tickets", "median_num_previous_subject"
    ]:
        grouped[f"mom_{col}"] = grouped.groupby(["account_id", "contract_id"])[col].diff()

    return grouped

# Example usage:
final_features_df = compute_monthly_features(contract_df, monthly_tickets_df)


In [592]:
import pandas as pd
import numpy as np

def generate_support_tickets(contract_df):
    # Priority levels
    priority_levels = ["Low", "Medium", "High", "Critical"]
    issue_types = ["issue_1", "issue_2", "issue_3", "issue_4"]
    statuses = ["Open", "In Progress", "Pending", "Resolved", "Closed"]
    sentiment_levels = ["Positive", "Neutral", "Negative"]
    
    ticket_data = []
    monthly_ticket_data = []
    
    for _, row in contract_df.iterrows():
        account_id = row["account_id"]
        contract_id = row["contract_id"]
        contract_start = pd.to_datetime(row["contract_start_date"])
        contract_end = pd.to_datetime(row["contract_end_date"])
        contract_acv = row["clv_contract_with_churn"]  # Using ACV with churn adjustment
        churned = row["ischr"]
        
        # Determine number of tickets based on ACV and churn status
        base_tickets = np.random.randint(0, 3)  # Higher ACV -> More tickets
        if churned:
            base_tickets += np.random.randint(0, 2)  # Churned accounts may have more tickets
        
        num_tickets = np.random.randint(0, max(1, base_tickets+2))
        
        for _ in range(num_tickets):
            ticket_id = f"ticket_{np.random.randint(1000, 9999)}"
            ticket_creation_date = contract_start + pd.Timedelta(days=np.random.randint(0, (contract_end - contract_start).days - 30))
            max_days_until_close = (contract_end - ticket_creation_date).days
            ticket_close_date = ticket_creation_date + pd.Timedelta(days=np.random.randint(1, min(max_days_until_close, 90)))
            resolution_time = (ticket_close_date - ticket_creation_date).days
            
            # Assign priority and response time with error handling
            priority = np.random.choice(priority_levels)
            min_response_time = {"Critical": 1, "High": 1, "Medium": 1, "Low": 1}
            max_response_time = {"Critical": 2, "High": 5, "Medium": 10, "Low": 20}
            
            response_time = min(resolution_time - 1, np.random.randint(min_response_time[priority], max_response_time[priority]))
            response_time = max(response_time, 1)  # Ensure response time is at least 1
            
            # Assign sentiment based on priority and resolution time
            if priority in ["Critical", "High"] and resolution_time > 15:
                sentiment = "Negative"
            elif priority in ["Medium", "Low"] and resolution_time > 30:
                sentiment = "Negative"
            elif resolution_time < 5:
                sentiment = "Positive"
            else:
                sentiment = "Neutral"
            
            ticket_data.append({
               "ticket_id": ticket_id,
                "account_id": account_id,
                "contract_id": contract_id,
                "ticket_creation_date": ticket_creation_date,
                "ticket_close_date": ticket_close_date,
                "response_time": response_time,
                "resolution_time": resolution_time if 'end_of_month' in locals() and ticket_close_date <= end_of_month else None,
                "ticket_subject": np.random.choice(issue_types),
                "sentiment": sentiment
            })
            
            # Generate monthly records
            current_date = ticket_creation_date.replace(day=1)
            while current_date <= contract_end:
                if current_date < ticket_creation_date:
                    current_date += pd.DateOffset(months=1)
                    continue
                
                status = np.random.choice(statuses, p=[0.2, 0.3, 0.2, 0.2, 0.1])  # Weighted status assignment
                fom = current_date
                end_of_month = current_date + pd.DateOffset(months=1) - pd.Timedelta(days=1)
                
                monthly_ticket_data.append({
                    "FOM": fom,
                    "ticket_id": ticket_id,
                    "account_id": account_id,
                    "contract_id": contract_id,
                    "ticket_creation_date": ticket_creation_date,
                    "ticket_close_date": ticket_close_date if ticket_close_date <= end_of_month else None,
                    "response_time": response_time if ticket_creation_date + pd.Timedelta(days=response_time) <= end_of_month else None,
                    "resolution_time": resolution_time,
                    "priority": priority,
                    "ticket_subject": np.random.choice(issue_types),
                    "status": status
                })
                current_date += pd.DateOffset(months=1)
    
    return pd.DataFrame(ticket_data), pd.DataFrame(monthly_ticket_data)

# Example usage:
contract_df = pd.read_excel("data/0_sim_contract.xlsx")
support_tickets_df, monthly_tickets_df = generate_support_tickets(contract_df)


# Convert ticket creation date to datetime format
monthly_tickets_df["ticket_creation_date"] = pd.to_datetime(monthly_tickets_df["ticket_creation_date"])

# Sort data to track order of tickets within the same account, contract, and subject
monthly_tickets_df = monthly_tickets_df.sort_values(by=["account_id", "contract_id", "ticket_subject", "ticket_creation_date"])

# Add a column counting previous unique tickets with the same subject
monthly_tickets_df["num_previous_tickets_same_subject"] = monthly_tickets_df.groupby(["account_id", "contract_id", "ticket_subject"]).cumcount()


In [616]:
support_tickets_df.to_csv("data/0_sim_support_tickets.csv", index=False)
monthly_tickets_df.to_csv("data/0_sim_support_tickets_monthly_features.csv", index=False)

In [596]:
#monthly_tickets_df = pd.read_csv("data/0_sim_support_tickets_monthly_features.csv")

In [618]:
support_tickets_df.shape, monthly_tickets_df.shape

((1639, 9), (9891, 12))

In [620]:
monthly_tickets_df.columns

Index(['FOM', 'ticket_id', 'account_id', 'contract_id', 'ticket_creation_date',
       'ticket_close_date', 'response_time', 'resolution_time', 'priority',
       'ticket_subject', 'status', 'num_previous_tickets_same_subject'],
      dtype='object')

In [622]:
monthly_tickets_df['contract_id'].nunique()*12

11568

In [624]:
############ calculation features

In [626]:
import pandas as pd
import numpy as np

def compute_monthly_features(contract_df, monthly_tickets_df):
    # Convert date columns to datetime
    contract_df["contract_start_date"] = pd.to_datetime(contract_df["contract_start_date"])
    contract_df["contract_end_date"] = pd.to_datetime(contract_df["contract_end_date"])
    monthly_tickets_df["FOM"] = pd.to_datetime(monthly_tickets_df["FOM"])
    monthly_tickets_df["ticket_creation_date"] = pd.to_datetime(monthly_tickets_df["ticket_creation_date"])
    monthly_tickets_df["ticket_close_date"] = pd.to_datetime(monthly_tickets_df["ticket_close_date"])

    # Merge contract data to bring in 'ischr' and 'isup'
    monthly_tickets_df = monthly_tickets_df.merge(
        contract_df[["account_id", "contract_id", "ischr", "isup"]],
        on=["account_id", "contract_id"],
        how="left"
    )

    # Create lookup dictionaries for contract start and end dates
    contract_start_lookup = contract_df.set_index(["account_id", "contract_id"])["contract_start_date"].to_dict()
    contract_end_lookup = contract_df.set_index(["account_id", "contract_id"])["contract_end_date"].to_dict()

    # Compute MOC (Month of Contract)
    monthly_tickets_df["MOC"] = monthly_tickets_df.apply(
        lambda row: ((row["FOM"] - contract_start_lookup.get((row["account_id"], row["contract_id"]), pd.NaT)).days // 30) + 1,
        axis=1
    )

    # Compute POC (Percentage of Contract Duration)
    monthly_tickets_df["contract_duration"] = monthly_tickets_df.apply(
        lambda row: ((contract_end_lookup.get((row["account_id"], row["contract_id"]), pd.NaT) -
                     contract_start_lookup.get((row["account_id"], row["contract_id"]), pd.NaT)).days // 30),
        axis=1
    )
    monthly_tickets_df["POC"] = monthly_tickets_df["MOC"] / monthly_tickets_df["contract_duration"]

    # Ensure unique ticket-level aggregation
    unique_tickets_df = monthly_tickets_df.drop_duplicates(subset=["account_id", "contract_id", "MOC", "POC", "FOM", "ticket_id"])

    # Compute aggregated features at contract-month level
    grouped = unique_tickets_df.groupby(["account_id", "contract_id", "MOC", "POC", "FOM"]).agg(
        num_created_tickets=("ticket_id", "count"),
        num_closed_tickets=("ticket_close_date", lambda x: x.notna().sum()),
        median_resolution_time=("resolution_time", lambda x: int(x.median()) if not x.isna().all() else 0),
        max_resolution_time=("resolution_time", "max"),
        median_response_time=("response_time", lambda x: int(x.median()) if not x.isna().all() else 0),
        max_response_time=("response_time", "max"),
        median_num_previous_subject=("num_previous_tickets_same_subject", lambda x: int(x.median()) if not x.isna().all() else 0),
        max_num_previous_subject=("num_previous_tickets_same_subject", "max"),
        num_high_critical_tickets=("priority", lambda x: sum(x.isin(["High", "Critical"]))),
        num_open_tickets=("ticket_close_date", lambda x: x.isna().sum()),
        ischr=("ischr", "first"),
        isup=("isup", "first")
    ).reset_index()

    # Ensure repeated issues do not exceed total unique tickets
    grouped["num_high_critical_tickets"] = grouped["num_high_critical_tickets"].clip(upper=grouped["num_created_tickets"])
    grouped["median_num_previous_subject"] = grouped["median_num_previous_subject"].clip(upper=grouped["num_created_tickets"])
    grouped["max_num_previous_subject"] = grouped["max_num_previous_subject"].clip(upper=grouped["num_created_tickets"])

    # Month-over-Month (MoM) changes
    for col in [
        "num_created_tickets", "num_closed_tickets", "median_resolution_time", 
        "max_resolution_time", "median_response_time", "max_response_time", 
        "num_high_critical_tickets", "num_open_tickets"
    ]:
        grouped[f"mom_{col}"] = grouped.groupby(["account_id", "contract_id"])[col].diff()

    return grouped

# Example usage:
final_features_df = compute_monthly_features(contract_df, monthly_tickets_df)

final_features_df['contract_year'] = final_features_df['FOM'].dt.year

In [None]:
in the following calculation we need to aggregate based on every contract every month, therefore the field of :tickect_id,ticket_creation_date',
       'ticket_close_date', 'response_time', 'resolution_time', 'priority', 'ticket_subject', 'status', 'num_previous_tickets_same_subject' 
    should not exist in the final data frame as we aggregate all tickect level data to contract level per month
    could you also add 'ischr' and 'isup' from contract data to this set?

In [640]:
final_features_df['contract_year'] = final_features_df['FOM'].dt.year

In [642]:
final_features_df.columns

Index(['account_id', 'contract_id', 'MOC', 'POC', 'FOM', 'num_created_tickets',
       'num_closed_tickets', 'median_resolution_time', 'max_resolution_time',
       'median_response_time', 'max_response_time',
       'median_num_previous_subject', 'max_num_previous_subject',
       'num_high_critical_tickets', 'num_open_tickets', 'ischr', 'isup',
       'mom_num_created_tickets', 'mom_num_closed_tickets',
       'mom_median_resolution_time', 'mom_max_resolution_time',
       'mom_median_response_time', 'mom_max_response_time',
       'mom_num_high_critical_tickets', 'mom_num_open_tickets',
       'contract_year'],
      dtype='object')

In [644]:
pd.crosstab(final_features_df['median_num_previous_subject'], final_features_df['num_created_tickets'])

num_created_tickets,1,2,3,4
median_num_previous_subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2432,221,7,0
1,2081,604,64,1
2,0,1017,118,5
3,0,0,325,10
4,0,0,0,22


In [646]:
final_features_df.shape

(6907, 26)

In [648]:
final_features_df.to_csv("data/0_sim_support_tickets_monthly_features_aggs.csv", index=False)

In [659]:
features=['num_created_tickets', 'num_closed_tickets', 'median_resolution_time','max_resolution_time', 'median_response_time', 'max_response_time',
          'num_high_critical_tickets', 'median_num_previous_subject']
data=final_features_df.copy()
data.shape

(6907, 26)

In [665]:
# Filter data for ischr == True
data_chr = data[data['ischr'] == 1 ]
stats_chr = data_chr.groupby(['contract_year'])[features].agg(['min', 'mean', 'median', 'max', 'std']).reset_index()

# Filter data for has_next_contract == True
data_renew = data[data['isup'] == 1]
stats_renew = data_renew.groupby(['contract_year'])[features].agg(['min', 'mean', 'median', 'max', 'std']).reset_index()

# Initialize an empty dictionary to store results
stats_dict = {}

# Loop through both stats for 'ischr' and 'has_next_contract' and add to the dictionary
for stat_df, key in zip([stats_chr, stats_renew], ['ischr', 'isup']):
    # Loop through features and stats
    for feature in features:
        for stat in ['min', 'mean', 'median', 'max', 'std']:
            # Create a key combining feature, stat, contract_year, and 'ischr' or 'has_next_contract'
            for contract_year in stat_df['contract_year']:
                # Add each result to the dictionary
                stats_dict[(feature, stat, contract_year, key)] = stat_df.loc[stat_df['contract_year'] == contract_year, (feature, stat)].values[0]

In [667]:
#stats_dict
df = pd.DataFrame(
    [(feature, stat, year, category, value) for (feature, stat, year, category), value in stats_dict.items()],
    columns=['Feature', 'Statistic', 'Year', 'Category', 'Value']
)
df.to_csv("data/0_sim_support_tickects_yearly_fuzzy.csv", index=False)

In [675]:
# Iterate through each feature in features
for f in features:
    # Iterate through each row in data
    for idx, row in data.iterrows():
        year = row['contract_year'] - 1 if row['contract_year'] > data['contract_year'].min() else row['contract_year']
        
        # Get the median values from stats_dict for 'ischr' and 'isup'
        median_ischr = stats_dict.get((f, 'median', year, 'ischr'), None)
        median_renew = stats_dict.get((f, 'median', year, 'isup'), None)

        # Check if the median values exist, otherwise assign 'normal'
        if median_ischr is not None and row[f] < median_ischr:
            label = 'risky'
        elif median_renew is not None and row[f] > median_renew:
            label = 'healthy'
        elif median_ischr is not None and median_renew is not None and median_ischr <= row[f] <= median_renew:
            label = 'normal'
        else:
            label = 'noun'

        # Assign the label to the row (new column with label for that feature)
        data.at[idx, f + '_label'] = label
        data.at[idx, f + '_median_ischr'] = median_ischr
        data.at[idx, f + '_median_isup'] = median_renew


In [677]:
data.to_csv("data/0_sim_support_tickect_monthly_features_agg.csv", index=False)

In [679]:
data= pd.read_csv("data/0_sim_support_tickect_monthly_features_agg.csv")

In [681]:
data.shape

(6907, 50)

In [683]:
a=[]
for i in  data.columns.tolist():
    if ('label' in i ):
        a.append(i)
aa=a+['ischr','isup']

In [685]:
# Define a mapping function to convert categorical values into -1, 0, 1
def map_categories_to_numeric(value):
    if value == 'risky':  # Risky category mapped to -1
        return -1
    elif value == 'healthy':  # Healthy category mapped to 1
        return 1
    else:  # Neutral or normal categories mapped to 0
        return 0

# Apply the mapping function to the categorical columns

categorical_columns=[]
for i in data.columns:
    if '_label' in i :
        categorical_columns.append(i)
        
for col in categorical_columns:
    data[col] = data[col].apply(map_categories_to_numeric)

weights = {}
for col in a:
    # Calculate the correlation of each feature with the target variable 'ischr'
    correlation = data[col].corr(data['ischr'])
    weights[col] = correlation

# Create a DataFrame to display the feature weights
weights_df = pd.DataFrame(list(weights.items()), columns=['Feature', 'Weight'])
weights_df

wl=[]
for i in weights_df['Weight'].tolist():
    #wl.append(i/sum(l))
    wl.append(1/len(a))
data['support_score']=0
for i,j in zip(a,wl):
    data['support_score']=data['support_score']+data[i]*j

data['support_score_label'] = data['support_score'].apply(lambda x: 'healthy' if x > 0 else ('normal' if x == 0 else 'risky'))

In [687]:
data['support_score_label'].value_counts()

support_score_label
healthy    3046
risky      2957
normal      904
Name: count, dtype: int64

In [689]:
data.to_csv('data/1_score_support.csv', index=False)

In [691]:
data['contract_id'].unique()

array(['contract_1_1_1', 'contract_1_2_1', 'contract_1_2_10',
       'contract_1_3_6', 'contract_1_4_3', 'contract_11_1_1',
       'contract_11_1_2', 'contract_11_1_3', 'contract_11_1_4',
       'contract_11_1_5', 'contract_11_2_3', 'contract_11_1_6',
       'contract_11_2_4', 'contract_11_3_1', 'contract_11_1_7',
       'contract_11_3_2', 'contract_11_2_6', 'contract_11_3_3',
       'contract_11_1_9', 'contract_11_2_7', 'contract_11_3_4',
       'contract_11_1_10', 'contract_11_3_5', 'contract_2_1_1',
       'contract_2_2_1', 'contract_11_1_11', 'contract_11_1_12',
       'contract_11_2_10', 'contract_11_3_7', 'contract_11_4_1',
       'contract_11_1_13', 'contract_11_3_8', 'contract_12_1_2',
       'contract_12_1_3', 'contract_12_2_1', 'contract_12_1_4',
       'contract_12_2_3', 'contract_12_1_7', 'contract_12_2_5',
       'contract_2_1_2', 'contract_12_1_8', 'contract_12_2_6',
       'contract_12_1_9', 'contract_12_2_7', 'contract_12_3_1',
       'contract_12_1_10', 'contract_12_2_