<a href="https://colab.research.google.com/github/springboardmentor5432x/DV---Optimizing-IT-Support-Team-Performance-Using-Analytics-Supportlytics-/blob/main/Supportlytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Load the raw multi-language support ticket export.
df = pd.read_csv('/dataset-tickets-multi-lang-4-20k.csv')

# The columns created below are SYNTHETIC placeholders generated here so the project
# plan can be followed; they are random values, not real measurements.
# A seeded local generator keeps them identical across "Restart & Run All" without
# touching numpy's global random state.
SEED = 42
rng = np.random.default_rng(SEED)
n_tickets = len(df)

# Use one reference timestamp for every row (instead of calling datetime.now() once
# per ticket) so all tickets share the same time-of-day and the column is built with
# a single vectorized subtraction. Tickets are opened 0-29 days before "now".
reference_time = pd.Timestamp(datetime.now())
df['Date'] = reference_time - pd.to_timedelta(rng.integers(0, 30, size=n_tickets), unit='D')

# Each ticket is resolved 1-5 whole days after it was opened.
df['Resolution_Date'] = df['Date'] + pd.to_timedelta(rng.integers(1, 6, size=n_tickets), unit='D')

# Uniform random score in [0, 1).
df['Similarity_Level'] = rng.random(n_tickets)

countries = ['USA', 'Germany', 'India', 'UK', 'France', 'Japan']
df['Country'] = rng.choice(countries, size=n_tickets)

# Calculate Resolution_Time in whole days (the first requirement of Milestone 1).
df['Resolution_Time'] = (df['Resolution_Date'] - df['Date']).dt.days

# Sanity check: the offsets above are always 1-5 days, so the derived column must be too.
assert df['Resolution_Time'].between(1, 5).all()

# Look at the first 5 rows to make sure it worked.
print(df[['subject', 'Date', 'Resolution_Date', 'Resolution_Time', 'Country']].head())

# Save this "ready" file so it can be used for the charts.
df.to_csv('Supportlytics_Data_Ready.csv', index=False)

                                             subject  \
0  Unvorhergesehener Absturz der Datenanalyse-Pla...   
1                           Customer Support Inquiry   
2                      Data Analytics for Investment   
3                 Krankenhaus-Dienstleistung-Problem   
4                                           Security   

                        Date            Resolution_Date  Resolution_Time  \
0 2025-12-30 12:48:15.634225 2026-01-03 12:48:15.634225                4   
1 2025-12-20 12:48:15.634295 2025-12-24 12:48:15.634295                4   
2 2025-12-06 12:48:15.634304 2025-12-08 12:48:15.634304                2   
3 2025-12-16 12:48:15.634309 2025-12-21 12:48:15.634309                5   
4 2025-12-22 12:48:15.634314 2025-12-25 12:48:15.634314                3   

  Country  
0   Japan  
1   India  
2   Japan  
3      UK  
4     USA  


In [None]:
# 1. Check for missing values (how many empty entries are in each column?)
print("--- Missing Values Count ---")
print(df.isnull().sum())

# 2. Replace missing text with the 'Not Categorized' placeholder.
# This covers every non-numeric, non-date column that has gaps (the tag_* columns as
# well as subject/body/answer). Numeric and date columns are deliberately excluded:
# writing a string into them would turn the column into mixed objects and break
# calculations such as the mean of Resolution_Time below.
text_columns = df.select_dtypes(
    exclude=['number', 'datetime', 'datetimetz', 'timedelta']
).columns
df = df.fillna({column: 'Not Categorized' for column in text_columns})

# 3. Quick stats: what is the average time it takes to fix a problem?
avg_time = df['Resolution_Time'].mean()
print(f"\nAverage Resolution Time: {avg_time:.2f} days")

# 4. Count tickets per country.
print("\n--- Tickets by Country ---")
print(df['Country'].value_counts())

--- Missing Values Count ---
subject              1461
body                    2
answer                  4
type                    0
queue                   0
priority                0
language                0
tag_1                   0
tag_2                  46
tag_3                  95
tag_4                1539
tag_5                6909
tag_6               12649
tag_7               16072
tag_8               18093
Date                    0
Resolution_Date         0
Similarity_Level        0
Country                 0
Resolution_Time         0
dtype: int64

Average Resolution Time: 2.99 days

--- Tickets by Country ---
Country
Japan      3403
France     3375
Germany    3354
USA        3317
India      3292
UK         3259
Name: count, dtype: int64
