# ***Simple Data Cleaning***

### Process Highlights:
- Customer Data: corrupted company size buckets repaired, contract dates converted to datetime, missing `industry` values replaced with "Unknown"
- Usage Logs: missing European September logs filled with August observations (forward filling)
- Support Tickets: creation timestamps converted to datetime

### Imports

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

### Load Data

In [2]:
# Read the four raw CSV inputs in a fixed order: customers, the two usage-log
# files (named for q1/q2 and q3/q4), and tickets.
df_customers, df_usage_q1q2, df_usage_q3q4, df_tickets = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/dataset1.csv',
        '../data/dataset2a for q1 q2.csv',
        '../data/dataset2b for q3 q4.csv',
        '../data/dataset3.csv',
    )
)


### Customer Data Cleaning

In [3]:
# Repair company-size buckets that were stored as date-like strings.
# NOTE(review): '10-Jan' / 'Nov-50' look like spreadsheet auto-formatting of
# '1-10' / '11-50' -- confirm against the raw export.
size_map = {'10-Jan': '1-10', 'Nov-50': '11-50'}
repaired_buckets = df_customers['company_size_bucket'].replace(to_replace=size_map)
df_customers['company_size_bucket'] = repaired_buckets

# Label customers with no recorded industry explicitly instead of leaving NaN
df_customers['industry'] = df_customers['industry'].fillna(value='Unknown')

# Parse both contract date columns into pandas datetimes
for date_col in ('contract_start_date', 'contract_end_date'):
    df_customers[date_col] = pd.to_datetime(df_customers[date_col])

### Usage Logs Cleaning

In [4]:
# Merge the two usage-log files into one frame with a fresh 0..n-1 index
df_usage = pd.concat([df_usage_q1q2, df_usage_q3q4], ignore_index=True)

# Convert dates to datetime so the sort below is chronological, not lexical
df_usage['date'] = pd.to_datetime(df_usage['date'])

# Sort by customer, then date, so each customer's rows are contiguous and in
# time order; a forward fill then carries the latest earlier observation forward.
# NOTE(review): the notebook attributes the gaps to corrupted EU data -- confirm.
df_usage = df_usage.sort_values(['customer_id', 'date'])

# Forward fill missing usage data *within each customer only*.
# A plain Series.ffill() on the sorted frame would copy the previous customer's
# last values into a customer whose earliest rows are missing. Grouping by
# customer_id stops the fill at customer boundaries; gaps with no earlier
# observation for that customer stay NaN rather than being fabricated.
# dropna=False keeps rows with a missing customer_id in a group of their own,
# so their existing values are not overwritten on assignment.
usage_cols = ['logins', 'feature_events', 'session_minutes']
df_usage[usage_cols] = (
    df_usage.groupby('customer_id', dropna=False)[usage_cols].ffill()
)

### Support Tickets Cleaning

In [5]:
# Parse the raw ticket creation values into pandas datetimes
raw_created_at = df_tickets['created_at']
df_tickets['created_at'] = pd.to_datetime(raw_created_at)

### Save Cleaned Data

In [6]:
# Write each cleaned frame to CSV without the index column, in the order
# customers, usage logs, tickets.
for cleaned_frame, out_path in (
    (df_customers, '../data/simple_cleaned_customer_data.csv'),
    (df_usage, '../data/simple_cleaned_usage_logs.csv'),
    (df_tickets, '../data/simple_cleaned_tickets.csv'),
):
    cleaned_frame.to_csv(out_path, index=False)