In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from datetime import datetime

In [None]:
# Load the raw incident event log from the Kaggle input directory.
df = pd.read_csv('../input/incident-response-log/incident_event_log.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Summary statistics for the frame.
df.describe()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Snapshot of the column index at load time; not referenced again in the cells shown here.
columns = df.columns

## replacing the '?' values with np.nan

In [None]:
# Treat the '?' placeholder as a genuine missing value.
df = df.replace('?', np.nan)


In [None]:
# Count missing values per column now that '?' has been mapped to NaN.
df.isna().sum()

# functions to use later

In [None]:
def basic_data(column):
    """Print a quick profile of a pandas Series.

    Three lines are written to stdout: the number of distinct non-null
    values, the array of distinct values, and the count of missing values.

    Parameters
    ----------
    column : pandas.Series
        The column to summarise.

    Returns
    -------
    None
    """
    # Each entry pairs the printed label with the callable producing its value,
    # so values are computed one at a time, right before they are printed.
    reports = (
        ('nunique: ', column.nunique),
        ('\nunique: ', column.unique),
        ('\nNull values: ', lambda: column.isna().sum()),
    )
    for label, compute in reports:
        print(label, compute())

## Changing the datatype for the opening and closing columns

In [None]:
# Parse the opening and closing columns into datetimes so they can be subtracted.
# NOTE(review): no explicit format is given, so each string is parsed by inference.
# If the raw log stores dates day-first (dd/mm/yyyy), ambiguous values could be read
# month-first — confirm against df.head() before trusting any derived durations.
columns_to_change_to_datetime = ['closed_at', 'opened_at' ]
df = df.astype({name: 'datetime64[ns]' for name in columns_to_change_to_datetime})

In [None]:
# Inspect the dtypes to confirm closed_at and opened_at are now datetimes.
df.dtypes

In [None]:
# Ticket lifetime in seconds: closing timestamp minus opening timestamp.
df['ticket_life'] = df['closed_at'].sub(df['opened_at']).dt.total_seconds()

## Tickets that were opened after their closing date

In [None]:
# Rows whose opening timestamp is later than their closing timestamp,
# i.e. rows with a negative ticket_life.
negative_life = df.loc[df['ticket_life'].lt(0)]
negative_life.shape

In [None]:
# Share of negative-life rows, as a percentage of all rows.
perc = len(negative_life) / len(df) * 100
perc

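38.6% of tickets have a negative life. We will handle these tickets separately and not use them in model building, as they might affect the performance of the model. Note that a share this large may also point to a day/month mix-up when the timestamps were parsed, so the raw date format should be verified before treating these rows as invalid.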

In [None]:
# Preview the first negative-life rows.
negative_life.head()

## Negative life tickets with reopen count less than 1

In [None]:
# Negative-life rows whose reopen_count is below 1 (first rows only).
negative_life[negative_life['reopen_count']<1].head()

In [None]:
# Shape of the same subset: negative life and reopen_count below 1.
negative_life[negative_life['reopen_count']<1].shape

Tickets with a negative life also appear among those that were never reopened, and reopening a ticket does not necessarily change its open date. Hence reopening is not the reason for the negative life of a ticket.

In [None]:
# Negative-life rows with a reassignment_count greater than 1.
negative_life[negative_life.reassignment_count > 1].shape

In [None]:
# Rows of the full frame with a reassignment_count below 1.
df[df.reassignment_count<1].shape

In [None]:
# df[df.number.sum()>0].shape

In [None]:
# Number of entries in the `number` column (one per row).
len(df.number)

In [None]:
# Number of distinct values in the `number` column.
df.number.nunique()

In [None]:
# Show every row that shares the `number` value found at index label 0.
df.groupby(by='number').get_group(df.number[0])

# Genuine Tickets

In [None]:
# Keep only rows with a strictly positive lifetime (rows with ticket_life of 0 or NaN
# are excluded too). The explicit .copy() makes df an independent frame, so the column
# assignments in later cells write to df itself and do not raise SettingWithCopyWarning.
df = df[df.ticket_life>0].copy()

In [None]:
# Shape after keeping only positive-life rows.
df.shape

In [None]:
# One random row as a spot check.
df.sample()

In [None]:
# Percentage of missing values per column.
print('percentage of missing values'.upper())
df.isna().sum()/df.shape[0]*100

## Dropping columns with more than 90% missing values

In [None]:
# Columns to remove from the frame.
drop = ['caused_by', 'rfc', 'vendor','cmdb_ci','problem_id']
df = df.drop(columns=drop)

In [None]:
# Confirm the column count after the drop.
df.shape

## Fillna

### Avg time b/w open time and sys_created_at

In [None]:
# Timestamps needed for the gap estimate: keep only complete rows and
# cast both columns to datetimes.
temp = (
    df[['opened_at', 'sys_created_at']]
    .dropna()
    .astype('datetime64[ns]')
)

In [None]:
# Delay between the opening timestamp and the system creation timestamp, per row.
temp = temp.assign(gap=temp['sys_created_at'] - temp['opened_at'])

In [None]:
import datetime

# Average only the strictly positive gaps (creation recorded after opening).
zero_gap = datetime.timedelta(0)
positive_gaps = temp.loc[temp['gap'] > zero_gap, 'gap']
temp_mean = positive_gaps.mean()
temp_mean

#### Adding mean obtained above to the opened time to fill the null values in the sys_created_at

In [None]:
from datetime import timedelta

# Estimated creation time = opening time + the mean positive gap held in temp_mean.
# Using the computed value (instead of a duration re-typed by hand from an earlier
# run's output) keeps the estimate in sync with the data on every run.
df['sys_created + temp_mean'] = df.opened_at + temp_mean

In [None]:
# Fill missing creation timestamps with the estimate (opened_at + mean gap).
# The result is assigned back explicitly: `df.sys_created_at.fillna(..., inplace=True)`
# acts on an intermediate Series and leaves df untouched under pandas Copy-on-Write.
df['sys_created_at'] = df['sys_created_at'].fillna(df['sys_created + temp_mean'])

### Avg time between opening and resolved time

In [None]:
# Timestamps needed to estimate how long tickets take to resolve.
temp2 = df[['opened_at', 'resolved_at']]

In [None]:
# Keep only rows where both timestamps are present.
temp2a = temp2.dropna()

In [None]:
# resolved_at is cast to datetime here; opened_at was converted in an earlier cell.
temp2a.resolved_at = temp2a.resolved_at.astype('datetime64[ns]')

In [None]:
# Open-to-resolve duration per row.
temp2a['new'] = temp2a.resolved_at - temp2a.opened_at

In [None]:
# Mean open-to-resolve duration (displayed only; not stored in a variable).
temp2a.new.mean()

#### Adding the mean obtained above to the open time to fill the null values in the resolved at column

In [None]:
import datetime

# Estimated resolution time = opening time + mean open-to-resolve duration.
# The mean is read from temp2a['new'] instead of being re-typed from printed output,
# so it always matches the data of the current run.
df['new2'] = df['opened_at'] + temp2a['new'].mean()

In [None]:
# Fill missing resolution timestamps with the estimate stored in `new2`.
# Assigning the result back avoids the chained `inplace=True` call, which works on a
# temporary Series and is not guaranteed to modify df (it never does under Copy-on-Write).
df['resolved_at'] = df['resolved_at'].fillna(df['new2'])

In [None]:
# Remaining missing values per column after filling the timestamp columns.
df.isna().sum()

Now that the null values in the timestamp columns are replaced, we fill the remaining columns and then move ahead with encoding the features.

### Filling null values in other columns

#### Creating a new class for the missing values

In [None]:
# Give each column its own explicit placeholder class for missing values.
# (problem_id is not listed: that column was dropped from df in an earlier cell.)
missing_value_labels = {
    'caller_id': 'caller 4340',            # caller id
    'opened_by': 'Opened by 4341',         # opened by
    'sys_created_by': 'Created by 4342',   # sys created by
    'location': 'Location 4343',           # location
    'category': 'Category 4344',           # category
    'subcategory': 'Subcategory 4345',     # subcategory
    'u_symptom': 'Symptom 4346',           # symptoms
    'assignment_group': 'Group 4347',      # assignment group
    'assigned_to': 'Resolver 4348',        # resolver
}

# Assign each filled Series back to the frame: calling fillna(..., inplace=True) on
# df[col] operates on a temporary object and is not guaranteed to update df.
for column_name, placeholder in missing_value_labels.items():
    df[column_name] = df[column_name].fillna(placeholder)

#### Filling null values of a few columns with mode

In [None]:
# Impute the remaining gaps with each column's most frequent value (mode).
# The filled Series is assigned back so df is updated regardless of whether pandas
# hands out a view or a copy for df[col].
for column_name in ('closed_code', 'resolved_by'):
    most_frequent = df[column_name].mode()[0]
    df[column_name] = df[column_name].fillna(most_frequent)

In [None]:
# Verify the missing-value counts after the fills.
df.isna().sum()

## Encoding the features

Using Label encoder to encode the Nominal features

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used for the nominal columns.
le = LabelEncoder()
le

In [None]:
# Nominal columns that will be label-encoded (includes `number`).
columns_to_be_label_encoded = ['active',
                         'made_sla',
                         'contact_type',
                         'knowledge',
                         'u_priority_confirmation',
                         'notify',
                         'incident_state',
                         'caller_id',
                         'opened_by',
                         'sys_created_by',
                         'sys_updated_by',
                         'location',
                         'category',
                         'subcategory',
                         'u_symptom',
                         'assignment_group',
                         'assigned_to',
                         'closed_code',
                         'resolved_by',
                         'number']

In [None]:
# Label-encode every nominal column and keep what is needed to decode it later.
# A fresh encoder is fitted per column and stored, together with its class -> code
# mapping; refitting one shared encoder would leave only the last column decodable.
label_encoders = {}
label_mappings = {}
for i in columns_to_be_label_encoded:
    le = LabelEncoder()
    df[i] = le.fit_transform(df[i])
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    label_encoders[i] = le
    label_mappings[i] = le_name_mapping
    # Print the mapping so the encoding of each column can be read off the output.
    print(i.upper(),'\n', le_name_mapping)
    print('*'*150)

In [None]:
# Check the dtypes after label encoding.
df.dtypes

### using ordinal Encoding for the ordinal Data 

In [None]:
columns_to_be_ordinal_encoded = ['impact','urgency','priority']
from sklearn.preprocessing import OrdinalEncoder
oe = OrdinalEncoder()

# The encoder expects 2-D input, so each column is passed as an (n_rows, 1) array
# and the encoded result is written back over the original column.
for ordinal_col in columns_to_be_ordinal_encoded:
    column_2d = df[[ordinal_col]].to_numpy()
    df[ordinal_col] = oe.fit_transform(column_2d)

In [None]:
# Check the dtypes after ordinal encoding.
df.dtypes

## Converting the datatype of the datetime columns

In [None]:
# Timestamp columns still to be cast to datetime; opened_at and closed_at were
# converted in an earlier cell.
columns_for_obj_to_datetime = ['sys_created_at',
                              'sys_updated_at',
                              'resolved_at']

In [None]:
# Cast the listed timestamp columns to datetime in a single call.
df = df.astype({name: 'datetime64[ns]' for name in columns_for_obj_to_datetime})

In [None]:
# Final dtype check before handing the frame over for analysis.
df.dtypes

Now the data is ready for analysis. We perform analysis in the next notebook