In [20]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport 

### DATA

In [21]:
fp = '../local-data/input/train.csv'
df = pd.read_csv(fp,
                 dtype=str,
                 encoding='ISO-8859-1', # to prevent unicode error
                 keep_default_na=True,
                 dayfirst=True,
#                  parse_dates = date_cols
                )
df.shape

(891, 12)

In [22]:
data = df.copy()
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [23]:
missing = df.copy().isna().sum()
(missing.reset_index(name='cnt')
 .sort_values(by='cnt', ascending=True))

Unnamed: 0,index,cnt
0,PassengerId,0
1,Survived,0
2,Pclass,0
3,Name,0
4,Sex,0
6,SibSp,0
7,Parch,0
8,Ticket,0
9,Fare,0
11,Embarked,2


#### Replace field that's entirely space (or empty) with NaN

In [24]:
# data = data.replace(r'^\s*$', np.nan, regex=True)

#### Convert column with digits to string type

In [25]:
# data['u_alternative_telephone_no'] = data['u_alternative_telephone_no'].astype('str')

#### Create sample dataset

In [26]:
# samp = data.sample(10000)
# output_path = '../local-data/samples/incidents_all_ready_for_profiling_SAMPLE.csv'
# samp.to_csv(output_path, index=True)

#### Inspect dates

In [27]:
# min_date = min(data['closed_at'])
# max_date = max(data['closed_at'])
# print('min date:{}'.format(min_date))
# print('max date:{}'.format(max_date))

#### Drop zero variance columns

In [28]:
# get the count of unique values for each column
nunique = data.apply(pd.Series.nunique)

# drop columns that have only one unique value (zero variance)
zero_var_cols = nunique[nunique == 1].index
data = data.drop(zero_var_cols, axis=1)
data.shape

(891, 12)

In [29]:
print(sorted(set(zero_var_cols)), end='')

[]

#### Drop cols with all missing values

In [30]:
df[df.columns[df.isnull().mean() < 0.8]]
df.columns[df.isnull().mean() > 0.1]

Index(['Age', 'Cabin'], dtype='object')

In [31]:
print(sorted(set(empty_cols)), end='')

['Age', 'Cabin', 'Embarked']

#### View retained columns list

In [32]:
print(len(data.columns))
print(sorted(set(data.columns)), end='')

12
['Age', 'Cabin', 'Embarked', 'Fare', 'Name', 'Parch', 'PassengerId', 'Pclass', 'Sex', 'SibSp', 'Survived', 'Ticket']

In [19]:
# output_path = "../local-data/output/incidents_all_ready_for_profiling.csv"
# data.to_csv(output_path, index=True)

#### Profile the data

In [34]:
profile = ProfileReport(data,
                        minimal=True,
                        title='Data Profiling Report', 
                        html={'style':{'full_width':True}}) 
# profile.to_widgets()
profile.set_variable("html.minify_html", False)
profile.to_file("../local-data/profiling_reports/profiling.html")

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=21.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




### Misc.

In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 354747 entries, INC1021545 to INC1447567
Columns: 119 entries, u_opened_date to u_perspectium_correlation_id_display
dtypes: datetime64[ns](6), object(113)
memory usage: 324.8+ MB


**Proportions values in columns**

In [14]:
# data['dataCenter'].value_counts(normalize=True) * 100

#### Plotting Incident vs. Service Request count by Country

In [15]:
# # tmp2 = tmp[(tmp.incident_type=='Incident') & (tmp.country=='Brazil')].copy()

# fig, axes = plt.subplots(figsize=(20,10))
# sns.set(context="paper", font="monospace")
# sns.set_style("white")

# plt.title('Title');
# sns.distplot(data.startTimeStamp, kde=False);
# plt.xaxis.set_tick_params(rotation=45)

In [16]:
# tmp2 = tmp[(tmp.incident_type=='Service Request') & (tmp.country=='Brazil')].copy()

# fig, axes = plt.subplots(figsize=(20,10))
# sns.set(context="paper", font="monospace")
# sns.set_style("white")

# plt.title('Title');
# ax = (sns.barplot(x=tmp2.category2, y=tmp2.cnt, color='lightblue')
#     )
# _ = ax.set_xticklabels(ax.get_xticklabels(),rotation=90)

In [17]:
# df.describe().transpose()

In [18]:
# dat.info(verbose=True)

In [19]:
# list(dat['category'].unique())[:5]

In [20]:
# tmp = dat.copy()
# tmp = tmp[tmp.category=='Application/Services']
# tmp['description'].head()

In [21]:
# dat['subcategory'].unique()

In [22]:
# dat['close_description'].unique()

In [23]:
# country breakdown
# df = dat.copy()
# tmp = (df.groupby('country')['incident_id']
#        .count()
#        .reset_index(name='cnt') 
#       )

# tot = tmp.cnt.sum()
# tmp['pct'] = tmp.apply(lambda x: x['cnt'] / tot * 100,
#                       axis=1)
# tmp.sort_values(by='cnt', ascending=False).head()

In [24]:
# dat['bat_5tc_service'].unique()

In [25]:
# df = dat.copy()
# df = df[df.country.isin(['Russian Federation'])]
# tmp = pd.crosstab(df.bat_5tc_service.astype(str),
#                   df.country, 
#                   values=df.incident_id, 
#                   aggfunc=pd.Series.nunique,
#                   margins=True,
#                   dropna=False)#.fillna(value=0)
# tmp.sort_values(by='All', ascending=False).head()

In [26]:
# df = dat.copy()
# df = df[df.country.isin(['Brazil'])]
# tmp = pd.crosstab(df.bat_5tc_service.astype(str),
#                   df.country, 
#                   values=df.incident_id, 
#                   aggfunc=pd.Series.nunique,
#                   margins=True)#.fillna(value=0)
# tmp.sort_values(by='All', ascending=False).head()

In [27]:
# df = dat.copy()
# df = df[df.country.isin(['United Kingdom'])]
# tmp = pd.crosstab(df.bat_5tc_service.astype(str),
#                   df.country, 
#                   values=df.incident_id, 
#                   aggfunc=pd.Series.nunique,
#                   margins=True)#.fillna(value=0)
# tmp.sort_values(by='All', ascending=False).head()

In [28]:
# list(df['short_description'].iloc[:3])

In [29]:
# df = dat.copy()
# df = df[df.country.isin(['Russian Federation'])]
# print('nbr rows w/missing description: {}'.format(df['description'].isna().sum()))
# df = df[~df.description.isna()]
# list(df['description'].iloc[:3])


In [30]:
# list(df['description.1'].iloc[:3])

In [31]:
# list(df['close_description'].iloc[:10])