eda_methods.py
"""
use this as im ported libary
import eda_methods as eda
"""
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
def meta(df, transpose=True):
    """
    This function returns a dataframe that lists, per column:
    - column name
    - absolute number of nulls
    - relative number of nulls (percent)
    - dtype
    - whether duplicate values exist
    - number of different values (nunique)
    """
    metadata = []
    for elem in df.columns:
        # Count null values, absolute and as a percentage
        null = df[elem].isnull().sum()
        rel_null = round(null / df.shape[0] * 100, 2)
        # Determine the data type
        dtype = df[elem].dtype
        # Check for duplicate values
        duplicates = df[elem].duplicated().any()
        # Count the number of unique values
        nuniques = df[elem].nunique()
        # Collect all metadata for this column in a dict
        elem_dict = {
            'varname': elem,
            'nulls': null,
            'percent': rel_null,
            'dtype': dtype,
            'dup': duplicates,
            'nuniques': nuniques
        }
        metadata.append(elem_dict)
    meta = pd.DataFrame(metadata, columns=['varname', 'nulls', 'percent', 'dtype', 'dup', 'nuniques'])
    meta.set_index('varname', inplace=True)
    meta = meta.sort_values(by=['nulls'], ascending=False)
    print(f"Shape: {df.shape}")
    if transpose:
        return meta.transpose()
    return meta
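# Example usage (a minimal sketch; assumes a DataFrame `df` has already
# been loaded, e.g. df = pd.read_csv('data.csv')):
#
#     overview = meta(df, transpose=False)
#     overview.head()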
def data_loss(df_clean, df_raw):
    """
    This function returns the data loss (dropped rows) in percent,
    relative to the raw dataframe.
    """
    loss = (1 - df_clean.shape[0] / df_raw.shape[0]) * 100
    return f"{round(loss, 3)}% data loss"
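# Example usage (a sketch; assumes `df_raw` is loaded and `df_clean` is a
# row-filtered version of it):
#
#     df_clean = df_raw.dropna()
#     data_loss(df_clean, df_raw)   # e.g. "2.145% data loss"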
def describe_plus(df, transpose=True):
    """
    This function returns a dataframe based on the describe() function,
    extended with:
    - skew()
    - kurtosis()
    - variance
    """
    statistics = pd.DataFrame(df.describe())
    # numeric_only avoids errors on non-numeric columns in recent pandas
    skew = df.skew(numeric_only=True)
    kurtosis = df.kurtosis(numeric_only=True)
    variance = df.var(numeric_only=True)
    statistics.loc['skew'] = skew
    statistics.loc['kurtosis'] = kurtosis
    statistics.loc['variance'] = variance
    if transpose:
        return round(statistics.transpose(), 2)
    return round(statistics, 2)
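# Example usage (a sketch; assumes a DataFrame `df` with numeric columns):
#
#     describe_plus(df)   # one row per column, incl. skew/kurtosis/variance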
def correlogram(df):
    """
    This function plots a correlogram (lower-triangle correlation heatmap)
    and returns the rounded correlation matrix.
    """
    # Compute the correlation matrix once and reuse it
    corr = df.corr(numeric_only=True)
    # Mask the upper triangle so each pair is shown only once
    mask = np.triu(np.ones_like(corr, dtype=bool))
    # Plot
    fig, ax = plt.subplots(figsize=(15, 10))
    sns.heatmap(round(corr * 100, 0),
                annot=True,
                mask=mask, cmap="coolwarm", ax=ax)
    return corr.round(2)
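# Example usage (a sketch; assumes a DataFrame `df` of numeric features):
#
#     corr = correlogram(df)
#     plt.show()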
def plot_train_test_split(y, y_train, y_test):
    """
    This function plots the target distributions of the full, training,
    and test sets as overlaid histograms.
    It also returns a dataframe with the absolute counts and the
    relative distribution of each target value.
    """
    # Overlaid histograms of the target in all three sets
    y.plot.hist()
    y_train.plot.hist()
    y_test.plot.hist()
    plt.legend(['all', 'train', 'test'])
    # Dataframe with absolute and relative values
    storage = pd.DataFrame()
    storage['train abs'] = y_train.value_counts()
    storage['train %'] = round(y_train.value_counts() / y_train.shape[0], 2)
    storage['test abs'] = y_test.value_counts()
    storage['test %'] = round(y_test.value_counts() / y_test.shape[0], 2)
    storage['all abs'] = y.value_counts()
    storage['all %'] = round(y.value_counts() / y.shape[0], 2)
    # Print information about the split sizes
    print("Training set has {} samples.".format(y_train.shape[0]))
    print("Testing set has {} samples.".format(y_test.shape[0]))
    return storage
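# Minimal self-test sketch. The synthetic target below is an assumption for
# illustration only; real usage would pass your own target Series. Requires
# scikit-learn for the split.
if __name__ == "__main__":
    from sklearn.model_selection import train_test_split

    # Hypothetical imbalanced binary target (e.g. a churn label)
    rng = np.random.default_rng(42)
    y = pd.Series(rng.binomial(1, 0.25, size=1000), name='target')
    y_train, y_test = train_test_split(y, test_size=0.3, random_state=42,
                                       stratify=y)
    print(plot_train_test_split(y, y_train, y_test))
    plt.show()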