In [1]:
import random
import numpy as np
import pandas as pd
# from scipy.optimize import basinhopping
# from sklearn.metrics import mutual_info_score
from tqdm.notebook import tqdm_notebook
import time

In [2]:
# Put the sibling "../ddf/" directory at the front of the module search path so
# the project modules below resolve from the working tree. The path is relative,
# so this only works when the notebook is run from its own directory.
import sys
sys.path.insert(0, "../ddf/")
import stainer as ST
import DirtyDF as ddf

In [3]:
# Re-execute the stainer module so edits made to stainer.py are picked up
# without restarting the kernel. Only ST is reloaded here; DirtyDF is not.
import importlib
importlib.reload(ST)

<module 'stainer' from '/Users/carelchay/PycharmProjects/dirty_dataframe/ddf/notebooks/../ddf/stainer.py'>

# Telco Example

In [4]:
# Load the Telco customer-churn CSV (path is relative to the notebook's directory).
df = pd.read_csv("../../data/Telco-Customer-Churn.csv")

In [5]:
# pandas does not infer 'category' on load, so cast by hand: every column that
# is not listed below (the ID and the numeric fields) becomes categorical.
non_categorical = ['customerID', 'SeniorCitizen', 'tenure', 'MonthlyCharges',
                   'TotalCharges']
categorical_columns = df.columns.difference(non_categorical)
df[categorical_columns] = df[categorical_columns].astype('category')
    

In [6]:
# Create DDF object wrapping the typed frame.
# NOTE(review): seed=2101 presumably fixes the stainers' randomness for
# reproducibility -- confirm against DirtyDF.
telco_ddf = ddf.DirtyDF(df, seed=2101)

In [7]:
# Show the column-index lists the DDF object exposes for each type group
# (cat_cols, num_cols, dt_cols), one list per line.
for group_attr in ("cat_cols", "num_cols", "dt_cols"):
    print(getattr(telco_ddf, group_attr))

[1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20]
[2, 5, 18, 19]
[]


In [8]:
# Confirm the cast: every column outside the ID/numeric list should now be 'category'.
df.dtypes

customerID            object
gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
Churn               category
dtype: object

In [9]:
# Preview the first five rows of the (unstained) frame.
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


## Stainers used for trial

1. Row Duplication (on `customerID`)
2. Mutual Information (on `Contract` & `Churn`)
3. Inflection (on `PaymentMethod` & `Contract`)
4. Nullify Stainer (on `TotalCharges`)

In [11]:
# Re-create the DDF object from the original frame
telco_ddf = ddf.DirtyDF(df, seed=2101)

## Instantiate Stainers ##

# Resolve column positions by name instead of hard-coding integers, so the
# stainers keep targeting the intended columns if the column order changes.
contract_idx = df.columns.get_loc("Contract")
churn_idx = df.columns.get_loc("Churn")
payment_method_idx = df.columns.get_loc("PaymentMethod")
# The hard-coded index used previously (18) pointed at MonthlyCharges, not the
# TotalCharges column this trial is documented to nullify.
total_charges_idx = df.columns.get_loc("TotalCharges")

# Duplication Stainer
dupli_stainer = ST.RowDuplicateStainer(deg=0.2, max_rep=3)
# Mutual Information (on Contract & Churn)
mut_stainer = ST.CatCorrStainer(col_idx=[contract_idx, churn_idx], max_n=50000, min_inf=0.4)
# Inflection stainer. Need to specify column idxs in list
inflection_stainer = ST.InflectionStainer(num_format=3, col_idx=[contract_idx, payment_method_idx])
# Nullify Stainer (on TotalCharges)
null_stainer = ST.NullifyStainer(deg=0.25, col_idx=[total_charges_idx])

In [12]:
## Add Stainers ##
# The list order matches the order of the entries in the printed history.
stainer_sequence = [dupli_stainer, mut_stainer, inflection_stainer, null_stainer]
telco_ddf = telco_ddf.add_stainers(stainer_sequence)

In [13]:
## Run Stainers ##
# Rebinds telco_ddf to the value returned by run_all_stainers(), so re-running
# this cell without re-running the cells above starts from the already-run object.
telco_ddf=telco_ddf.run_all_stainers()

In [14]:
# Print the per-stainer log (description and time taken) recorded during the run.
telco_ddf.print_history()

1. Add Duplicates 
 Added Duplicate Rows for 1408 rows. 
  Each duplicated row should appear a maximum of 3 times. 
  Rows added: 2110 
 Time taken: 0.08050370216369629 

2. CatCorr 
 Old Mutual Information was 0.1401538385374521, new Mutual Information is 0.28488443942160435 
 Time taken: 3.7666208744049072 

3. Inflection 
 Categorical inflections on:
{'PaymentMethod': {'Electronic check': ['Electronic check', 'electronic check'], 'Mailed check': ['Mailed check', 'mailed check'], 'Bank transfer (automatic)': ['Bank transfer (automatic)', 'bank transfer (automatic)'], 'Credit card (automatic)': ['Credit card (automatic)', 'credit card (automatic)']}, 'Contract': {'Month-to-month': ['Month To Month', 'Month-to-month'], 'One year': ['One Year', 'One year'], 'Two year': ['Two Year', 'Two year']}} 
 Time taken: 0.01322484016418457 

4. Nullify 
 Replaced 2288 values to become empty in specificed rows/cols. 
 Time taken: 0.12383913993835449 



In [15]:
# Pull the resulting frame out of the DDF object for inspection with plain pandas.
stained_df = telco_ddf.get_df()

In [17]:
# Churn x Contract counts after staining (compare with the original frame's table).
stained_churn = stained_df['Churn']
stained_contract = stained_df['Contract']
pd.crosstab(index=stained_churn, columns=stained_contract)

Contract,Month To Month,Month-to-month,One Year,One year,Two Year,Two year
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
No,1309,1292,949,987,1108,1096
Yes,1200,1212,0,0,0,0


In [18]:
# Churn x Contract counts on the original, unstained frame (baseline).
original_churn = df['Churn']
original_contract = df['Contract']
pd.crosstab(index=original_churn, columns=original_contract)

Contract,Month-to-month,One year,Two year
Churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
No,2220,1307,1647
Yes,1655,166,48
