In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Short dataset identifier; reused below to build the output folder and file name.
dataset_name = "churn"

In [3]:
# Where the raw CSV is read from.
input_dir, inp_fname = './raw/', 'churn.csv'

# Where the processed CSV is written: a per-dataset folder holding `<dataset_name>.csv`.
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')


## Read Data

In [4]:
# Load the raw CSV into a DataFrame and preview the first rows.
raw_csv_path = os.path.join(input_dir, inp_fname)
data = pd.read_csv(raw_csv_path)
data.head()

Unnamed: 0,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,class
0,16.0,128.0,b'415',2845.0,b'0',b'1',25.0,265.1,110.0,45.07,...,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,b'1',b'0'
1,35.0,107.0,b'415',2301.0,b'0',b'1',26.0,161.6,123.0,27.47,...,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,b'1',b'0'
2,31.0,137.0,b'415',1616.0,b'0',b'0',0.0,243.4,114.0,41.38,...,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,b'0',b'0'
3,35.0,84.0,b'408',2510.0,b'1',b'0',0.0,299.4,71.0,50.9,...,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,b'2',b'0'
4,36.0,75.0,b'415',155.0,b'1',b'0',0.0,166.7,113.0,28.34,...,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,b'3',b'0'


In [5]:
# Column names used below: the row identifier and the prediction target.
id_col, target_col = "id", "class"

## Insert Id Column

In [6]:
# Prepend a sequential row id (0..N-1) unless the frame already has an id column.
if id_col not in data.columns:
    row_ids = np.arange(len(data))
    data.insert(loc=0, column=id_col, value=row_ids)
    print(data.head())


   id  state  account_length area_code  phone_number international_plan  \
0   0   16.0           128.0    b'415'        2845.0               b'0'   
1   1   35.0           107.0    b'415'        2301.0               b'0'   
2   2   31.0           137.0    b'415'        1616.0               b'0'   
3   3   35.0            84.0    b'408'        2510.0               b'1'   
4   4   36.0            75.0    b'415'         155.0               b'1'   

  voice_mail_plan  number_vmail_messages  total_day_minutes  total_day_calls  \
0            b'1'                   25.0              265.1            110.0   
1            b'1'                   26.0              161.6            123.0   
2            b'0'                    0.0              243.4            114.0   
3            b'0'                    0.0              299.4             71.0   
4            b'0'                    0.0              166.7            113.0   

   ...  total_eve_calls  total_eve_charge  total_night_minutes  \
0 

## Convert byte strings to strings

In [7]:
# Object-dtype columns are the ones whose values show up as "b'...'" text in the preview above.
object_frame = data.select_dtypes(include=['O'])
byte_string_columns = object_frame.columns
list(byte_string_columns)

['area_code',
 'international_plan',
 'voice_mail_plan',
 'number_customer_service_calls',
 'class']

In [8]:
import ast
def convert_byte_string_repr(entry):
    """Turn the text form of a bytes literal (e.g. ``"b'415'"``) into a plain string.

    Parameters
    ----------
    entry : object
        A single cell value. Only ``str`` values shaped like a bytes literal
        (``b'...'`` or ``b"..."``) are converted.

    Returns
    -------
    object
        The UTF-8 decoded text when ``entry`` is a valid bytes-literal
        representation; otherwise ``entry`` itself, unchanged. Conversion is
        best effort and never raises for malformed input.
    """
    if not isinstance(entry, str):
        return entry

    # repr(bytes) switches to double quotes when the payload contains a
    # single quote, so accept both quoting styles.
    looks_like_bytes = (
        len(entry) >= 3
        and entry[0] == 'b'
        and entry[1] in ('"', "'")
        and entry[-1] == entry[1]
    )
    if not looks_like_bytes:
        return entry

    try:
        byte_value = ast.literal_eval(entry)
    except (ValueError, SyntaxError):
        # Malformed literal: keep the original text.
        return entry

    # literal_eval may yield something other than bytes (e.g. a tuple for
    # "b'a', b'b'"); leave such entries untouched instead of failing on .decode.
    if not isinstance(byte_value, bytes):
        return entry

    try:
        return byte_value.decode('utf-8')
    except UnicodeDecodeError:
        # Payload is not valid UTF-8: keep the original text.
        return entry

# DataFrame.applymap is deprecated since pandas 2.1; mapping each column with
# Series.map performs the same element-wise conversion on both old and new pandas.
# The result is then cast to str, as before.
data[byte_string_columns] = (
    data[byte_string_columns]
    .apply(lambda column: column.map(convert_byte_string_repr))
    .astype(str)
)
data.head()

Unnamed: 0,id,state,account_length,area_code,phone_number,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,...,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,class
0,0,16.0,128.0,415,2845.0,0,1,25.0,265.1,110.0,...,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1,0
1,1,35.0,107.0,415,2301.0,0,1,26.0,161.6,123.0,...,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,1,0
2,2,31.0,137.0,415,1616.0,0,0,0.0,243.4,114.0,...,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,0,0
3,3,35.0,84.0,408,2510.0,1,0,0.0,299.4,71.0,...,88.0,5.26,196.9,89.0,8.86,6.6,7.0,1.78,2,0
4,4,36.0,75.0,415,155.0,1,0,0.0,166.7,113.0,...,122.0,12.61,186.9,121.0,8.41,10.1,3.0,2.73,3,0


## Drop uninformative columns

In [9]:
# Columns with a single distinct value (nunique ignores NaN by default) are dropped.
unuseful_columns = [col for col in data.columns if data[col].nunique() == 1]

# phone_number is dropped explicitly, regardless of its cardinality. Only add it
# when it is still present and not already listed, so the cell can be re-run
# after the in-place drop without raising KeyError.
if 'phone_number' in data.columns and 'phone_number' not in unuseful_columns:
    unuseful_columns.append('phone_number')

print(unuseful_columns)
data.drop(columns=unuseful_columns, inplace=True)

['phone_number']


## Convert ? to NaN

In [10]:
# Cells that are exactly '?' are treated as missing values.
data.replace(to_replace='?', value=np.nan, inplace=True)

## Save Main Data File

In [11]:
# to_csv does not create missing directories, so make sure the destination
# folder exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)
data.to_csv(outp_fname, index=False)