In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show every column when a DataFrame is rendered (None removes the column cap)
pd.set_option("display.max_columns", None)

In [3]:
# Load the raw dataset; the path is relative, so the CSV must sit in the notebook's working directory
df = pd.read_csv("swan_data.csv")

In [4]:
def data_cleaning(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, numerically encoded copy of the raw customer table.

    The input frame is never modified. Steps, in order:

    1. Move ``CustomerID`` into the index.
    2. Convert ``Total Charges`` to float, treating blank cells as 0.
    3. Drop ``Count``, ``Country``, ``State``, ``City``, ``Zip Code``,
       ``Lat Long``, ``Churn Label`` and ``Churn Reason``.
    4. Encode every remaining column that contains both "Yes" and "No" as 1/0.
    5. Encode ``Gender`` (Male=0, Female=1) and ``Contract`` (term in years).
    6. One-hot encode ``Internet Service`` (prefix ``is``) and
       ``Payment Method`` (prefix ``pay``), dropping the first level of each.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``CustomerID``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If ``Total Charges`` holds non-blank text that is not a valid number.
    """
    # Work on a copy so the caller's frame is left untouched
    df = df.copy()

    # Using CustomerID as index
    df = df.set_index("CustomerID")

    # Fixing Total Charges: only cells that are entirely blank become "0".
    # Replacing every space (the previous approach) would also rewrite spaces
    # inside a value, and the .str accessor fails when the column was already
    # parsed as numeric. Non-string cells pass through unchanged.
    df["Total Charges"] = (
        df["Total Charges"]
        .map(lambda cell: "0" if isinstance(cell, str) and not cell.strip() else cell)
        .astype(float)
    )

    # Dropping unnecessary columns
    df = df.drop(
        columns=[
            "Count",
            "Country",
            "State",
            "City",
            "Zip Code",
            "Lat Long",
            "Churn Label",
            "Churn Reason",
        ]
    )

    # Mapping columns to numeric values.
    # General case, where values are "Yes" and "No". Results like
    # "No phone service" are mapped to 0 since the lack of phone service is
    # contained in a different column. Any value missing from the mapper
    # becomes NaN.
    general_mapper = {"No": 0, "Yes": 1, "No phone service": 0, "No internet service": 0}
    for col in df.columns:
        # Set membership avoids comparing strings against numeric arrays
        present = set(df[col].dropna().unique())
        if "No" in present and "Yes" in present:
            df[col] = df[col].map(general_mapper)

    # Mapping male to 0 and female to 1
    gender_mapper = {"Male": 0, "Female": 1}
    df["Gender"] = df["Gender"].map(gender_mapper)

    # Mapping contract lengths into the length of the contract term in years
    contract_mapper = {"Month-to-month": 1/12, "Two year": 2, "One year": 1}
    df["Contract"] = df["Contract"].map(contract_mapper)

    # One Hot Encoding the internet service and payment method columns
    df = pd.get_dummies(df, columns=["Internet Service"], dtype=int, prefix="is", drop_first=True)
    df = pd.get_dummies(df, columns=["Payment Method"], dtype=int, prefix="pay", drop_first=True)

    return df


In [6]:
# Replace the raw frame with its cleaned version.
# NOTE: this rebinding makes the cell non-re-runnable: a second run fails in data_cleaning's
# set_index("CustomerID") because CustomerID is already the index; re-run the load cell first.
df = data_cleaning(df)

In [7]:
# Preview the first rows of the cleaned frame to sanity-check the encodings
df.head()

Unnamed: 0_level_0,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Monthly Charges,Total Charges,Churn Value,is_Fiber optic,is_No,pay_Credit card (automatic),pay_Electronic check,pay_Mailed check
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
3668-QPYBK,33.964131,-118.272783,0,0,0,0,2,1,0,1,1,0,0,0,0,0.083333,1,53.85,108.15,1,0,0,0,0,1
9237-HQITU,34.059281,-118.30742,1,0,0,1,2,1,0,0,0,0,0,0,0,0.083333,1,70.7,151.65,1,1,0,0,1,0
9305-CDSKC,34.048013,-118.293953,1,0,0,1,8,1,1,0,0,1,0,1,1,0.083333,1,99.65,820.5,1,1,0,0,1,0
7892-POOKP,34.062125,-118.315709,1,0,1,1,28,1,1,0,0,1,1,1,1,0.083333,1,104.8,3046.05,1,1,0,0,1,0
0280-XJGEX,34.039224,-118.266293,0,0,0,1,49,1,1,0,1,1,0,1,1,0.083333,1,103.7,5036.3,1,1,0,0,0,0
