In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option("display.max_columns", None)

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("swan_data.csv")

In [4]:
def data_cleaning(df):
    """Return a cleaned, numerically encoded copy of the raw customer dataset.

    Steps, in order:
      1. Copy the input so the caller's frame is never modified.
      2. Use "CustomerID" as the index.
      3. Convert "Total Charges" to float, treating blank cells as 0.
      4. Drop identifier, location and label-text columns.
      5. Map every column containing both "Yes" and "No" to 1/0.
      6. Encode "Gender" (Male=0, Female=1) and "Contract" (term in years).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least "CustomerID", "Total Charges",
        "Gender", "Contract" and the columns dropped in step 4.

    Returns
    -------
    pandas.DataFrame
        New frame indexed by CustomerID.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If "Total Charges" holds a non-blank value that is not a number.
    """
    # Making a copy of the dataset
    df = df.copy()

    # Using CustomerID as index
    df = df.set_index("CustomerID")

    # Fixing Total Charges column - inserting zeroes for blank cells and casting to float.
    # Only cells that are empty after stripping whitespace become "0"; replacing every
    # space with "0" would corrupt padded values ("29 " -> "290").
    total_charges = df["Total Charges"]
    if not pd.api.types.is_numeric_dtype(total_charges):
        stripped = total_charges.str.strip()
        # Missing values compare unequal to "" and are therefore left untouched.
        total_charges = stripped.where(stripped != "", "0")
    df["Total Charges"] = pd.to_numeric(total_charges).astype(float)

    # Dropping unnecessary columns
    df = df.drop(columns=["Count", "Country", "State", "City", "Zip Code",
                          "Lat Long", "Churn Label", "Churn Reason"])

    # Mapping columns to numeric values
    # General case, where values are "Yes" and "No". Results like "No phone service" are
    # mapped to 0 since the lack of phone service is contained in a different column.
    general_mapper = {"No": 0, "Yes": 1, "No phone service": 0, "No internet service": 0}
    for col in df.columns:
        # Build the set once per column; membership tests on a set also avoid
        # elementwise string-vs-number comparisons on numeric arrays.
        values = set(df[col].unique())
        if "No" in values and "Yes" in values:
            df[col] = df[col].map(general_mapper)

    # Mapping male to 0 and female to 1
    gender_mapper = {"Male": 0, "Female": 1}
    df["Gender"] = df["Gender"].map(gender_mapper)

    # Mapping contract lengths into a value representing the length of the contract term in years
    contract_mapper = {"Month-to-month": 1/12, "Two year": 2, "One year": 1}
    df["Contract"] = df["Contract"].map(contract_mapper)

    return df


In [5]:
def min_max_scaling(df, train, col):
    """Min-max scale ``df[col]`` using the minimum and maximum of ``train[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column is scaled. ``df[col]`` is overwritten in place.
    train : pandas.DataFrame
        Frame the minimum and maximum are taken from.
    col : str
        Name of the column to scale.

    Returns
    -------
    pandas.Series
        The scaled column, ``(df[col] - min) / (max - min)``. Values outside
        the training range fall outside [0, 1].
    """
    minimum = train[col].min()  # Min of train
    maximum = train[col].max()  # Max of train

    # A constant training column has zero range; dividing by it would produce
    # NaN/inf. Use a unit denominator instead so such a column maps to
    # (value - minimum), i.e. 0 for every training row.
    value_range = maximum - minimum
    if value_range == 0:
        value_range = 1

    df[col] = (df[col] - minimum) / value_range  # Applying scaling
    return df[col]  # Return scaled column


In [6]:
def feature_engineering(df, train):
    """Scale the numeric charge/tenure columns and one-hot encode two categoricals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified; a transformed copy is returned.
    train : pandas.DataFrame
        Frame supplying the min/max used for scaling. It must still hold the
        unscaled values, so pass the original training frame rather than the
        output of an earlier call.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with 'Monthly Charges', 'Total Charges' and
        'Tenure Months' min-max scaled, and 'Payment Method' and
        'Internet Service' replaced by integer dummy columns prefixed
        'pay' and 'is' (first category of each dropped).

    Notes
    -----
    NOTE(review): dummy columns are derived from the categories present in
    ``df`` itself, so a frame that lacks a category will presumably produce a
    different column set - confirm train/test columns line up.
    """
    # Columns to scale
    scale_cols = ['Monthly Charges', 'Total Charges', 'Tenure Months']

    # Work on a copy. Scaling writes into the frame, and when the caller passes
    # the training frame as `df` an in-place write would leave it already scaled,
    # so a later call for the test frame would read min=0 / max=1 from it.
    df = df.copy()

    # Apply min-max scaling to columns listed above
    for col in scale_cols:
        df[col] = min_max_scaling(df, train, col)

    # OHE Payment Method and Internet Service columns
    df = pd.get_dummies(df,
                        columns=['Payment Method', 'Internet Service'],
                        prefix=['pay', 'is'],
                        drop_first=True,
                        dtype=int)

    return df

In [7]:
# Clean and encode the raw frame. The result replaces `df`, so re-running this cell
# without reloading the CSV fails: CustomerID is already the index by then.
df = data_cleaning(df)

In [8]:
# Scale and one-hot encode. The full frame is passed as both arguments here, so the
# scaling min/max come from every row.
# TODO: once the data is split, call with (X_train, X_train) or (X_test, X_train) instead.
df = feature_engineering(df, df)

In [9]:
# Preview the first rows of the cleaned and engineered frame.
df.head()

Unnamed: 0_level_0,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,Tenure Months,Phone Service,Multiple Lines,Online Security,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Monthly Charges,Total Charges,Churn Value,pay_Credit card (automatic),pay_Electronic check,pay_Mailed check,is_Fiber optic,is_No
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
3668-QPYBK,33.964131,-118.272783,0,0,0,0,0.027778,1,0,1,1,0,0,0,0,0.083333,1,0.354229,0.012453,1,0,0,1,0,0
9237-HQITU,34.059281,-118.30742,1,0,0,1,0.027778,1,0,0,0,0,0,0,0,0.083333,1,0.521891,0.017462,1,0,1,0,1,0
9305-CDSKC,34.048013,-118.293953,1,0,0,1,0.111111,1,1,0,0,1,0,1,1,0.083333,1,0.80995,0.094475,1,0,1,0,1,0
7892-POOKP,34.062125,-118.315709,1,0,1,1,0.388889,1,1,0,0,1,1,1,1,0.083333,1,0.861194,0.350733,1,0,1,0,1,0
0280-XJGEX,34.039224,-118.266293,0,0,0,1,0.680556,1,1,0,1,1,0,1,1,0.083333,1,0.850249,0.579898,1,0,0,0,1,0


In [86]:
def apr(y_pred, y_real):
    """Print and return accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predictions come first, true labels second. Each
    scorer is called as ``scorer(y_real, y_pred)``.

    Relies on a module-level name ``metrics`` providing ``accuracy_score``,
    ``precision_score``, ``recall_score`` and ``f1_score`` (presumably
    ``sklearn.metrics`` - the import is not visible in this section; confirm).

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_real : array-like
        True labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``.
    """
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )

    # Compute every score before printing anything.
    scores = tuple(scorer(y_real, y_pred) for scorer in scorers)
    accuracy, precision, recall, f1 = scores

    report = (
        f"Accuracy:{accuracy}",
        f"Precision:{precision}",
        f"Recall:{recall}",
        f"F1:{f1}",
    )
    for line in report:
        print(line)

    return scores