In [13]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
import pickle

def data_preprocessing(dataset:pd.DataFrame):
    df = dataset.copy()
    
    # prepare string variables
    df.columns = df.columns.str.lower().str.replace(' ','_')
    string_columns = list(df.dtypes[df.dtypes=='O'].index)
    for col in string_columns:
        df[col] = df[col].str.lower().str.replace(' ','_')
    
    # make TotalCharges numeric                  
    df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
    df['totalcharges'] = df['totalcharges'].fillna(0)
    
    # make dependent variable numeric
    df.churn = (df.churn == 'yes').astype(int)
    
    # drop duplicates
    df.drop_duplicates(inplace=True)

    return df

def FeatureEngineering(dataset:pd.DataFrame, categorical_variables:list, numerical_variables:list, target_variable:str):
    df = dataset.copy()
    categorical_important = categorical_variables.copy()
    numerical_important = numerical_variables.copy()
    dependent_variable = [target_variable.lower().replace(' ','_')]

    df = df[numerical_important + categorical_important]
    
    # Dummy variables
    #   -read OHE_feature_names_out
    OHE_feature_names_out = []
    datam = open("OHE_feature_names_out.txt", "r")
    for x in datam:
        OHE_feature_names_out.append(x)
    OHE_feature_names_out = list(map(lambda x : x[:-1],OHE_feature_names_out))
    datam.close()

    ohe = pickle.load(open('OneHotEncoder.pkl','rb'))
    df = pd.DataFrame(ohe, columns=OHE_feature_names_out)

    # Feature scaling
    scaler = pickle.load(open('scaler.pkl','rb'))
    df = scaler.transform(df)
    df = pd.DataFrame(df,columns=scaler.feature_names_in_)
    
    #add target variable
    df[dependent_variable] = dataset[dependent_variable]
    return df

In [14]:
df = pd.read_csv('Churn_prediction.csv')

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=666)

df_test.reset_index(inplace=True,drop=True)

categorical_important = ['contract', 'onlinesecurity', 'techsupport', 'internetservice']
numerical_important = ['tenure', 'monthlycharges', 'totalcharges']

In [15]:
df_train = data_preprocessing(df_train)
df_train = FeatureEngineering(df_train,
                              categorical_variables=categorical_important,
                              numerical_variables=numerical_important,
                              target_variable='Churn')

y_train = df_train['churn']
del df_train['churn']

ValueError: Shape of passed values is (7043, 15), indices imply (7043, 16)

In [None]:
df_test = data_preprocessing(df_test)
df_test = FeatureEngineering(df_test,
                              categorical_variables=categorical_important,
                              numerical_variables=numerical_important,
                              target_variable='churn')
y_test = df_test['churn']
del df_test['churn']

# model
from sklearn.linear_model import LogisticRegression
clf_LogisticRegression = LogisticRegression(random_state=0)
clf_LogisticRegression.fit(df_train, y_train)

# clf_GB = GradientBoostingClassifier(n_estimators=20, learning_rate=0.1, max_depth=3)
# clf_GB.fit(df_train, y_train)



pred = clf_LogisticRegression.predict(df_test)

from sklearn.metrics import roc_auc_score
# 0.6610603450428183