In [40]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
import pickle

def data_preprocessing(dataset:pd.DataFrame):
    """Clean a raw churn table and return the cleaned copy.

    Steps, in order:
      1. Column names are lower-cased and spaces become underscores.
      2. ``totalcharges`` (if present) is parsed as numeric; anything that
         cannot be parsed, or is missing, becomes 0.
      3. Every text column is lower-cased and spaces become underscores.
      4. ``churn`` (if present) becomes 1 where the value is ``'yes'`` and 0
         otherwise. Unlabeled data without a ``churn`` column is accepted.
      5. Exact duplicate rows are dropped and the index is renumbered 0..n-1.

    Parameters
    ----------
    dataset : pd.DataFrame
        Raw input table. It is copied, never modified.

    Returns
    -------
    pd.DataFrame
        The cleaned copy with a gap-free 0..n-1 index.
    """
    df = dataset.copy()

    # normalise column names first so every later lookup uses one spelling
    df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

    # make TotalCharges numeric (must happen before the text-column pass below,
    # otherwise the column would still be treated as text)
    if 'totalcharges' in df.columns:
        df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

    # prepare string variables: object columns plus pandas' dedicated string dtype
    string_columns = [col for col, dtype in df.dtypes.items()
                      if dtype == object or isinstance(dtype, pd.StringDtype)]
    for col in string_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_', regex=False)

    # make dependent variable numeric (skipped for unlabeled data)
    if 'churn' in df.columns:
        df['churn'] = (df['churn'] == 'yes').astype(int)

    # drop duplicates, then renumber so positional and label-based access agree
    df = df.drop_duplicates().reset_index(drop=True)

    return df

def FeatureEngineering(dataset:pd.DataFrame, categorical_variables:list, numerical_variables:list, target_variable:str):
    """One-hot encode and standardise the selected features, then re-attach the target.

    The categorical columns are one-hot encoded, the numerical columns are
    passed through, and every resulting column (dummies included) is then
    standardised with ``StandardScaler``. The two fitted transformers are
    pickled into the current working directory as ``OneHotEncoder.pkl`` and
    ``scaler.pkl``.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame containing the feature columns and the target column.
    categorical_variables : list
        Names of the columns to one-hot encode.
    numerical_variables : list
        Names of the columns passed through to the scaler unchanged.
    target_variable : str
        Name of the target column; it is lower-cased and spaces are replaced
        by underscores before the lookup in ``dataset``.

    Returns
    -------
    pd.DataFrame
        Scaled feature columns followed by the unscaled target column, on a
        fresh 0..n-1 index, rows in the same order as ``dataset``.

    Notes
    -----
    Both transformers are re-fitted on every call. To transform held-out data
    with the training statistics, load the pickled transformers and call their
    ``transform`` methods instead of calling this function again.
    """
    categorical_important = list(categorical_variables)
    numerical_important = list(numerical_variables)
    dependent_variable = target_variable.lower().replace(' ', '_')

    features = dataset[numerical_important + categorical_important]

    # Dummy variables
    OHE = make_column_transformer((OneHotEncoder(), categorical_important),
                                  remainder='passthrough',
                                  # always return a dense array so it can be wrapped in a DataFrame
                                  sparse_threshold=0,
                                  verbose_feature_names_out=False)
    encoded = OHE.fit_transform(features)
    # persist the fitted transformer itself (not its output) so that new data
    # can later be encoded with exactly the same categories
    with open('OneHotEncoder.pkl', 'wb') as fh:
        pickle.dump(OHE, fh)
    df = pd.DataFrame(encoded, columns=OHE.get_feature_names_out())

    # Feature scaling
    scaler = StandardScaler()
    scaler.fit(df)
    with open('scaler.pkl', 'wb') as fh:
        pickle.dump(scaler, fh)
    df = pd.DataFrame(scaler.transform(df), columns=scaler.feature_names_in_)

    # add target variable positionally: df has a fresh 0..n-1 index, so a
    # label-aligned assignment would misplace targets whenever dataset's index
    # has gaps (e.g. after dropping duplicate rows)
    df[dependent_variable] = dataset[dependent_variable].to_numpy()
    return df

In [41]:
# Raw customer table; cleaning is done later by data_preprocessing.
df = pd.read_csv('Churn_prediction.csv')

# Feature names use the lower-case spelling that data_preprocessing produces.
categorical_important = [
    'contract',
    'onlinesecurity',
    'techsupport',
    'internetservice',
]
numerical_important = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]
target_variable = 'churn'

                        

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
# Each part gets a fresh 0..n-1 index instead of the shuffled original labels.
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=666)
)


In [43]:
# Clean the training split, then encode and scale its features.
# NOTE: df_train is overwritten with the processed frame, so re-run the split
# cell before re-running this one.
df_train = data_preprocessing(df_train)
df_train = FeatureEngineering(df_train,
                              categorical_variables=categorical_important,
                              numerical_variables=numerical_important,
                              # use the configured name instead of repeating the literal
                              target_variable=target_variable)
df_train

Unnamed: 0,contract_month-to-month,contract_one_year,contract_two_year,onlinesecurity_no,onlinesecurity_no_internet_service,onlinesecurity_yes,techsupport_no,techsupport_no_internet_service,techsupport_yes,internetservice_dsl,internetservice_fiber_optic,internetservice_no,tenure,monthlycharges,totalcharges,churn
0,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,1.380985,-0.888655,-0.52298,-1.198368,-0.990255,-0.959225,1
1,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,-0.724121,1.125296,-0.52298,-0.054658,0.541967,0.158079,0
2,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,-0.981011,-0.52298,1.546587,1.380985,-0.888655,-0.52298,-0.381433,-0.358359,-0.428069,0
3,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,-0.724121,1.125296,-0.52298,-1.034981,0.316885,-0.774324,1
4,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,-0.724121,1.125296,-0.52298,-0.340586,1.158857,0.054054,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,-0.981011,-0.52298,1.546587,1.380985,-0.888655,-0.52298,-1.280062,-0.528421,-0.985661,0
5630,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,-0.724121,1.125296,-0.52298,-0.422279,0.737037,-0.190538,0
5631,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,-0.724121,1.125296,-0.52298,1.334133,0.898762,1.614643,1
5632,0.908986,-0.518023,-0.563275,1.004269,-0.52298,-0.63501,1.019357,-0.52298,-0.646585,1.380985,-0.888655,-0.52298,-0.299739,-0.151618,-0.387377,0


In [47]:
# Move the label out of the feature frame: pop returns the column and removes
# it from df_train in place, leaving only model inputs behind.
y_train = df_train.pop('churn')

In [49]:
# Show the first five rows transposed, one feature per line.
df_train.head().transpose()

Unnamed: 0,0,1,2,3,4
contract_month-to-month,0.908986,0.908986,0.908986,0.908986,0.908986
contract_one_year,-0.518023,-0.518023,-0.518023,-0.518023,-0.518023
contract_two_year,-0.563275,-0.563275,-0.563275,-0.563275,-0.563275
onlinesecurity_no,1.004269,1.004269,1.004269,1.004269,1.004269
onlinesecurity_no_internet_service,-0.52298,-0.52298,-0.52298,-0.52298,-0.52298
onlinesecurity_yes,-0.63501,-0.63501,-0.63501,-0.63501,-0.63501
techsupport_no,1.019357,1.019357,-0.981011,1.019357,1.019357
techsupport_no_internet_service,-0.52298,-0.52298,-0.52298,-0.52298,-0.52298
techsupport_yes,-0.646585,-0.646585,1.546587,-0.646585,-0.646585
internetservice_dsl,1.380985,-0.724121,1.380985,-0.724121,-0.724121


<h2>Logistic Regression</h2>

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fit on the processed training features; fit() returns the estimator itself.
clf_LogisticRegression = LogisticRegression(random_state=0).fit(df_train, y_train)

# Optional 6-fold cross-validated accuracy, currently disabled:
# accuracies = cross_val_score(estimator=clf_LogisticRegression, X=df_train, y=y_train, cv=6, scoring='accuracy')
# print("Accuracy Mean: {0}, Accuracy standard deviation: {1}".format(accuracies.mean(),accuracies.std()))

# In-sample predictions: these are made on the training rows themselves.
y_pred = clf_LogisticRegression.predict(df_train)



<h2>TEST</h2>

In [None]:
from sklearn.metrics import confusion_matrix

# Build the held-out evaluation set. df_test is still the raw split, so it is
# cleaned with the same routine that was used for the training rows.
test_clean = data_preprocessing(df_test)
y_test = test_clean['churn'].to_numpy()

# Reuse the scaler fitted on the training data (written by FeatureEngineering).
# The pickle is produced by this notebook; do not load one from an untrusted source.
with open('scaler.pkl', 'rb') as fh:
    fitted_scaler = pickle.load(fh)

# The fitted encoder is local to FeatureEngineering, so the dummy columns are
# rebuilt here and aligned to the exact columns the scaler saw during training:
# categories absent from the test rows become all-zero columns, and the column
# order matches what the classifier was fitted on.
test_features = pd.concat(
    [pd.get_dummies(test_clean[categorical_important]), test_clean[numerical_important]],
    axis=1,
).reindex(columns=fitted_scaler.feature_names_in_, fill_value=0).astype(float)
X_test = pd.DataFrame(fitted_scaler.transform(test_features),
                      columns=fitted_scaler.feature_names_in_)

# Kept separate from y_pred, which holds the training-set predictions.
y_pred_test = clf_LogisticRegression.predict(X_test)

cm = confusion_matrix(y_test, y_pred_test)
print(cm)
tn, fp, fn, tp = cm.ravel()
print([tn, fp, fn, tp])

[[924  90]
 [217 178]]
[924, 90, 217, 178]
