# In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import pandas as pd



# Show every column when a DataFrame is displayed instead of truncating.
pd.set_option("display.max_columns", None)

# Keep the data exactly as loaded from disk in base_df; all cleaning is done
# on a separate frame.
base_df = pd.read_csv("/home/samir/Desktop/rudraAnalytics/sub_projects/churn/data.csv")

# Working copy with two columns re-typed:
#   * TotalCharges -> numeric; entries that cannot be parsed become NaN.
#   * SeniorCitizen -> string, so it is no longer a numeric column.
df = base_df.assign(
    TotalCharges=lambda frame: pd.to_numeric(frame['TotalCharges'], errors='coerce'),
    SeniorCitizen=lambda frame: frame['SeniorCitizen'].astype(str),
)


def one_hot_encode(df, encode_set=None, dont_encode=None):
    """Return a copy of ``df`` with its text columns encoded as 0/1 integers.

    Only columns that are listed in ``encode_set``, are not listed in
    ``dont_encode`` and hold object/string data are touched:

    * A column with exactly two distinct values (a missing value counts as a
      distinct value) is replaced in place by a single 0/1 indicator.  The
      first category in sorted order maps to 0, the other one to 1.
    * Any other column is removed and replaced by one 0/1 indicator column per
      category, appended at the end of the frame and named ``<column>1``,
      ``<column>2``, ... following the sorted category order.

    Args:
        df: Source DataFrame.  It is not modified.
        encode_set: Iterable of column labels that are candidates for
            encoding.  ``None`` (the default) means no columns.
        dont_encode: Iterable of column labels to skip even when they appear
            in ``encode_set``.  ``None`` (the default) means skip nothing.

    Returns:
        A new DataFrame holding the encoded data.

    Raises:
        KeyError: If a label in ``encode_set`` (and not in ``dont_encode``)
            is not a column of ``df``.
        ValueError: If a two-valued column consists of one category plus
            missing values, so no binary indicator can be derived from it.
    """
    # None instead of mutable list defaults, which would be shared by calls.
    candidates = () if encode_set is None else encode_set
    skipped = set() if dont_encode is None else set(dont_encode)

    encoded_df = df.copy()
    for column in candidates:
        if column in skipped:
            continue

        values = df[column]
        # Comparing the dtype with 'object' alone misses dedicated string
        # dtypes, so text columns are detected through the dtype helpers.
        is_text = (
            pd.api.types.is_object_dtype(values)
            or pd.api.types.is_string_dtype(values)
        )
        if not is_text:
            continue

        if values.nunique(dropna=False) == 2:
            # Binary encoding.  dtype=int pins the result to 0/1; the default
            # dummy dtype is boolean on recent pandas versions.
            indicator = pd.get_dummies(values, drop_first=True, dtype=int)
            if indicator.shape[1] != 1:
                raise ValueError(
                    f"Column {column!r} has a single category plus missing "
                    "values and cannot be binary encoded"
                )
            encoded_df[column] = indicator.iloc[:, 0]
        else:
            # One-hot encoding with positional column names.
            one_hot = pd.get_dummies(values, dtype=int)
            one_hot.columns = [
                f"{column}{position}"
                for position in range(1, one_hot.shape[1] + 1)
            ]
            encoded_df = pd.concat(
                [encoded_df.drop(columns=column), one_hot], axis=1
            )

    return encoded_df

# Candidate columns handed to one_hot_encode.  'customerID' is listed here but
# is dropped from the frame before encoding (and is also in dont_label).
features = [
    'customerID',
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'tenure',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'MonthlyCharges',
    'TotalCharges',
    'Churn'
]
# Columns one_hot_encode must leave untouched (presumably the identifier and
# the numeric columns -- confirm against the dataset).
dont_label = ['customerID', 'tenure', 'MonthlyCharges', 'TotalCharges']

encoded_df = one_hot_encode(df.drop('customerID', axis=1), features, dont_label)

# Replace missing TotalCharges values with the column mean.  The result is
# assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is a chained in-place update, which pandas deprecates and which
# does not modify encoded_df under Copy-on-Write.
# NOTE(review): the mean is computed over all rows before the train/test
# split, so test rows contribute to the imputed value.
encoded_df['TotalCharges'] = encoded_df['TotalCharges'].fillna(
    encoded_df['TotalCharges'].mean()
)

# Feature matrix is everything except the target column.
X = encoded_df.drop('Churn', axis=1)
y = encoded_df['Churn']

# Split the data into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build the Logistic Regression model
logistic_model = LogisticRegression(random_state=42)

# Train the model
logistic_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_logistic = logistic_model.predict(X_test)

# Evaluate the model
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
print(f"Logistic Regression Test Accuracy: {accuracy_logistic * 100:.2f}%")


# Output: Logistic Regression Test Accuracy: 82.33%


#Logistic Regression Test Accuracy: 82.33%