In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
#this will install statsmodel
%pip install statsmodels
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

#load the dataset (remember to apply path from ur directory)
file_path = r"C:\Users\leewa\OneDrive\Desktop\EDA project\Dataset.csv"
data = pd.read_csv(file_path)

#this will display basic information about the dataset 
print(data.info())
print(data.head())

#preprocessing of the dataset 

date_columns = ['Learner SignUp DateTime', 'Opportunity Start Date', 'Opportunity End Date']
for col in date_columns:
    data[col] = pd.to_datetime(data[col], format='%d/%m/%Y', errors='coerce')

#for encoding of categorical variables 
label_encoders = {}
categorical_columns = ['Gender', 'Country', 'Institution Name', 'Current/Intended Major', 
                       'Status Description', 'Month of SignUp', 'Day of Week of SignUp', 'Season']
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le

#this will define churn 
churn_statuses = ['Withdraw', 'Rejected', 'Dropped Out']
data['Churn'] = data['Status Description'].apply(lambda x: 1 if x in churn_statuses else 0)

#for feature engineering
features = ['Age', 'Engagement Duration', 'Gender', 'Country', 'Institution Name', 
            'Current/Intended Major', 'Month of SignUp', 'Day of Week of SignUp', 'Season']
X = data[features]
y = data['Churn']

#this will split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#for logistic regressiojn model

X_logit = data[features]
y_logit = data['Churn']


X_logit = add_constant(X_logit)


logit_model = Logit(y_logit, X_logit)
logit_result = logit_model.fit()


output_file_path = r"C:\Users\leewa\OneDrive\Desktop\EDA project\logistic_regression_results.txt"

with open(output_file_path, "w") as f:
    
    f.write("Logistic Regression Summary:\n")
    f.write(logit_result.summary().as_text())
    f.write("\n\n")

    
    y_pred = logit_result.predict(add_constant(X_test))
    y_pred_binary = (y_pred > 0.5).astype(int)

    
    f.write("Classification Report:\n")
    f.write(classification_report(y_test, y_pred_binary))
    f.write("\n\n")

    
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(confusion_matrix(y_test, y_pred_binary), separator=', '))
    f.write("\n")

print(f"Results have been saved to {output_file_path}")

Note: you may need to restart the kernel to use updated packages.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8547 entries, 0 to 8546
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Learner SignUp DateTime  8547 non-null   object
 1   Opportunity Name         8547 non-null   object
 2   Opportunity Category     8547 non-null   object
 3   Opportunity End Date     8547 non-null   object
 4   Gender                   8547 non-null   object
 5   Country                  8547 non-null   object
 6   Institution Name         8547 non-null   object
 7   Current/Intended Major   8547 non-null   object
 8   Status Description       8547 non-null   object
 9   Opportunity Start Date   8547 non-null   object
 10  Age                      8547 non-null   int64 
 11  Engagement Duration      8547 non-null   int64 
 12  Month of SignUp          8547 non-null   object
 13  Day of Week of SignUp    85

  return 1 - self.llf/self.llnull
