In [1]:
#import packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier


import os
%matplotlib notebook

In [2]:
# Point the kernel at the project folder so the relative data path used
# when loading the dataset resolves correctly.
project_dir = 'C:/Users/siree/OneDrive/Desktop/WGU/D209/D209_Files'
os.chdir(project_dir)

In [3]:
# Read back and display the current working directory to confirm which
# folder relative file paths will resolve against.
cwd = os.getcwd()
print(cwd)

C:\Users\siree\OneDrive\Desktop\WGU\D209\D209_Files


In [4]:
# Load the churn data set from the working directory.
# NOTE(review): with default settings, recent pandas versions parse literal
# strings such as 'None', 'NA' and 'null' as missing values. If any column
# uses one of these as a genuine category it will arrive as NaN here and be
# overwritten by the imputation steps that follow -- confirm against the raw file.
data_file = 'churn_clean.csv'
df = pd.read_csv(data_file)

In [5]:
#### C3: prepare the data ######

In [6]:
# Columns excluded from the analysis before any modelling is done.
irrelevant_columns = [
    'CaseOrder', 'Customer_id', 'Interaction', 'UID',
    'City', 'State', 'County', 'Lat', 'Lng', 'Zip', 'TimeZone',
    'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
]

# Keep only the names still present in the frame, so running this cell a
# second time does not raise on already-removed columns.
present_columns = set(df.columns)
columns_to_drop = [name for name in irrelevant_columns if name in present_columns]
df.drop(columns=columns_to_drop, inplace=True)

In [7]:
# Impute missing values in numeric features with each column's median.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df[column]: that chained in-place form operates on
# an intermediate object, raises a FutureWarning in pandas 2.x, and leaves df
# unchanged once Copy-on-Write is enabled.
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    df[column] = df[column].fillna(df[column].median())

In [8]:
# Impute missing values in categorical (object) features with the column's
# most frequent value. The result is assigned back to the frame rather than
# using fillna(inplace=True) on df[column], which is chained assignment and
# does not update df under pandas Copy-on-Write.
for column in df.select_dtypes(include=['object']).columns:
    modes = df[column].mode()
    if modes.empty:
        # Every value in the column is missing, so there is no mode to fill with.
        continue
    df[column] = df[column].fillna(modes.iloc[0])

In [9]:
# Features to one-hot encode. 'Churn' is intentionally not listed here, so it
# stays in the frame as a single column after encoding.
categorical_vars = [
    'Area', 'Job', 'Children', 'Marital', 'Gender',
    'Techie', 'Contract', 'Port_modem', 'Tablet',
    'InternetService', 'Phone', 'Multiple',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'PaperlessBilling', 'PaymentMethod',
]

# Replace each listed column with one indicator column per observed category.
df = pd.get_dummies(data=df, columns=categorical_vars)

In [10]:
# Names of the binary (two-level) variables in the data set.
# NOTE(review): this list is only defined here; no later cell in this
# notebook reads it -- confirm whether a transformation step is missing.
binary_vars = [
    'Churn',
    'Techie', 'Port_modem', 'Tablet',
    'Phone', 'Multiple',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies',
    'PaperlessBilling',
]

In [11]:
# Define the target variable and feature set.
# 'Churn' is not among the one-hot encoded columns, so 'Churn_Yes' may not
# exist. Dropping only 'Churn_Yes' with errors='ignore' would then silently
# leave the target inside X. Every representation of the target (the raw
# 'Churn' column and any 'Churn_*' indicator columns) is removed instead.
target_columns = [
    col for col in df.columns
    if col == 'Churn' or str(col).startswith('Churn_')
]
X = df.drop(columns=target_columns)
y = df['Churn_Yes'] if 'Churn_Yes' in df.columns else df['Churn']

In [12]:
# Build the target vector: use the 'Churn_Yes' indicator column when it
# exists, otherwise encode the raw 'Churn' labels as Yes -> 1, No -> 0.
# When neither column is present, y keeps whatever value it already had.
# Note that y is assigned again from df['Churn'] before the train/test split.
churn_to_int = {'Yes': 1, 'No': 0}
available_columns = set(df.columns)
if 'Churn_Yes' in available_columns:
    y = df['Churn_Yes']
elif 'Churn' in available_columns:
    y = df['Churn'].map(churn_to_int)

In [13]:
##### C4: CSV OF DATA SET #######

In [14]:
# Save the prepared data set. index=False keeps the row index out of the
# file, matching the train/test exports, so the CSV does not gain an extra
# unnamed column when it is read back.
df.to_csv(
    'C:/Users/siree/OneDrive/Desktop/WGU/D209/TASK_2/SIREEN_SHABAN_D209_TASK2_clean_data.csv',
    index=False,
)

In [15]:
###### D1: Split the data ########

In [16]:
# Separate the predictors from the response: X is every column except
# 'Churn', and y is the 'Churn' column itself with its original labels.
target_name = 'Churn'
X = df.drop(columns=[target_name])
y = df[target_name]

In [17]:
# Collect the names of the int64/float64 feature columns.
# This cell only identifies the columns; it does not scale them.
# NOTE(review): numeric_vars is not read by any later cell in this notebook.
numeric_features = X.select_dtypes(include=['float64', 'int64'])
numeric_vars = numeric_features.columns

In [18]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [19]:
# Re-attach the target to each feature partition (side by side) so every
# split can be written out as a single table.
train_data, test_data = (
    pd.concat(parts, axis=1)
    for parts in ([X_train, y_train], [X_test, y_test])
)

In [20]:
# Write the training and test partitions to CSV, without the row index.
split_exports = [
    (train_data, 'C:/Users/siree/OneDrive/Desktop/WGU/D209/TASK_2/SIREEN_SHABAN_D209_TASK2_train_data.csv'),
    (test_data, 'C:/Users/siree/OneDrive/Desktop/WGU/D209/TASK_2/SIREEN_SHABAN_D209_TASK2_test_data.csv'),
]
for frame, csv_path in split_exports:
    frame.to_csv(csv_path, index=False)

In [21]:
##### D2: OUTPUT AND CALCULATIONS #######

In [22]:
# Create a random forest of 100 trees; the fixed seed makes the fitted
# model reproducible.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [23]:
# Train the forest on the training partition only.
rf_classifier.fit(X=X_train, y=y_train)

In [24]:
# Predict a churn label for every row of the held-out test partition.
y_pred = rf_classifier.predict(X=X_test)

In [25]:
# Per-class precision, recall and F1 on the test set.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print(f'Classification Report:\n{class_report}')

Classification Report:
              precision    recall  f1-score   support

          No       0.89      0.96      0.92      1456
         Yes       0.85      0.67      0.75       544

    accuracy                           0.88      2000
   macro avg       0.87      0.81      0.83      2000
weighted avg       0.88      0.88      0.87      2000



In [26]:
# Cross-tabulate actual versus predicted labels on the test set
# (rows are the true classes, columns the predicted classes).
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1392   64]
 [ 179  365]]


In [27]:
##### E1: ACCURACY AND MSE ######

In [28]:
# Convert the 'No'/'Yes' labels to 0/1 so a numeric error metric can be computed.
label_mapping = {'No': 0, 'Yes': 1}
y_test_numeric = y_test.map(label_mapping)
# Build the prediction Series on y_test's index. A bare pd.Series(y_pred)
# would get a fresh 0..n-1 index that does not match the shuffled test rows,
# so any index-aligned pandas operation between the two Series would pair up
# the wrong observations.
y_pred_numeric = pd.Series(y_pred, index=y_test.index).map(label_mapping)

In [29]:
# Share of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8785


In [30]:
# Mean squared error on the 0/1-encoded labels. With binary labels every
# squared error is either 0 or 1, so this equals the misclassification rate
# (1 - accuracy).
mse = mean_squared_error(y_true=y_test_numeric, y_pred=y_pred_numeric)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.1215
