In [53]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import os
from pathlib import Path

# Locate the dataset folder under the current user's home directory.
# (The variable is named `downloads_path`, but it points at a Desktop dataset folder.)
downloads_path = Path.home().joinpath('Desktop', 'Dataset', 'Capstone_1_Tmobile_Churn_data')
# Make that folder the working directory so later relative file names resolve against it.
# NOTE(review): this path is machine-specific; the notebook will fail here on any
# machine where the folder does not exist.
os.chdir(downloads_path)

In [54]:
# Load the Telco customer-churn CSV into a DataFrame.
# The file name is relative, so it is resolved against the current working directory.
csv_file_path = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
data = pd.read_csv(filepath_or_buffer=csv_file_path)

In [55]:
# Preview the first five rows as plain text to sanity-check the load.
print(data.head(n=5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [130]:
# Normalise column dtypes before modelling.
# `tenure` is cast to a plain integer dtype.
data['tenure'] = data['tenure'].astype(int)

# `TotalCharges` is parsed with `pd.to_numeric(errors='coerce')`: any entry that
# cannot be parsed as a number becomes NaN, and those NaNs are then replaced by 0.
# NOTE(review): presumably the column contains non-numeric entries (a plain
# `.astype(float)` was tried here before and abandoned) — confirm that 0 is the
# intended substitute, since this column is later used as the regression target.
total_charges_numeric = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'] = total_charges_numeric.fillna(0)

In [132]:
# Assemble the modelling inputs.
# Features: tenure, the SeniorCitizen column and the monthly charge.
# Target:   TotalCharges.
feature_columns = ['tenure', 'SeniorCitizen', 'MonthlyCharges']
target_column = 'TotalCharges'
X = data[feature_columns]
y = data[target_column]

In [134]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [136]:
# Build a random-forest regressor (100 trees, fixed seed) and fit it on the
# training portion of the data.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [137]:
# Score the held-out rows with the fitted model; `y_pred` lines up
# row-for-row with `X_test`.
y_pred = model.predict(X=X_test)

In [139]:
# Evaluation: root-mean-squared error on the held-out set.
# The `squared=False` keyword of `mean_squared_error` is deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this form works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f'RMSE: {rmse}')

RMSE: 81.39747887377966




In [142]:
# Example prediction for a single hypothetical customer.
# The frame must use the same column names, in the same order, as the
# training features (tenure, SeniorCitizen, MonthlyCharges).
#
# SeniorCitizen is presumably a 0/1 indicator (the preview rows only show 0 —
# confirm against the full column), so the example uses 1 instead of the
# out-of-domain value 5.
# NOTE(review): MonthlyCharges=200 may lie outside the range seen in training;
# a tree ensemble cannot extrapolate beyond its training targets — confirm the
# example is realistic.
new_customer_data = pd.DataFrame({'tenure': [30], 'SeniorCitizen': [1], 'MonthlyCharges': [200]})
predicted_clv = model.predict(new_customer_data)
# The model's target is the TotalCharges column, so the value reported as
# "CLV" here is a predicted TotalCharges figure.
print(f'Predicted CLV for new customer: {predicted_clv[0]}')

Predicted CLV for new customer: 3205.7749999999987
