In [3]:
# Import required libraries
import pandas as pd
from textblob import TextBlob

In [5]:
## STEP 1: LOAD THE DATA
# Customer records (one row per customer); later steps read the
# 'TotalCharges', 'tenure' and 'MonthlyCharges' columns from this frame.
customers = pd.read_csv(filepath_or_buffer="Customer_Data.csv")

# Scraped T-Mobile reviews; the sentiment step reads the 'review_text' column.
reviews = pd.read_csv(filepath_or_buffer="tmobile_reviews.csv")

In [6]:
##STEP 2: CLEAN CUSTOMER DATA
# One pipeline, three stages:
#   1. coerce 'TotalCharges' to numeric (unparseable entries such as empty strings become NaN),
#   2. discard rows whose 'TotalCharges' is NaN,
#   3. renumber the index from 0 so it has no gaps after the drop.
customers = (
    customers
    .assign(TotalCharges=pd.to_numeric(customers['TotalCharges'], errors='coerce'))
    .dropna(subset=['TotalCharges'])
    .reset_index(drop=True)
)

In [7]:
##  STEP 3: CREATE CLV TARGET COLUMN
# CLV proxy = tenure x MonthlyCharges, computed row by row.
# This is plain accumulated revenue: no churn discounting is applied.
customers['CLV'] = customers['tenure'].mul(customers['MonthlyCharges'])

In [8]:
##STEP 4: ANALYZE CUSTOMER REVIEW TEXT
# Score each review with TextBlob polarity (-1 = negative, +1 = positive).
# na_action='ignore' keeps missing review text as NaN instead of passing it through
# str(), which would turn it into the literal text "nan" and score that as if it
# were a real review, distorting the aggregates below.
reviews['sentiment'] = reviews['review_text'].map(
    lambda text: TextBlob(str(text)).sentiment.polarity,
    na_action='ignore',
)

# Aggregate sentiment features over the scored reviews.
# pandas mean/min/max skip NaN by default, so unscored (missing) reviews are excluded.
avg_sentiment = reviews['sentiment'].mean()
min_sentiment = reviews['sentiment'].min()
max_sentiment = reviews['sentiment'].max()

In [9]:
##  STEP 5: COMBINE SENTIMENT STATS WITH CUSTOMER DATA
# The reviews carry no customer key, so the three global statistics are broadcast:
# every customer row receives the same scalar in each new column.
sentiment_features = {
    'avg_review_sentiment': avg_sentiment,
    'min_review_sentiment': min_sentiment,
    'max_review_sentiment': max_sentiment,
}
for column_name, value in sentiment_features.items():
    customers[column_name] = value

In [10]:
# ===============================
# STEP 6: SAVE COMBINED DATA FILE
# ===============================

# Persist the enriched customer table (without the row index) for EDA and model training
customers.to_csv(path_or_buf="customer_clv_dataset.csv", index=False)

# Tell the reader where the file went
print("Combined dataset saved as 'customer_clv_dataset.csv'")

Combined dataset saved as 'customer_clv_dataset.csv'


In [12]:
## View the saved CSV data
# Read the combined file back from disk so the preview reflects what was actually written.
# pandas is already imported as `pd` in the notebook's first cell, so it is not re-imported here.
df = pd.read_csv("customer_clv_dataset.csv")

# Show the first 5 rows
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,CLV,avg_review_sentiment,min_review_sentiment,max_review_sentiment
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,Month-to-month,Yes,Electronic check,29.85,29.85,No,29.85,0.03297,-0.6,0.8
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,One year,No,Mailed check,56.95,1889.5,No,1936.3,0.03297,-0.6,0.8
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,107.7,0.03297,-0.6,0.8
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,One year,No,Bank transfer (automatic),42.3,1840.75,No,1903.5,0.03297,-0.6,0.8
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,141.4,0.03297,-0.6,0.8


In [14]:
## View the first 100 rows with every column visible
# Read the combined file back from disk.
# pandas is already imported as `pd` in the notebook's first cell, so it is not re-imported here.
df = pd.read_csv("customer_clv_dataset.csv")

# Widen the display limits only for this preview. option_context restores the previous
# settings on exit, so later cells cannot accidentally render an entire large frame.
with pd.option_context(
    'display.max_rows', 100,         # enough for the 100-row preview below
    'display.max_columns', None,     # show all columns
    'display.width', None,           # prevent line wrapping
    'display.max_colwidth', None,    # show full content in columns
):
    # An expression inside a `with` block is not auto-rendered, so display it explicitly.
    # For a random preview instead, use df.sample(100, random_state=<seed>) here.
    display(df.head(100))

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,CLV,avg_review_sentiment,min_review_sentiment,max_review_sentiment
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,29.85,0.03297,-0.6,0.8
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No,1936.3,0.03297,-0.6,0.8
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,107.7,0.03297,-0.6,0.8
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,1903.5,0.03297,-0.6,0.8
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,141.4,0.03297,-0.6,0.8
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,No,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,797.2,0.03297,-0.6,0.8
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,Yes,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No,1960.2,0.03297,-0.6,0.8
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No,297.5,0.03297,-0.6,0.8
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,No,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,2934.4,0.03297,-0.6,0.8
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,Yes,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No,3481.3,0.03297,-0.6,0.8


## Why Are We Only Adding Sentiment and Not the Full Review Text?
--> Reason 1: Reviews Cannot Be Linked to Customers
The tmobile_reviews.csv file does not contain any identifiers such as customerID, email, or phone number, so:

You can’t match a review to a specific customer.

You can’t join (merge) them on a unique key.

That means we cannot assign each review to a specific customer.

--> Reason 2: Review Text is Unstructured
Machine learning models like regression or XGBoost can't directly process raw text like:

"T-Mobile service dropped my call again, I’m so frustrated."

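So we convert each review to a numeric sentiment score. NLP tools such as TextBlob or VADER can do this (transformer-based models are another option); this notebook uses TextBlob's polarity, which ranges from -1 (negative) to +1 (positive). This turns emotional tone into data we can use for modeling (e.g., -0.6 = negative).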

## What We Can Do With Review Text (Later):

Option 1: Use Text as NLP Feature
Use BERT or TF-IDF to convert text to vectors.

Combine with tabular data in a neural network (complex setup).

Option 2: Cluster Reviews
Cluster reviews into topics using LDA or KMeans.

Add "Topic 1", "Topic 2" flags as features to customer data (but still no link to a specific customer).

## Why Sentiment Stats Work Well for Now
Because:

They are numeric

Easy to broadcast to every customer (as global perception)

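Still provide context on overall brand perception (e.g., overall negativity may go along with more churn).

Caveat: because the same three values are assigned to every customer, these columns are constant within this dataset, so a model cannot learn anything from them on their own. They become informative only when the values vary between rows, for example across time periods, regions, or carriers.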