In [None]:
import pandas as pd

# Read the Telco churn CSV from the working directory.
# The file name must match exactly, including upper/lower case.
df = pd.read_csv(filepath_or_buffer="WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

In [None]:
# Structural overview first (dtypes and non-null counts); the cell output is
# the number of missing values per column.
df.info()
df.isna().sum()

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce') instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

In [None]:
# Number of TotalCharges entries that are missing after the numeric conversion.
df['TotalCharges'].isna().sum()

In [None]:
# Drop rows where TotalCharges couldn't be converted.
# .copy() gives an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
df = df.dropna(subset=['TotalCharges']).copy()

# Convert SeniorCitizen 1/0 flags to Yes/No labels.
# replace() leaves values that are already 'Yes'/'No' untouched, so the cell
# can be re-run safely.
df['SeniorCitizen'] = df['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

# Drop customerID (not useful for analysis).
# errors='ignore' keeps this step idempotent: re-running the cell after the
# column is gone no longer raises KeyError (the in-place drop did).
df = df.drop(columns='customerID', errors='ignore')

In [None]:
# Relabel SeniorCitizen 1/0 as Yes/No (a no-op when the values are already labels).
df.loc[:, 'SeniorCitizen'] = df['SeniorCitizen'].replace({1: 'Yes', 0: 'No'})

# Remove the identifier column. errors='ignore' is required here: the column
# may already have been dropped by the previous cell, and a plain drop would
# raise KeyError and stop a Restart & Run All.
df = df.drop(columns='customerID', errors='ignore')

In [None]:
# Re-inspect column dtypes and non-null counts after the cleaning cells above.
df.info()

In [None]:
# Class balance of the target. Only the last expression of a cell is rendered,
# so the absolute counts and the percentages are combined into one table
# instead of silently discarding the first result.
churn_counts = df['Churn'].value_counts()
churn_percent = df['Churn'].value_counts(normalize=True) * 100
pd.DataFrame({'count': churn_counts, 'percent': churn_percent})

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of how many customers fall into each Churn class.
churn_ax = sns.countplot(data=df, x='Churn')
churn_ax.set_title('Customer Churn Distribution')
plt.show()

In [None]:
# One count plot per categorical feature, split by churn outcome.
churn_breakdowns = [
    ('gender', 'Churn by Gender'),
    ('Contract', 'Churn by Contract Type'),
    ('InternetService', 'Churn by Internet Service'),
]

for feature, plot_title in churn_breakdowns:
    sns.countplot(x=feature, hue='Churn', data=df)
    plt.title(plot_title)
    plt.show()

In [None]:
# Box plots comparing the distribution of each numeric feature across churn classes.
numeric_comparisons = [
    ('tenure', 'Tenure vs Churn'),
    ('MonthlyCharges', 'Monthly Charges vs Churn'),
]

for measure, plot_title in numeric_comparisons:
    sns.boxplot(x='Churn', y=measure, data=df)
    plt.title(plot_title)
    plt.show()

In [None]:
# Churn split by contract type. The title lets the figure stand on its own,
# and plt.show() renders it without echoing the Axes repr as cell output.
sns.countplot(x='Contract', hue='Churn', data=df)
plt.title('Churn by Contract Type')
plt.show()

## 📊 Exploratory Data Analysis Summary: Customer Churn

We performed exploratory data analysis to understand the patterns behind customer churn at a telecom company. Below are the key findings from the visualizations above:

- **Contract Type**: Customers on **month-to-month** contracts have the highest churn rate, while those on one- or two-year contracts are far less likely to leave. This is one of the most significant drivers of churn.
- **Tenure**: Customers with **shorter tenure** are significantly more likely to churn. The longer a customer stays, the less likely they are to leave.
- **Monthly Charges**: Customers who churn tend to have **higher monthly charges**, suggesting that price sensitivity could be contributing to churn.
- **Internet Service Type**: Customers using **fiber optic** internet service churn more than DSL customers or customers without internet service. This may relate to pricing, satisfaction, or service reliability.
- **Gender**: No notable difference in churn rates between male and female customers. This feature may not provide predictive value.

In [None]:
# Label-to-integer mapping shared by every Yes/No column.
YES_NO_TO_INT = {'Yes': 1, 'No': 0}

# 1. Encode binary categorical columns ('Yes'/'No') as 1/0.
#    Only non-numeric columns are mapped, so re-running the cell never maps
#    already-encoded 1/0 values (which would turn them into NaN).
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(YES_NO_TO_INT)

# 2. Convert SeniorCitizen to int.
#    The column was relabelled to 'Yes'/'No' in an earlier cell. Passing those
#    strings straight to pd.to_numeric(errors='coerce') yields NaN for every
#    row, and fillna(0) would then mark every customer as a non-senior.
#    Translate the labels first; values that are already numeric pass through.
senior = df['SeniorCitizen']
if not pd.api.types.is_numeric_dtype(senior):
    senior = senior.map(lambda value: YES_NO_TO_INT.get(value, value))
df['SeniorCitizen'] = pd.to_numeric(senior, errors='coerce').fillna(0).astype(int)

# 3. One-hot encode multi-category columns (only those still present, so the
#    cell can be re-run after the columns were already expanded).
multi_cat_cols = ['InternetService', 'Contract', 'PaymentMethod', 'MultipleLines',
                  'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                  'StreamingTV', 'StreamingMovies', 'gender']

existing_multi_cat_cols = [col for col in multi_cat_cols if col in df.columns]
df = pd.get_dummies(df, columns=existing_multi_cat_cols, drop_first=True)

# 4. Split into features (everything except the target) and target.
X = df.drop('Churn', axis=1)
y = df['Churn']

# 5. Output checks
print("\n✅ Preprocessing complete!")
print("Shape of features (X):", X.shape)
print("Shape of target (y):", y.shape)
print("\nFirst 5 rows of processed features:")
print(X.head())



In [None]:
# Print missing-value counts per column, then the dtype of every column.
for summary in (df.isna().sum(), df.dtypes):
    print(summary)

In [None]:
# Missing values per column.
print(df.isna().sum())

In [None]:
# Encode Yes/No columns as 1/0.
# The mapping is applied only to columns that are not numeric yet: mapping a
# column that already holds 1/0 through {'Yes': 1, 'No': 0} finds no matching
# keys and would replace every value with NaN.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling', 'Churn']
for col in binary_cols:
    if col not in df.columns:
        continue
    needs_encoding = not pd.api.types.is_numeric_dtype(df[col])
    if needs_encoding and df[col].notna().sum() > 0:
        df[col] = df[col].map({'Yes': 1, 'No': 0})

In [None]:
# Missing values per column after the binary encoding step.
print(df.isna().sum())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

for caption, features in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(caption, features.shape)

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation for missing values. The imputer learns the column means from
# the training split only and applies those same means to the test split.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Baseline classifier: logistic regression with a raised iteration cap.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Columns routed to each preprocessing branch; any column not listed here is
# dropped by the ColumnTransformer (its default `remainder`).
# NOTE(review): these are raw column names — confirm that the frame passed to
# fit() still contains them (i.e. it was not one-hot encoded beforehand).
categorical_features = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'InternetService',
    'Contract',
    'PaymentMethod',
]
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Numeric branch: fill gaps with the column mean.
numeric_steps = [('imputer', SimpleImputer(strategy='mean'))]
numeric_transformer = Pipeline(numeric_steps)

# Apply each branch to its own column list.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing followed by the classifier, as one estimator.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])


In [None]:
# Fit the preprocessing + logistic-regression pipeline on the training split.
# NOTE(review): if the cells above ran in order, X_train has already been
# replaced by the imputer's output and X was one-hot encoded, while this
# pipeline selects columns by raw names (e.g. 'gender', 'Contract'). This fit
# is therefore likely to fail — confirm, and rebuild X_train from the raw
# frame if so.
model.fit(X_train, y_train)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# Dataset and target used by this notebook. The previous placeholders
# ("your_data.csv" / "target_column") made this cell fail on a fresh run.
DATA_PATH = "WA_Fn-UseC_-Telco-Customer-Churn.csv"
TARGET_COLUMN = "Churn"

# Load the data
df = pd.read_csv(DATA_PATH)

# --------------------------
# 🛠️ Fix 1: Convert boolean columns to string so they are handled as categoricals
bool_cols = df.select_dtypes(include='bool').columns
if len(bool_cols) > 0:
    df[bool_cols] = df[bool_cols].astype(str)

# 🛠️ Fix 2: Drop columns that are entirely NaN
df = df.dropna(axis=1, how='all')

# 🛠️ Fix 3: Keep identifiers and mis-typed numbers out of the one-hot encoder.
# customerID is unique per row and TotalCharges is read as text; left as they
# are, both would be one-hot encoded into thousands of meaningless columns.
df = df.drop(columns='customerID', errors='ignore')
if 'TotalCharges' in df.columns:
    # Unparseable entries become NaN and are filled by the numeric imputer below.
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# --------------------------
# Split the data into features and target
X = df.drop(TARGET_COLUMN, axis=1)
y = df[TARGET_COLUMN]

# X stays a DataFrame so the ColumnTransformer can select columns by name
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------
# Define column types. 'number' covers every numeric width (not only
# int64/float64); everything else is treated as categorical.
numeric_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(exclude='number').columns.tolist()

# --------------------------
# Preprocessing pipelines
# Numeric: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical: fill with the most frequent value, then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# --------------------------
# Final pipeline with classifier
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# --------------------------
# Train the model
model.fit(X_train, y_train)

# Done! 🎉 You can now predict, score, etc.


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

# 1. Load the data
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# 2. Clean columns that would otherwise be mis-handled by the pipeline.
#    - customerID is unique per row; one-hot encoding it adds one column per
#      customer and lets the model memorize training rows.
#    - TotalCharges is read as text; without conversion it is one-hot encoded
#      instead of being treated as a number. Unparseable entries become NaN
#      and are filled by the numeric imputer below.
df = df.drop(columns="customerID", errors="ignore")
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

# 3. Clean boolean columns (convert to int)
bool_cols = df.select_dtypes(include='bool').columns
if len(bool_cols) > 0:
    df[bool_cols] = df[bool_cols].astype(int)

# 4. Drop columns with all missing values
df = df.dropna(axis=1, how='all')

# 5. Separate features and target (the target keeps its 'Yes'/'No' labels)
X = df.drop("Churn", axis=1)
y = df["Churn"]

# 6. Split data. stratify=y keeps the churn / no-churn ratio the same in the
#    training and test splits, which matters for an imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Select column types. 'number' matches every numeric width (astype(int)
#    is not int64 on every platform); all other columns are categorical.
num_cols = X.select_dtypes(include='number').columns
cat_cols = X.select_dtypes(exclude='number').columns

# 8. Preprocessing pipelines
#    Numeric: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

#    Categorical: fill with the most frequent value, then one-hot encode
#    (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# 9. Combine preprocessors
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

# 10. Full pipeline with Logistic Regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# 11. Fit the model
model.fit(X_train, y_train)

# 12. Evaluate (mean accuracy on the held-out split)
accuracy = model.score(X_test, y_test)
print("Test Accuracy:", accuracy)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Predicted class for every held-out row, and the resulting confusion matrix.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Annotated heatmap of the matrix; rows are actual classes, columns predicted.
class_names = ['No Churn', 'Churn']
plt.figure(figsize=(6, 4))
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Per-class precision / recall / F1 for the held-out predictions.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text, sep="\n")

In [None]:
# Predicted probability of the positive class: column 1 of predict_proba,
# which corresponds to model.classes_[1].
y_pred_proba = model.predict_proba(X_test)[:, 1]

# roc_curve cannot infer the positive class from string labels such as
# 'No'/'Yes' and raises unless pos_label is given. Build a 0/1 truth vector
# against the model's positive class; a target that is already numeric is
# used as it is.
if pd.api.types.is_numeric_dtype(y_test):
    y_true_binary = y_test
else:
    y_true_binary = (y_test == model.classes_[1]).astype(int)

# Compute ROC values and the area under the curve
fpr, tpr, _ = roc_curve(y_true_binary, y_pred_proba)
roc_auc = roc_auc_score(y_true_binary, y_pred_proba)

# Plot ROC (the dashed diagonal is the chance-level reference)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1], [0,1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Convert target values to 0 (No) and 1 (Yes).
# The conversion is guarded so the cell is idempotent: mapping a target that
# is already 0/1 through this dict would replace every value with NaN.
# NOTE(review): a model fitted on the string labels still predicts strings;
# confirm before comparing its class predictions with the converted targets.
label_to_int = {'No': 0, 'Yes': 1}
if not pd.api.types.is_numeric_dtype(y_train):
    y_train = y_train.map(label_to_int)
if not pd.api.types.is_numeric_dtype(y_test):
    y_test = y_test.map(label_to_int)

# Compute ROC values and the area under the curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Plot ROC (the dashed diagonal is the chance-level reference)
plt.figure(figsize=(6,4))
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.plot([0,1], [0,1], linestyle='--', color='gray')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()