In [None]:
pip install pandas numpy seaborn scikit-learn imbalanced-learn matplotlib


In [None]:
import pandas as pd

# Read the raw churn data from a CSV file in the working directory
DATA_PATH = "telecom_customer_churn.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first five rows
print(df.head(5))


In [None]:
# Name of the label column (expected to hold 'Yes' / 'No' values)
target = "Churn"

# Partition column names by dtype: object columns are treated as categorical,
# 64-bit integer/float columns as numerical
categorical_cols = list(df.select_dtypes(include=["object"]).columns)
numerical_cols = list(df.select_dtypes(include=["int64", "float64"]).columns)

print("Categorical Columns:", categorical_cols)
print("Numerical Columns:", numerical_cols)


In [None]:
# Count the missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Impute missing values column by column.
# The filled Series is assigned back to df[col] instead of calling
# fillna(..., inplace=True) on df[col]: the in-place form operates on a
# temporary selection, emits a chained-assignment warning in pandas 2.x, and
# leaves df unchanged once Copy-on-Write is active.

# Numerical columns: replace missing values with the column median
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical columns: replace missing values with the most frequent value
for col in categorical_cols:
    modes = df[col].mode()
    if modes.empty:
        # No non-missing values in this column, so there is no mode to fill with
        continue
    df[col] = df[col].fillna(modes.iloc[0])


In [None]:
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is re-fitted for every column it is applied to
label_encoder = LabelEncoder()

# Categorical features with exactly two distinct values (the target is handled separately)
binary_cols = [
    col
    for col in categorical_cols
    if col != target and df[col].nunique() == 2
]

# Replace each two-valued feature with its integer label codes
for name in binary_cols:
    df[name] = label_encoder.fit_transform(df[name])

# Every other categorical feature is one-hot encoded, dropping the first level
already_handled = binary_cols + [target]
multi_level_cols = [col for col in categorical_cols if col not in already_handled]
df = pd.get_dummies(df, columns=multi_level_cols, drop_first=True)

# Encode the target last; LabelEncoder numbers labels in sorted order,
# so for 'Yes'/'No' labels: Yes → 1, No → 0
df[target] = label_encoder.fit_transform(df[target])


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One histogram per numerical column
df.loc[:, numerical_cols].hist(figsize=(14, 6), bins=20)
plt.tight_layout()
plt.show()

# Bar chart of how many rows fall into each target class
sns.countplot(data=df, x=target)
plt.title("Churn Distribution")
plt.show()


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Split the frame into the label vector and the feature matrix
y = df[target]
X = df.drop(columns=[target])

# Keep the 10 features that score highest under f_classif
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X, y)
X_kbest = selector.transform(X)

print("Shape after SelectKBest:", X_kbest.shape)


In [None]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Recursive feature elimination down to 10 features, driven by a seeded decision tree
model = DecisionTreeClassifier(random_state=42)
rfe = RFE(estimator=model, n_features_to_select=10).fit(X, y)

# Column names that RFE marked as selected
selected_features = X.columns[rfe.support_]
print("Top features from RFE:", list(selected_features))


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Standardize the features first, then feed the scaled matrix to PCA
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Project onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
pc1, pc2 = X_pca[:, 0], X_pca[:, 1]

# Scatter the two components, colored by the target label
plt.figure(figsize=(8, 6))
sns.scatterplot(x=pc1, y=pc2, hue=y, palette='coolwarm')
plt.title("PCA: Customer Churn")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit LDA on the standardized features and project onto one discriminant axis
lda = LDA(n_components=1)
X_lda = lda.fit_transform(X_scaled, y)

# Visualize LDA result.
# The projection and the labels are placed in a long-form frame and the x
# variable is named explicitly: a vector passed positionally to histplot is
# taken as wide-form `data`, and seaborn raises ValueError when `hue` is
# assigned with wide-form data. Labels are attached by position.
lda_plot_df = pd.DataFrame({
    "LDA Component": X_lda.ravel(),
    target: pd.Series(y).to_numpy(),
})

plt.figure(figsize=(8, 4))
sns.histplot(
    data=lda_plot_df,
    x="LDA Component",
    hue=target,
    bins=30,
    kde=True,
    palette="coolwarm",
    element="step",
)
plt.title("LDA: Churn Separability")
plt.xlabel("LDA Component")
plt.show()


In [None]:
from imblearn.over_sampling import SMOTE

# Class counts before oversampling
counts_before = y.value_counts()
print("Before SMOTE:\n", counts_before)

# Oversample the standardized features with a seeded SMOTE
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_scaled, y)
X_balanced, y_balanced = resampled

# Class counts after oversampling
counts_after = pd.Series(y_balanced).value_counts()
print("After SMOTE:\n", counts_after)


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the resampled features with a MinMaxScaler (default settings).
# Note: this rebinds the notebook-level name `scaler`.
scaler = MinMaxScaler()
scaler.fit(X_balanced)
X_scaled_final = scaler.transform(X_balanced)


In [None]:
# Rebuild a labeled DataFrame from the scaled array, reusing the column names of X
final_df = pd.DataFrame(X_scaled_final, columns=X.columns)

# Name the label column via `target` instead of repeating the "Churn" literal,
# so it stays in sync with the rest of the notebook. Assign by position
# (to_numpy) so the labels cannot be misaligned by index.
final_df[target] = pd.Series(y_balanced).to_numpy()

# Save to CSV without the index column
output_path = "processed_churn_dataset.csv"
final_df.to_csv(output_path, index=False)
print(f"✅ Final dataset saved as '{output_path}'")
