In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Silence all warnings (note: this also hides deprecation notices from pandas / scikit-learn)
import warnings
warnings.filterwarnings('ignore')

In [9]:
# 1. Load the raw dataset
df = pd.read_csv('telco.csv')

# Data Cleaning

# 1.1 Drop irrelevant columns; names that are absent from the file are skipped
cols_to_drop = ['Unnamed: 0', 'customerID']
df = df.drop(columns=cols_to_drop, errors='ignore')

# 1.2 TotalCharges may contain stray spaces / non-numeric text:
#     anything that cannot be parsed as a number becomes NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# 1.3 Handle missing values: fill NaNs in numeric columns with that column's mean
#     (non-numeric columns have no entry in the mean Series and are left untouched)
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)

# 1.4 Convert the target variable to 0/1
# NOTE(review): every value other than the exact string 'Churned' (any other
# label, and NaN) becomes 0 -- confirm the label spelling used in telco.csv.
if 'Churn' in df.columns:
    df['Churn'] = df['Churn'].eq('Churned').astype('int64')


# Feature Engineering

# Encoding
# Detect text columns with pandas' dtype helpers instead of comparing against
# 'object': the plain comparison misses columns stored with pandas' dedicated
# string dtype, which would then be left unencoded.
text_cols = [
    col for col in df.columns
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
]

# Text columns with exactly two distinct (non-null) values are label-encoded in place.
# The single encoder is refitted per column, so after the loop `le` only
# remembers the classes of the last binary column.
le = LabelEncoder()
binary_cols = [col for col in text_cols if df[col].nunique() == 2]
for col in binary_cols:
    df[col] = le.fit_transform(df[col])

# One-hot encode the remaining text columns (those that were not label-encoded above).
# dtype=int pins the indicator columns to 0/1; without it the dtype -- and the
# exported CSV (0/1 vs. True/False) -- depends on the installed pandas version.
cat_cols = [col for col in text_cols if col not in binary_cols]
df_final = pd.get_dummies(df, columns=cat_cols, dtype=int)

# 2.2 Feature Scaling
# Standardise the continuous columns with StandardScaler; the scaled values
# overwrite the original columns in df_final.
# NOTE(review): the scaler is fitted on every row of df_final. If a train/test
# split is performed later, the held-out rows have already influenced these
# statistics -- confirm this is intended.
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
scaler = StandardScaler().fit(df_final[numerical_cols])
df_final[numerical_cols] = scaler.transform(df_final[numerical_cols])


# Results Export
# Write the engineered feature table to CSV; the row index is not written.
output_filename = 'Telco_Feature_Engineered.csv'
df_final.to_csv(path_or_buf=output_filename, index=False)