<a href="https://colab.research.google.com/github/prince545/exploratory-data-analysis/blob/main/KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2: Load dataset
# If using Google Colab, you can mount your Google Drive
# from google.colab import drive
# drive.mount('/content/drive')
# df = pd.read_csv('/content/drive/MyDrive/path_to/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# If using local file (path is relative to the notebook's working directory)
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Step 3: Initial Inspection
print(df.head())
# DataFrame.info() writes its report straight to stdout and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Step 4: Convert 'TotalCharges' to numeric
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising, so those rows can be removed in the next step.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Step 5: Drop missing values
# Removes every row that contains at least one NaN (including the ones
# introduced by the coercion above).
df.dropna(inplace=True)

# Step 6: Define target and features
# 'customerID' is dropped from the features together with the 'Churn' label.
X = df.drop(['customerID', 'Churn'], axis=1)
y = df['Churn'].values

# Step 7: Convert categorical variables using dummy encoding
# drop_first=True omits the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# Step 8: Train-test split
# 25% of the rows are held out for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Step 9: Feature scaling
# The scaler is fitted on the training split only and then applied to the
# test split, so no statistics from the test rows are used for fitting.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Step 10: Check results
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Read the raw Telco churn CSV from the working directory
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Step 3: Data preprocessing
# Unparseable 'TotalCharges' entries become NaN (errors='coerce'); every row
# holding a NaN is then discarded.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce')
).dropna()

# Target labels as a NumPy array; features are everything except the id and label
y = df['Churn'].values
X = df.drop(columns=['customerID', 'Churn'])

# One-hot encoding for categorical variables (first level of each is dropped)
X = pd.get_dummies(X, drop_first=True)

# Step 4: Train-test split (25% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Step 5: Feature scaling -- fit on the training split, reuse on the test split
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Step 6: Build and train the KNN model
# Other settings worth trying: n_neighbors=7, weights='distance', etc.
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Step 7: Make predictions on the held-out rows
y_pred = model.predict(X_test)

# Step 8: Evaluate the model
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy_pct = accuracy_score(y_test, y_pred) * 100

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)
print("\nAccuracy Score: {:.2f}%".format(accuracy_pct))


Confusion Matrix:
 [[1097  203]
 [ 230  228]]

Classification Report:
               precision    recall  f1-score   support

          No       0.83      0.84      0.84      1300
         Yes       0.53      0.50      0.51       458

    accuracy                           0.75      1758
   macro avg       0.68      0.67      0.67      1758
weighted avg       0.75      0.75      0.75      1758


Accuracy Score: 75.37%


In [3]:
# Step 9: Score a single sample.
# The first row of the (already scaled) test matrix stands in for a "new"
# customer; slicing with [:1] keeps the 2-D (1, n_features) shape that the
# estimator expects.
new_data_raw = X_test[:1]

# Class probabilities and the hard label for that one row
new_probability = model.predict_proba(new_data_raw)
new_prediction = model.predict(new_data_raw)

print("Prediction:", new_prediction[0])  # 'Yes' or 'No'
print("Probability of Churn [No, Yes]:", new_probability[0])


Prediction: No
Probability of Churn [No, Yes]: [1. 0.]


In [4]:
# Step 10: Manual new input (must match the number AND order of columns in X)
# NOTE(review): the values are positional -- confirm each one against
# list(X.columns) before trusting the prediction.
manual_input = np.array([[0, 1, 70.35, 1397.475, 0, 1, 0, 1, 1, 0, 1, 0,
                          1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]])

# The scaler was fitted on a DataFrame, so give it a frame carrying the same
# column names. This avoids scikit-learn's "X does not have valid feature
# names" warning, and the DataFrame constructor raises immediately if the
# hand-typed row does not have exactly one value per feature column.
manual_input_df = pd.DataFrame(manual_input, columns=X.columns)
manual_input_scaled = sc.transform(manual_input_df)

# Hard label and class probabilities for the single hand-built customer
manual_pred = model.predict(manual_input_scaled)
manual_proba = model.predict_proba(manual_input_scaled)

print("New Customer Prediction:", manual_pred[0])
print("Churn Probabilities [No, Yes]:", manual_proba[0])


New Customer Prediction: No
Churn Probabilities [No, Yes]: [0.8 0.2]


