<a href="https://colab.research.google.com/github/read-my-name/data-analysis/blob/main/Telecom_Customer_Churn_Analysis_and_Prediction_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'public-telecom-customer-churn-analysis-and-prediction:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F75201%2F8258007%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240421%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240421T032039Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D094db0c22077665d1db358fe6a8d36e7ab14c9830b13ecf824e77cb5c9dc310dd1a1bbb76dff5ef908c623be0f92d00c52ab8f26fbf9f77283e84171e8715a2d0c633b9cfe5b4a729ccf3b1e5879f9af287d4f7fb2c666a3b112a03145dc1e2d19fd7947796c1ed3f272604a7f9e00b2cd559de95e91717bf2c9336ebd280fe185086a780e6e0454e1ebca41d542fa28aa9262164ea7923651926fce353b6456e8e61ccc04213147ce83c28b224b853ee0cc4cdd25e4f208a32ff0b7f2401130ab0b67215f00fc788041fc56aa991bc3e18a2bba668630788c829147917981085883cd5d550681c0ae9f20693d7541b037e6e7cd503fc924acd752a48487171e'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    directory, download_url_encoded = data_source_mapping.split(':')
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be absent; in that case keep the
            # progress bar empty instead of failing on int(None).
            expected_bytes = int(total_length) if total_length else 0
            dl = 0
            # Stream the response to the temp file in CHUNK_SIZE pieces until
            # read() returns an empty bytes object.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / expected_bytes)) if expected_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Flush buffered writes: tarfile.open() below re-opens the temp
            # file by name and must see the complete archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name for the archive object; binding it to
                # `tarfile` would replace the module and break the next
                # iteration of this loop.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')

print('Data source import complete.')


In [None]:
# Import required libraries
#Data Structures
import pandas as pd
import numpy as np
import re
import os

### For installing missingno library, type this command in terminal
#pip install missingno

import missingno as msno

#Sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder

#Plotting
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

In [None]:
# Step 1: Data Loading and Initial Exploration

df = pd.read_csv("/kaggle/input/public-telecom-customer-churn-analysis-and-prediction/train_data.csv")
# This frame is read from test_data.csv, so expose it under a matching name.
test_data = pd.read_csv("/kaggle/input/public-telecom-customer-churn-analysis-and-prediction/test_data.csv")
# `train_data` is the name this cell originally gave the same frame; keep it
# as an alias so any cell that still refers to it continues to work.
train_data = test_data

In [None]:
# Summarise the frame loaded from train_data.csv (columns, non-null counts, dtypes).
df.info()

In [None]:
# Display 4 randomly sampled rows of df.
df.sample(4)

In [None]:
# CustomerID is expected to be a unique identifier; count its distinct values
# so the result can be compared with the number of rows.
df['CustomerID'].nunique()

In [None]:
# Target: the last column of the raw frame.
y = df.iloc[:, -1]

# Features: everything except the two identifier columns and the last column.
x = df.drop(columns=['CustomerID', 'Surname']).iloc[:, :-1]

(x.shape, y.shape)

In [None]:
# A single LabelEncoder instance, re-fitted for each column in turn
label_encoder = LabelEncoder()

# Replace each categorical column with its integer codes
for column in ('Region', 'Gender'):
    x[column] = label_encoder.fit_transform(x[column])

# Show the resulting dtypes to confirm the conversion
x.dtypes

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every metric computed from it, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# missingno bar chart of per-column completeness for the training features.
msno.bar(x_train)

In [None]:
# Percentage of missing entries per feature column in the training split
null_counts = x_train.isnull().sum()
missing_data_percent = 100 * null_counts / len(y_train)
missing_data_percent

In [None]:
# NOTE: this binds a second name to the same DataFrame object as x_train (no
# copy is made), so changes made through x_train_filtered also show in x_train.
x_train_filtered = x_train
x_train_filtered.shape

In [None]:
# Impute missing values with the column mean for numerical columns.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selected from the frame is chained
# in-place modification, which pandas deprecates and which leaves the frame
# unchanged when Copy-on-Write is enabled.
for column in ('NetworkScore', 'Age', 'EstimatedMonthlyUsage'):
    x_train_filtered[column] = x_train_filtered[column].fillna(x_train_filtered[column].mean())

# Set a constant value for missing values in 'IsActiveMember' column
x_train_filtered['IsActiveMember'] = x_train_filtered['IsActiveMember'].fillna(0)  # Assuming 0 represents 'NoActive'

In [None]:
# Re-plot per-column completeness after the imputation step.
msno.bar(x_train_filtered)

In [None]:
# Box plots of the columns of x_train_filtered on one shared axis
# (values are not yet standardised at this point).
plt.figure(figsize=(15,8))
# Request 45-degree x tick labels on the current axes before plotting.
plt.xticks(rotation=45)
sns.boxplot(data = x_train_filtered)

In [None]:
# Summarise the training features (non-null counts and dtypes) after imputation.
x_train_filtered.info()

In [None]:
# Collect the names of all columns whose dtype is not numeric
non_numeric_frame = x_train_filtered.select_dtypes(exclude=['number'])
non_numeric_columns = list(non_numeric_frame.columns)

# Report them
print("Non-numeric columns:", non_numeric_columns)

In [None]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode 'Region' and 'Gender' only if they still hold non-numeric values.
# Re-fitting a LabelEncoder on integer codes renumbers them whenever some code
# is absent from this subset, which would make the training rows disagree
# with rows encoded elsewhere (e.g. the held-out split).
for column in ('Region', 'Gender'):
    if not pd.api.types.is_numeric_dtype(x_train_filtered[column]):
        x_train_filtered[column] = label_encoder.fit_transform(x_train_filtered[column])

# Print data types to verify the changes
x_train_filtered.dtypes

In [None]:
# Standardise the training features: learn the scaling statistics first,
# then apply them to the same frame.
scale = StandardScaler()
scale.fit(x_train_filtered)
x_train_filtered1 = scale.transform(x_train_filtered)

In [None]:
# Column labels of the training frame, in their original order
new_vars = x_train_filtered.columns.tolist()

# Show the labels
print(new_vars)

In [None]:
# Box plots of the standardised features; the scaler output is wrapped in a
# DataFrame so each box is labelled with its column name from new_vars.
plt.figure(figsize=(15,8))
# Request 45-degree x tick labels on the current axes before plotting.
plt.xticks(rotation=45)
sns.boxplot(data = pd.DataFrame(x_train_filtered1, columns=new_vars))

In [None]:
# Heatmap of pairwise correlations between the standardised features
plt.figure(figsize=(10, 8))
scaled_features = pd.DataFrame(x_train_filtered1, columns=new_vars)
sns.heatmap(scaled_features.corr())

In [None]:
# Distribution of the target values in the training split
# (presumably the churn label taken from the last column of df - confirm).
sns.histplot(y_train)

In [None]:
# Fit a 100-tree random forest on the standardised training features.
# random_state is fixed so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
rf.fit(x_train_filtered1, y_train)

In [None]:
# `metrics` is used below but is not imported by the notebook's import cell.
from sklearn import metrics

# The forest was fitted on imputed, standardised training features, so the
# held-out features go through the same steps before scoring. Missing values
# are filled with statistics of the training frame and the already-fitted
# scaler is reused, so nothing is learned from the test rows.
x_test_filtered = x_test.copy()  # leave x_test itself untouched
for column in ('NetworkScore', 'Age', 'EstimatedMonthlyUsage'):
    x_test_filtered[column] = x_test_filtered[column].fillna(x_train_filtered[column].mean())
x_test_filtered['IsActiveMember'] = x_test_filtered['IsActiveMember'].fillna(0)
x_test_filtered1 = scale.transform(x_test_filtered)

# Predictions on the test data
y_pred = rf.predict(x_test_filtered1)

# Evaluate the model
accuracy = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
recall = metrics.recall_score(y_test, y_pred)
f1_score = metrics.f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1_score)

In [None]:
# Compare several classifier families with stratified k-fold cross-validation.
# The sklearn submodules and the stratified_cv helper used by the print
# statements are not defined by the preceding cells, so they are provided here.
from sklearn import ensemble, linear_model, metrics, neighbors, svm
from sklearn.model_selection import StratifiedKFold


def stratified_cv(X, y, clf_class, shuffle=True, n_folds=10, **kwargs):
    """Return out-of-fold predictions from stratified k-fold cross-validation.

    Args:
        X: Feature matrix (array-like, one row per sample).
        y: Target labels (array-like, one entry per sample).
        clf_class: Estimator class; a fresh instance is created for each fold.
        shuffle: Whether to shuffle samples before splitting into folds.
        n_folds: Number of folds.
        **kwargs: Passed to ``clf_class`` when instantiating the estimator.

    Returns:
        A NumPy array shaped like ``y`` in which every entry was predicted by
        a model that did not see that sample during fitting.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    y_pred = y.copy()
    folds = StratifiedKFold(n_splits=n_folds, shuffle=shuffle)
    for train_idx, test_idx in folds.split(X, y):
        clf = clf_class(**kwargs)
        clf.fit(X[train_idx], y[train_idx])
        y_pred[test_idx] = clf.predict(X[test_idx])
    return y_pred


# `X` is not defined by the preceding cells; evaluate on the same preprocessed
# training matrix and labels the random forest above was fitted on.
cv_features, cv_labels = x_train_filtered1, y_train

print('Gradient Boosting Classifier:  {:.2f}'.format(metrics.accuracy_score(cv_labels, stratified_cv(cv_features, cv_labels, ensemble.GradientBoostingClassifier))))
print('Support vector machine(SVM):   {:.2f}'.format(metrics.accuracy_score(cv_labels, stratified_cv(cv_features, cv_labels, svm.SVC))))
print('Random Forest Classifier:      {:.2f}'.format(metrics.accuracy_score(cv_labels, stratified_cv(cv_features, cv_labels, ensemble.RandomForestClassifier))))
print('K Nearest Neighbor Classifier: {:.2f}'.format(metrics.accuracy_score(cv_labels, stratified_cv(cv_features, cv_labels, neighbors.KNeighborsClassifier))))
print('Logistic Regression:           {:.2f}'.format(metrics.accuracy_score(cv_labels, stratified_cv(cv_features, cv_labels, linear_model.LogisticRegression))))