# Customer Churn Analysis | Data analysis (EDA)

This demo is a continuation of the telco pipeline after data load, cleaning and transformation. We will now explore the data using Snowpark and other Python libraries. This task helps us identify further transformations, discover important variables, and determine any feature engineering that is required later.

## Analysis

**We will analyse the following:**

1. The target variable
2. Variable types (categorical and numerical)
3. Missing data
4. Numerical variables
    - Discrete
    - Continuous
    - Distributions
    - Transformations

5. Categorical variables
    - Cardinality
    - Rare Labels
    - Special mappings


In [None]:
#import sys
#sys.path.append("/Applications/opt/anaconda3/envs/spk-39-ml/lib/python3.9/site-packages")
#sys.path

### Import Snowpark and Python packages

In [None]:
# Snowpark for Python
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import *
from snowflake.snowpark import Session
from snowflake.snowpark import types as T
from snowflake.snowpark.functions import col

# Snowpark ML
import snowflake.ml.modeling.preprocessing as snowml
from snowflake.ml.modeling.pipeline import Pipeline
from snowflake.ml.modeling.metrics.correlation import correlation
from snowflake.ml.modeling.impute import SimpleImputer
from snowflake.ml.modeling.metrics import accuracy_score, precision_score, recall_score
from snowflake.ml.modeling.preprocessing import OneHotEncoder
from snowflake.ml.modeling.preprocessing import StandardScaler
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

# Data Science Libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dython
import math

# Misc
import json
import joblib

# warning suppression
import warnings; warnings.simplefilter('ignore')

import pandas as pd

from sklearn import linear_model

%matplotlib inline
import datetime as dt

import warnings
warnings.filterwarnings("ignore")

# let's import some transformation functions
from snowflake.snowpark.functions import udf, col, lit, translate, is_null, iff

##  Establishing a connection to the Snowflake database using Snowpark

In [None]:

# Read the connection properties from the local JSON credentials file,
# then build a Snowpark session from them.
with open("creds.json", "r") as creds_file:
    snowflake_conn_prop = json.load(creds_file)
session = Session.builder.configs(snowflake_conn_prop).create()

session.sql_simplifier_enabled = True

# Ask the server who we are connected as and which version it runs.
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION
user_name, server_version = snowflake_environment[0][0], snowflake_environment[0][1]

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(user_name))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(server_version))
print('Snowpark for Python version : {}.{}.{}'.format(*snowpark_version[:3]))

### Create a Snowpark Dataframe

In [None]:
%%time

# Reference the TRAIN_CHURN_DATASET_BIN table as a Snowpark DataFrame.
data_df = session.table('TRAIN_CHURN_DATASET_BIN')
# NOTE(review): the line below is disabled leftover code; `raw` is not defined
# anywhere in the visible notebook.
#data = raw.toPandas()

### Check for the Column names in the data set

In [None]:
# Display the column names of the Snowpark DataFrame.
data_df.columns

### Run a SQL statement within a session

In [None]:
# Run a raw SQL query through the session and collect its result;
# the LIMIT clause caps it at 10 rows.
session.sql("SELECT * FROM TRAIN_CHURN_DATASET_BIN LIMIT 10;").collect()

In [None]:
# Print a preview of the DataFrame using the default row limit.
data_df.show()

### Number of records in the dataset

In [None]:
# Number of rows in the training table.
data_df.count()

## Check Target Distribution

Let's begin by exploring the target distribution.

In [None]:
# Pull the Snowpark DataFrame into pandas, then tally how often each
# distinct value of the CHURNVALUE target occurs.
data_pd = data_df.to_pandas()

unique, counts = np.unique(data_pd['CHURNVALUE'], return_counts=True)

# Report the distinct target values alongside their frequencies.
print('Unique values of target variable', unique)
print('Counts of target variable', counts)

In [None]:
# Bar chart of the target value frequencies computed above; the title is set
# on the Axes that seaborn drew into.
sns.barplot(x=unique, y=counts).set_title('Target variable counts in dataset')
plt.show()

In [None]:

# Print pandas' summary of the frame (columns, dtypes, non-null counts) to
# get basic info on our features.
data_pd.info()

In [None]:
# Keep the customer identifiers aside as their own single-column frame.
custid_data = data_pd[['CUSTOMERID']]

# Everything except the customer identifier is treated as feature data.
features_data = data_pd.drop(columns=['CUSTOMERID'])

# Display the resulting feature frame as the cell output.
features_data

In [None]:
# Summary statistics for the feature columns.
features_data.describe()

### 2.2 Feature scaling
#### There are three numerical columns with the following distributions:

In [None]:
def distplot(feature, frame, color='g'):
    """Plot the distribution of one column as a histogram with a KDE overlay.

    Args:
        feature: Name of the column in ``frame`` to plot.
        frame: Object indexable by ``feature`` (a pandas DataFrame in this
            notebook).
        color: Colour handed to seaborn for the plot; defaults to ``'g'``.

    Returns:
        None. The plot is drawn on a new 8x3 matplotlib figure.
    """
    plt.figure(figsize=(8, 3))
    plt.title("Distribution for {}".format(feature))
    # ``sns.distplot`` is deprecated and scheduled for removal from seaborn.
    # This notebook silences all warnings, so the deprecation notice would
    # never be seen. ``histplot`` on a density scale with a KDE overlay is the
    # documented replacement.
    sns.histplot(frame[feature], kde=True, stat="density", color=color)

# Numeric columns examined in this section.
numerical_cols = ['TENUREMONTHS', 'MONTHLYCHARGES', 'TOTALCHARGES']
# Summary statistics restricted to those numeric columns.
features_data[numerical_cols].describe()

### Distribution of Numerical Variables

In [None]:
# Draw one distribution plot per numeric column.
for numeric_feature in numerical_cols:
    distplot(numeric_feature, features_data)

### Distribution of Categorical Variables

In [None]:
# Columns treated as categorical variables in the count plots below.
# NOTE(review): the names look like one-hot / binned indicator columns,
# presumably produced by the upstream transformation step. Verify against the
# TRAIN_CHURN_DATASET_BIN schema.
categorical_vars = [
    'PHONESERVICE_YES', 'MULTIPLELINES_NO_PHONE_SERVICE', 'MULTIPLELINES_YES',
    'INTERNETSERVICE_FIBER_OPTIC', 'INTERNETSERVICE_NO',
    'ONLINESECURITY_NO_INTERNET_SERVICE', 'ONLINESECURITY_YES',
    'ONLINEBACKUP_NO_INTERNET_SERVICE', 'ONLINEBACKUP_YES',
    'DEVICEPROTECTION_NO_INTERNET_SERVICE', 'DEVICEPROTECTION_YES',
    'TECHSUPPORT_NO_INTERNET_SERVICE', 'TECHSUPPORT_YES',
    'STREAMINGTV_NO_INTERNET_SERVICE', 'STREAMINGTV_YES',
    'STREAMINGMOVIES_NO_INTERNET_SERVICE', 'STREAMINGMOVIES_YES',
    'CONTRACT_ONE_YEAR', 'CONTRACT_TWO_YEAR', 'PAPERLESSBILLING_YES',
    'PAYMENTMETHOD_CREDIT_CARD', 'PAYMENTMETHOD_ELECTRONIC_CHECK',
    'PAYMENTMETHOD_MAILED_CHECK', 'MONTHLYCHARGESBIN_LOW',
    'MONTHLYCHARGESBIN_MEDIUM', 'TOTALCHARGESBIN_LOW',
    'TOTALCHARGESBIN_MEDIUM'
]

In [None]:
# Lay the count plots out on a grid that is `num_cols` wide, with as many
# rows as needed to fit every categorical variable (ceiling division).
num_cols = 4
num_rows = (len(categorical_vars) + num_cols - 1) // num_cols

# squeeze=False keeps `axes` two-dimensional even when the grid has a single
# row, so the flattening below works for any number of variables.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 20), squeeze=False)
flat_axes = axes.ravel()

# One count plot per categorical variable, filling the grid row by row.
# The loop deliberately avoids a variable named `col`: that name is imported
# from snowflake.snowpark.functions and rebinding it to an int would break
# any later Snowpark expression in the same kernel session.
for ax, var in zip(flat_axes, categorical_vars):
    sns.countplot(data=features_data, x=var, ax=ax)
    ax.set_title(var)
    # Rotate the x tick labels without re-setting the label texts.
    ax.tick_params(axis='x', labelrotation=45)

# Hide the grid cells left over when the variable count is not a multiple
# of `num_cols`, so no empty axes frames are drawn.
for ax in flat_axes[len(categorical_vars):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# Print a preview of the DataFrame, passing 10 as the row limit.
data_df.show(10)

## Off to ~02 notebook for model deployment


In [None]:
# Close the Snowpark session now that the analysis is finished.
session.close()