<h1 id="heading">

<a class="anchor-link" href="https://www.kaggle.com/deb009/predict-customer-churn/notebook#heading">¶</a>
</h1>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas_profiling import ProfileReport
import seaborn as sns
import matplotlib.pyplot as plt
from copy import deepcopy

import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    return None


# Monkey-patch warnings.warn with the no-op so that warnings issued through it
# (e.g. by sklearn and seaborn) are silenced for the rest of the session.
warnings.warn = ignore_warn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
        


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Directory holding the competition CSV files.
PATH = "../input/churn-risk-rate-hackerearth-ml/"

# Read the three CSV files, each indexed by its 'customer_id' column.
train, test, submission = (
    pd.read_csv(f"{PATH}{stem}.csv", index_col='customer_id')
    for stem in ("train", "test", "sample_submission")
)

In [None]:
# Preview the first five rows of the training set.
train.head(n=5)

In [None]:
# Preview the first five rows of the test set.
test.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
submission.head(n=5)

 # Data Exploration

In [None]:
# Summary of the training set: column names, non-null counts and dtypes.
train.info()

In [None]:
# (rows, columns) of the training set.
train.shape

In [None]:
# Summary of the test set: column names, non-null counts and dtypes.
test.info()

In [None]:
# (rows, columns) of the test set.
test.shape

**NULL VALUES**

It looks like there are null values in some of the columns of both the train and test sets.
We will now count the null values in each column of each dataset.

In [None]:
# Number of missing values in each column of the training set.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test set.
test.isna().sum()

# Train data:

36992 rows, 25 columns

Types of columns:
    1. numerical : 6
    2. categorical : 19 
    
Missing values in 3 columns, they are:
    1. region_category
    2. preferred_offer_types
    3. points_in_wallet
    
# Test data

19919 rows, 24 columns

Types of columns:
    1. numerical : 5
    2. categorical : 19 
 
Missing values in 3 columns, they are:
    1. region_category
    2. preferred_offer_types
    3. points_in_wallet
 

In [None]:
# Number of distinct values in each column of the training set.
train.nunique()

In [None]:
# Number of distinct values in each column of the test set.
test.nunique()

So, we can see that Name and security_no are totally unique in both the test and train data.
We can remove them for now.

In [None]:
# Drop the identifier-like columns from both sets.
# errors="ignore" keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the columns have already been removed.
ID_COLUMNS = ["security_no", "Name"]
train = train.drop(columns=ID_COLUMNS, errors="ignore")
test = test.drop(columns=ID_COLUMNS, errors="ignore")

In [None]:
# Stack the train and test rows into one frame with a fresh 0..n-1 index
# (the original index of each frame is discarded).
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
#profile = ProfileReport(df)
#profile

# Target Variable

In [None]:
# Number of training rows for each churn_risk_score value.
train['churn_risk_score'].value_counts()

In [None]:
# Bar chart of the target class frequencies.
# The column must be passed as `x=`: in seaborn >= 0.12 `data` is the only
# positional parameter, so passing the column name positionally raises TypeError.
sns.countplot(x='churn_risk_score', data=train)

According to the competition, the churn score ranges from 1 to 5 (i.e., good to worse), but here we can also see a value of -1.
After a little searching on the internet, I found that negative churn is actually good.

**What is negative churn?**

It is achieved when the total additional revenue generated from existing customers is greater than the revenue lost from cancellations and downgrades. When your recurring revenue grows without the addition of new customers, you’re achieving positive net revenue retention.

Simply, net negative churn is when current customers are spending so much additional money (services, upgrades, and add-ons) that your churn is offset by it. 

For more information: [https://www.profitwell.com/recur/all/negative-churn](https://www.profitwell.com/recur/all/negative-churn)

There is a large imbalance between the classes of the target column (churn_risk_score): -1, 1 and 2 occur far less often than 3, 4 and 5.
We will handle this when creating the model.

In [None]:
# dtype of each column in the training set.
train.dtypes

In [None]:
# Convert the joining_date and last_visit_time columns of both sets to datetime64[ns].
# NOTE(review): if last_visit_time holds only a time of day, the date part of the
# parsed value is filled in by the parser — confirm this is what is wanted.
for frame in (train, test):
    for date_col in ('joining_date', 'last_visit_time'):
        frame[date_col] = frame[date_col].astype('datetime64[ns]')

In [None]:
# Split the training columns by dtype:
#   numerical   -> int64 / float64 columns
#   categorical -> everything else, except datetime64[ns] columns
feature_cols = train.columns
features = train[feature_cols]

numerical_columns = features.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = features.select_dtypes(
    exclude=['int64', 'float64', 'datetime64[ns]']
).columns

print(len(numerical_columns), len(categorical_columns))



# Categorical variable

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Stacked bar charts of category frequencies: one subplot per categorical
# column, one bar trace per churn_risk_score label.

# Per-label slices of the training data (note: train_0_df holds label -1).
train_0_df = train.loc[train['churn_risk_score'] == -1]
train_1_df = train.loc[train['churn_risk_score'] == 1]
train_2_df = train.loc[train['churn_risk_score'] == 2]
train_3_df = train.loc[train['churn_risk_score'] == 3]
train_4_df = train.loc[train['churn_risk_score'] == 4]
train_5_df = train.loc[train['churn_risk_score'] == 5]

# Label -> slice, in the order the traces are added to each subplot.
frames_by_label = {
    -1: train_0_df,
    1: train_1_df,
    2: train_2_df,
    3: train_3_df,
    4: train_4_df,
    5: train_5_df,
}

# Plot at most this many of the most frequent categories per label and column.
top_n = 10

num_cols = 5
# Keep the original 4-row grid, but grow it (ceiling division) instead of
# failing when there are more than 20 categorical columns.
num_rows = max(4, -(-len(categorical_columns) // num_cols))
fig = make_subplots(rows=num_rows, cols=num_cols)

for index, column in enumerate(categorical_columns):
    # 1-based subplot position, filled row by row.
    i, j = (index // num_cols) + 1, (index % num_cols) + 1

    for label, label_df in frames_by_label.items():
        # Row count per category within this label, most frequent first.
        data = (
            label_df.groupby(column)[column]
            .count()
            .sort_values(ascending=False)
            .head(top_n)
        )
        fig.add_trace(go.Bar(
            x=data.index,
            y=data.values,
            name=f'Label: {label}',
        ), row=i, col=j)

    fig.update_xaxes(title=column, row=i, col=j)

# Layout is figure-wide, so it is set once after all traces are added.
fig.update_layout(
    barmode='stack',
    autosize=False,
    width=1600,
    height=1600,
    showlegend=False,
)
fig.show()

 # avg_frequency_login_days(Represents the no. of times a customer has logged in to the website)

It looks like avg_frequency_login_days (represents the no. of times a customer has logged in to the website) is a numeric column,
but it shows up among the categorical columns because it contains the value ERROR (count: 3500 values), which prevented it from being parsed as a number.

The ERROR value means the website was unable to record avg_frequency_login_days, perhaps due to an internal problem such as a software glitch.
These entries are effectively missing values.
So, we will replace them with NaN.

In [None]:
# errors='coerce' turns every value that cannot be parsed as a number into NaN,
# so the column becomes float.
# Feature engineering for this column is handled in the numeric section.
raw_login_days = train['avg_frequency_login_days']
train['avg_frequency_login_days'] = pd.to_numeric(raw_login_days, errors='coerce')