In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the dataset from its CSV file, then preview the first rows
data = pd.read_csv(filepath_or_buffer='dataset/dataset.csv')
data.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
data.shape

In [None]:
# Show the last three rows
data.tail(n=3)

In [None]:
# Positional slice that keeps only the final row (still a DataFrame)
data.iloc[-1:]

In [None]:
# Positional slice that keeps only the first row (still a DataFrame)
data.iloc[:1]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# Column labels of the frame
data.columns

In [None]:
# Per-column dtype and non-null count, plus overall memory usage
data.info()

In [None]:
# Cast the churn flag to a 64-bit integer label
# (presumably boolean in the raw file - confirm against data.info())
data['churn'] = data['churn'].astype(np.int64)

In [None]:
# Preview the first rows to check the converted churn column
data.head(n=5)

In [None]:
# Summarise the non-numeric features only: object and bool columns
data.describe(include=[object, bool])

In [None]:
# Absolute number of rows per churn label
data.loc[:, 'churn'].value_counts(normalize=False)

In [None]:
# Share of rows per churn label (fractions that sum to 1)
data.loc[:, 'churn'].value_counts(normalize=True)

### Sorting the dataframe

In [None]:
# Rows ordered by state, ascending; show the first five
data.sort_values(by='state', ascending=True).head()

In [None]:
# Rows ordered by total day charge, largest first; show the first five
data.sort_values(by='total day charge', ascending=False).head()

### Indexing and retrieving columns

In [None]:
# Average total day charge over all rows
data.loc[:, 'total day charge'].mean()

In [None]:
# Mean of the 0/1 churn label, i.e. the overall churn rate
data.loc[:, 'churn'].mean()

In [None]:
# Boolean indexing: keep only the churned rows, then average every numeric column.
# numeric_only=True is required because the frame still contains text columns
# (e.g. 'state', 'international plan'); without it pandas >= 2.0 raises a
# TypeError instead of silently skipping them.
data[data['churn'] == 1].mean(numeric_only=True)

In [None]:
# Average daytime minutes among churned users (churn == 1)
data.loc[data['churn'] == 1, 'total day minutes'].mean()

In [None]:
# Average daytime minutes among users who did not churn (churn == 0)
data.loc[data['churn'] == 0, 'total day minutes'].mean()

In [None]:
## Largest total of international minutes among non-churned users without an international plan
data.loc[
    (data['churn'] == 0) & (data['international plan'] == 'no'),
    'total intl minutes',
].max()

In [None]:
## Indexing: iloc selects by integer position, loc selects by label.
## Here: the first three rows by position.
data.iloc[:3]

In [None]:
# Label-based selection: .loc slices include both endpoints, so this returns
# index labels 0 through 3 and every column from 'state' to 'phone number'
data.loc[:3, 'state':'phone number']

In [None]:
# Position-based selection: first three rows, first two columns (end-exclusive)
data.iloc[:3, :2]

### Applying functions to the dataframe using .apply and lambda

In [None]:
# apply() calls the function once per column; here it returns each column's maximum
data.apply(lambda column: column.max())

In [None]:
## apply() on a single column runs the function on every value; the resulting
## boolean mask keeps the rows whose state code starts with 'W'
data.loc[data['state'].apply(lambda code: code[0] == 'W')].head()

In [None]:
## Values in a column can be swapped by passing a lookup dictionary to map():
## 'no' -> False, 'yes' -> True
d = dict(no=False, yes=True)
data['international plan'] = data['international plan'].map(d)

In [None]:
# Preview the first rows to check the mapped 'international plan' column
data.head(n=5)

In [None]:
# DataFrame.replace returns a new frame instead of modifying `data` in place,
# so the result has to be assigned back; otherwise 'voice mail plan' keeps its
# original 'yes'/'no' strings and the preview below shows unchanged data.
# The nested dict limits the replacement to the 'voice mail plan' column.
data = data.replace({'voice mail plan': d})
data.head()

## Let's do some grouping

In general, grouping data in Pandas goes as follows:

            df.groupby(by=grouping_columns)[columns_to_show].function()

First, the groupby method splits the rows into groups according to the values in grouping_columns. These values become the new index of the resulting dataframe.

Then, the columns of interest are selected (columns_to_show). If columns_to_show is omitted, all columns other than the grouping columns are included.

Finally, one or several functions are applied to the obtained groups per selected columns.
Here is an example where we group the data according to the values of the Churn variable and display statistics of three columns in each group:

In [None]:
# The three usage columns to summarise within each churn group
columns_to_show = ['total intl minutes', 'total day minutes', 'total eve minutes']
# percentiles=[] asks describe() for no additional quantile rows
data.groupby('churn')[columns_to_show].describe(percentiles=[])

In [None]:
# Mean, standard deviation, minimum and maximum of the selected columns per
# churn group. The aggregations are given by name rather than as NumPy
# callables: pandas deprecates passing np.mean/np.std/np.min/np.max to agg()
# (it currently substitutes its own group-wise methods and warns that this will
# change), and the callable form labels the columns 'amin'/'amax' on older
# NumPy versions. String names give stable 'mean', 'std', 'min', 'max' columns.
data.groupby(['churn'])[columns_to_show].agg(['mean', 'std', 'min', 'max'])

## Summary tables

Suppose we want to see how the observations in our sample are distributed in the context of two variables - Churn and International plan. To do so, we can build a contingency table using the crosstab method:

In [None]:
# Contingency table of counts: churn (rows) against international plan (columns)
pd.crosstab(index=data['churn'], columns=data['international plan'])

In [None]:
# Same kind of table for the voice mail plan; normalize=True turns counts
# into fractions of all observations
pd.crosstab(
    index=data['churn'],
    columns=data['voice mail plan'],
    normalize=True,
)

In [None]:
# Pivot table: mean international, evening and night minutes for each area code
data.pivot_table(
    values=['total intl minutes', 'total eve minutes', 'total night minutes'],
    index=['area code'],
    aggfunc='mean',
)

### Visualizing the dataset

In [None]:
import seaborn as sns


In [None]:
## Relationship between international plan and churn: first as a contingency
## table (margins=True adds the row/column totals), then as a plot below
pd.crosstab(
    index=data['churn'],
    columns=data['international plan'],
    margins=True,
)

In [None]:
# Number of customers per international-plan value, split by churn label
sns.countplot(data=data, x='international plan', hue='churn')

We can see that the churn rate is much higher among customers with the international plan; this might be because of loosely controlled
international plans.
Now let's see whether the number of customer service calls is related to the churn rate.

In [None]:
# Number of customers per count of customer service calls, split by churn label
sns.countplot(data=data, x='customer service calls', hue='churn')

The diagram shows that churn increases dramatically starting from 4 customer service calls.