## Import Dataset 

In [32]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [33]:
dataset = pd.read_excel('customer_churn_large_dataset.xlsx')
dataset.head()


Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [34]:
# convert xlsx file into csv
dataset.to_csv("dataset.csv", index=False)

In [35]:
df = pd.read_csv("dataset.csv")
df.head()

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0
1,2,Customer_2,62,Female,New York,1,48.76,172,0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1
4,5,Customer_5,46,Female,Miami,19,58.14,266,0


In [36]:
# data (rows, columns)
df.shape

(100000, 9)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   CustomerID                  100000 non-null  int64  
 1   Name                        100000 non-null  object 
 2   Age                         100000 non-null  int64  
 3   Gender                      100000 non-null  object 
 4   Location                    100000 non-null  object 
 5   Subscription_Length_Months  100000 non-null  int64  
 6   Monthly_Bill                100000 non-null  float64
 7   Total_Usage_GB              100000 non-null  int64  
 8   Churn                       100000 non-null  int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 6.9+ MB


### We don't have null data in any column and three columns (Name, Gender, and Location) are objects.
*Remove CustomerID and Name columns because it don't help us in prediction so we don't need its,*

In [38]:
df = df.drop(['CustomerID', 'Name'], axis=1)
df.head()

Unnamed: 0,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,63,Male,Los Angeles,17,73.36,236,0
1,62,Female,New York,1,48.76,172,0
2,24,Female,Los Angeles,5,85.47,460,0
3,36,Female,Miami,3,97.94,297,1
4,46,Female,Miami,19,58.14,266,0


In [39]:
df.columns

Index(['Age', 'Gender', 'Location', 'Subscription_Length_Months',
       'Monthly_Bill', 'Total_Usage_GB', 'Churn'],
      dtype='object')

In [40]:
df['Gender'].value_counts()

Gender
Female    50216
Male      49784
Name: count, dtype: int64

In [41]:
df['Location'].value_counts()

Location
Houston        20157
Los Angeles    20041
Miami          20031
Chicago        19958
New York       19813
Name: count, dtype: int64

In [42]:
df['Monthly_Bill'].value_counts()

Monthly_Bill
84.37     34
61.99     29
92.46     28
66.38     28
38.26     28
          ..
55.26      4
64.01      3
100.00     3
36.55      3
91.04      3
Name: count, Length: 7001, dtype: int64

In [43]:
df['Subscription_Length_Months'].value_counts()

Subscription_Length_Months
20    4303
22    4267
1     4247
16    4229
2     4228
14    4213
7     4211
11    4200
6     4184
18    4171
5     4171
12    4155
21    4154
13    4154
10    4151
3     4136
9     4134
15    4122
24    4113
19    4106
8     4106
4     4098
23    4083
17    4064
Name: count, dtype: int64

In [44]:
df['Churn'].value_counts()

Churn
0    50221
1    49779
Name: count, dtype: int64

* Female more than male 
* Maximum Customore belong to Houstom
* allmost 50% coustomer Churn

In [45]:
# Group by Churn and calculate values of Location within each group
grouped_data = df.groupby('Location')['Churn'].value_counts()
grouped_data


Location     Churn
Chicago      0        10013
             1         9945
Houston      0        10258
             1         9899
Los Angeles  0        10161
             1         9880
Miami        1        10076
             0         9955
New York     1         9979
             0         9834
Name: count, dtype: int64

#### New York and Miami churn people more than other 

In [46]:
grouped_data = df.groupby('Gender')['Churn'].value_counts()
grouped_data

Gender  Churn
Female  0        25272
        1        24944
Male    0        24949
        1        24835
Name: count, dtype: int64

In [47]:
grouped_data = df.groupby('Total_Usage_GB')['Churn'].value_counts()
grouped_data

Total_Usage_GB  Churn
50              0        132
                1        121
51              0        106
                1        105
52              1        143
                        ... 
498             1        102
499             1        120
                0        116
500             0        139
                1        106
Name: count, Length: 902, dtype: int64

In [48]:

# Calculate the correlation matrix between column1 and column2
df.groupby('Monthly_Bill')['Churn'].value_counts()

Monthly_Bill  Churn
30.00         0        3
              1        2
30.01         1        9
              0        8
30.02         0        4
                      ..
99.98         0        6
99.99         1        5
              0        4
100.00        0        2
              1        1
Name: count, Length: 13988, dtype: int64

## Encoding
*There are two columns (Male and Location) have categorical values*
### using One Hot Encoding

In [49]:
# gender = df.Gender
# gender = pd.get_dummies(gender, drop_first=True, dtype=int)
# location = pd.get_dummies(df.Location, drop_first=True, dtype=int)
# df = df.drop(['Gender', 'Location'], axis=1)
# df = pd.concat([df, gender, location], axis=1)
# df.head()


### Split feature and label

In [50]:
# # feature 
# X = df.drop('Churn', axis=1)

# # label
# y = df['Churn']

In [51]:
# X.head()

In [52]:
y.head()

0    0
1    0
2    0
3    1
4    0
Name: Churn, dtype: int64

## Split data into Training and Testing

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
X_train.shape

(80000, 9)

## Pipeline with encoding and feaure scaling
### encoding and feature scaling

In [55]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [61]:
# define preprocessing
categorical_feature = np.where(np.dtypes(df.columns) == object)

TypeError: 'module' object is not callable

### Create Mode

In [None]:
from sklearn.linear_model import LogisticRegression