# Auto Insurance Data Cleaning

Data is currently stored in 4 separate CSV files and needs to be compiled into one. The end goal is to use the data both in BI software to build a dashboard and in a model to make predictions, so any transformations should keep the data in a form that is flexible enough for both applications.

In [1]:
# Imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Load in Data

In [2]:
# Load the address table from its CSV export into a dataframe
address_df = pd.read_csv(filepath_or_buffer='Data/address.csv')

# Preview the first ten rows
address_df.head(n=10)

Unnamed: 0,ADDRESS_ID,LATITUDE,LONGITUDE,STREET_ADDRESS,CITY,STATE,COUNTY
0,521301100000.0,32.315803,-96.627896,8457 Wright Mountains Apt. 377,Ennis,TX,Ellis
1,521300000000.0,,,082 Cline Mountains Apt. 353,Irving,TX,Dallas
2,521300200000.0,32.80629,-96.779857,457 John Mills,Dallas,TX,Dallas
3,521301300000.0,32.825737,-96.939687,5726 Barnett Meadow,Irving,TX,Dallas
4,521301000000.0,32.867192,-96.715552,050 Nicholas Views,Dallas,TX,Dallas
5,521301500000.0,33.055527,-96.705288,207 Rebecca Brook,Plano,TX,Collin
6,521300600000.0,33.406005,-96.966034,9983 Jesse Landing,Pilot Point,TX,Denton
7,521300900000.0,32.892217,-97.083184,76627 Waters Estate Apt. 016,Grapevine,TX,Tarrant
8,521300100000.0,32.858974,-96.649463,378 Anderson Manors Suite 859,Dallas,TX,Dallas
9,521301400000.0,32.982515,-96.575038,12710 Vanessa Rest,Sachse,TX,Dallas


In [3]:
# Summarise address_df: row count, per-column non-null counts and dtypes, memory usage
address_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1536673 entries, 0 to 1536672
Data columns (total 7 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   ADDRESS_ID      1536673 non-null  float64
 1   LATITUDE        1406712 non-null  float64
 2   LONGITUDE       1406712 non-null  float64
 3   STREET_ADDRESS  1536673 non-null  object 
 4   CITY            1521239 non-null  object 
 5   STATE           1536673 non-null  object 
 6   COUNTY          1521239 non-null  object 
dtypes: float64(3), object(4)
memory usage: 82.1+ MB


In [4]:
# Count the null values in each column of address_df
address_df.isna().sum()

ADDRESS_ID             0
LATITUDE          129961
LONGITUDE         129961
STREET_ADDRESS         0
CITY               15434
STATE                  0
COUNTY             15434
dtype: int64

In [5]:
# Load the customer table from its CSV export into a dataframe
customer_df = pd.read_csv(filepath_or_buffer='Data/customer.csv')

# Preview the first ten rows
customer_df.head(n=10)

Unnamed: 0,INDIVIDUAL_ID,ADDRESS_ID,CURR_ANN_AMT,DAYS_TENURE,CUST_ORIG_DATE,AGE_IN_YEARS,DATE_OF_BIRTH,SOCIAL_SECURITY_NUMBER
0,221300000000.0,521300000000.0,818.877997,1454.0,2018-12-09,44.474,1978-06-23,608-XX-7640
1,221300100000.0,521300100000.0,974.199182,1795.0,2018-01-02,72.559,1950-05-30,342-XX-6908
2,221300700000.0,521300200000.0,967.375112,4818.0,2009-09-23,55.444,1967-07-07,240-XX-9224
3,221301600000.0,521300600000.0,992.409561,130.0,2022-07-25,53.558,1969-05-25,775-XX-6249
4,221301600000.0,521300600000.0,784.633494,5896.0,2006-10-11,50.22,1972-09-25,629-XX-7298
5,221302700000.0,521301000000.0,909.916163,484.0,2021-08-05,32.641,1990-04-20,194-XX-6050
6,221300700000.0,521300300000.0,1084.048271,2896.0,2014-12-28,70.806,1952-02-29,653-XX-7262
7,221302200000.0,521300800000.0,1277.370767,4096.0,2011-09-15,45.969,1976-12-24,163-XX-1670
8,221302900000.0,521301100000.0,917.201036,2107.0,2017-02-24,56.389,1966-07-27,621-XX-5572
9,221301200000.0,521300500000.0,1095.390747,3387.0,2013-08-24,64.307,1958-08-28,793-XX-1178


In [6]:
# Summarise customer_df: row count, per-column dtypes and memory usage
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2280321 entries, 0 to 2280320
Data columns (total 8 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   INDIVIDUAL_ID           float64
 1   ADDRESS_ID              float64
 2   CURR_ANN_AMT            float64
 3   DAYS_TENURE             float64
 4   CUST_ORIG_DATE          object 
 5   AGE_IN_YEARS            float64
 6   DATE_OF_BIRTH           object 
 7   SOCIAL_SECURITY_NUMBER  object 
dtypes: float64(5), object(3)
memory usage: 139.2+ MB


In [7]:
# Count the null values in each column of customer_df
customer_df.isna().sum()

INDIVIDUAL_ID                  1
ADDRESS_ID                     1
CURR_ANN_AMT                   0
DAYS_TENURE                    0
CUST_ORIG_DATE                 0
AGE_IN_YEARS              167766
DATE_OF_BIRTH                  0
SOCIAL_SECURITY_NUMBER         0
dtype: int64

In [8]:
# Load the demographic table from its CSV export into a dataframe
demographic_df = pd.read_csv(filepath_or_buffer='Data/demographic.csv')

# Preview the first ten rows
demographic_df.head(n=10)

Unnamed: 0,INDIVIDUAL_ID,INCOME,HAS_CHILDREN,LENGTH_OF_RESIDENCE,MARITAL_STATUS,HOME_MARKET_VALUE,HOME_OWNER,COLLEGE_DEGREE,GOOD_CREDIT
0,221302800000.0,125000.0,1.0,8.0,Single,300000 - 349999,1,1,1
1,221303200000.0,42500.0,0.0,0.0,Single,,0,0,0
2,221303200000.0,27500.0,0.0,15.0,Married,75000 - 99999,1,0,1
3,221303200000.0,80372.176,0.0,0.0,,1000 - 24999,1,0,0
4,221303200000.0,125000.0,0.0,0.0,,,0,0,1
5,221303100000.0,70000.0,1.0,14.0,Married,100000 - 124999,1,0,1
6,221303100000.0,87500.0,1.0,3.0,Single,75000 - 99999,1,0,1
7,221303100000.0,62500.0,1.0,5.0,Married,50000 - 74999,1,0,1
8,221303200000.0,125000.0,0.0,3.0,Married,75000 - 99999,1,1,1
9,221303200000.0,42500.0,1.0,5.0,,75000 - 99999,0,1,1


In [9]:
# Summarise demographic_df: row count, per-column dtypes and memory usage
demographic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2112579 entries, 0 to 2112578
Data columns (total 9 columns):
 #   Column               Dtype  
---  ------               -----  
 0   INDIVIDUAL_ID        float64
 1   INCOME               float64
 2   HAS_CHILDREN         float64
 3   LENGTH_OF_RESIDENCE  float64
 4   MARITAL_STATUS       object 
 5   HOME_MARKET_VALUE    object 
 6   HOME_OWNER           int64  
 7   COLLEGE_DEGREE       int64  
 8   GOOD_CREDIT          int64  
dtypes: float64(4), int64(3), object(2)
memory usage: 145.1+ MB


In [10]:
# Count the null values in each column of demographic_df
demographic_df.isna().sum()

INDIVIDUAL_ID               0
INCOME                      0
HAS_CHILDREN                0
LENGTH_OF_RESIDENCE         0
MARITAL_STATUS         431648
HOME_MARKET_VALUE      190205
HOME_OWNER                  0
COLLEGE_DEGREE              0
GOOD_CREDIT                 0
dtype: int64

In [11]:
# Load the termination table from its CSV export into a dataframe
termination_df = pd.read_csv(filepath_or_buffer='Data/termination.csv')

# Preview the first ten rows
termination_df.head(n=10)

Unnamed: 0,INDIVIDUAL_ID,ACCT_SUSPD_DATE
0,221302600000.0,2022-10-09
1,221302800000.0,2022-04-24
2,221302700000.0,2022-05-21
3,221300200000.0,2022-04-27
4,221302600000.0,2022-09-16
5,221302600000.0,2022-09-27
6,221302400000.0,2022-11-23
7,221301300000.0,2022-10-30
8,221302400000.0,2022-04-24
9,221301200000.0,2022-01-21


In [12]:
# Summarise termination_df: row count, per-column non-null counts and dtypes, memory usage
termination_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269259 entries, 0 to 269258
Data columns (total 2 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   INDIVIDUAL_ID    269259 non-null  float64
 1   ACCT_SUSPD_DATE  269259 non-null  object 
dtypes: float64(1), object(1)
memory usage: 4.1+ MB


In [13]:
# Single-column dataframe holding the number of rows loaded from each CSV,
# indexed by the name of the dataframe it was loaded into
total_entries = pd.Series(
    {
        'address_df': len(address_df),
        'customer_df': len(customer_df),
        'demographic_df': len(demographic_df),
        'termination_df': len(termination_df),
    },
    name='total_entries',
).to_frame()

total_entries

Unnamed: 0,total_entries
address_df,1536673
customer_df,2280321
demographic_df,2112579
termination_df,269259


### Observations from initial check
Compared to customer_df, both address_df and demographic_df are incomplete
   * Joining onto customer_df will result in at least 100K rows that are null in key columns
   * Unless a large portion of the customers in termination_df exist only in customer_df, demographic_df is probably the best table to merge the other dataframes onto

#### termination_df
termination_df only has entries if the customer churned; use existence of ids here to make 'churned' column.

#### address_df
Depending on whether the latitude/longitude values are completely bogus or at least match the city and county, nulls can be filled using that information.

#### customer_df
* Missing id entry particularly puzzling
* AGE_IN_YEARS can be calculated from DOB column; then DOB column is superfluous

#### demographic_df
* HOME_MARKET_VALUE nulls seem to be a result of being a renter; use HOME_OWNER to check
* MARITAL_STATUS is more complicated; 'Unknown' is always possible, but check correlation with other columns to see if there's a potential fill method

## Data Cleaning

Before merging:
1) Check strange entry in customer_df
2) Fill in missing AGE_IN_YEARS
3) Check if missing lat/lon can be extrapolated from data, fill if possible
4) Check HOME_MARKET_VALUE to make sure nulls are from non-home owners

In [14]:
# Show the customer_df row(s) whose INDIVIDUAL_ID is null
customer_df.loc[customer_df['INDIVIDUAL_ID'].isna()]

Unnamed: 0,INDIVIDUAL_ID,ADDRESS_ID,CURR_ANN_AMT,DAYS_TENURE,CUST_ORIG_DATE,AGE_IN_YEARS,DATE_OF_BIRTH,SOCIAL_SECURITY_NUMBER
1219186,,,660.852375,2810.0,2015-03-24,,1985-03-05,829-XX-8228


In [15]:
# Keep only the rows that carry an INDIVIDUAL_ID, as a fresh frame
customer_df = customer_df.loc[customer_df['INDIVIDUAL_ID'].notna()].copy()

In [16]:
# Parse the DATE_OF_BIRTH strings (YYYY-MM-DD) into datetime values
customer_df = customer_df.assign(
    DATE_OF_BIRTH=lambda frame: pd.to_datetime(frame['DATE_OF_BIRTH'], format='%Y-%m-%d')
)

In [17]:
# Fill in AGE_IN_YEARS from DATE_OF_BIRTH
#
# The ages already present in the extract are measured as of 2022-12-02 with a
# 365-day year: CUST_ORIG_DATE + DAYS_TENURE lands on 2022-12-02 for the previewed
# rows, and (2022-12-02 - DATE_OF_BIRTH).days / 365 reproduces their AGE_IN_YEARS
# (1978-06-23 -> 44.474, 1950-05-30 -> 72.559). The same reference date and year
# length are used here so that filled ages are consistent with the existing ones.
origin_date = pd.to_datetime('2022-12-02', format='%Y-%m-%d')
customer_df['AGE_IN_YEARS'] = customer_df['AGE_IN_YEARS'].fillna(
    (origin_date - customer_df['DATE_OF_BIRTH']).dt.days / 365)

In [18]:
# Round AGE_IN_YEARS to the nearest whole year (the column stays float)
# Series.round is vectorised; .apply(np.round) called np.round once per row.
customer_df['AGE_IN_YEARS'] = customer_df['AGE_IN_YEARS'].round()

In [19]:
# Re-count nulls per column in customer_df after dropping the id-less row and filling ages
customer_df.isna().sum()

INDIVIDUAL_ID             0
ADDRESS_ID                0
CURR_ANN_AMT              0
DAYS_TENURE               0
CUST_ORIG_DATE            0
AGE_IN_YEARS              0
DATE_OF_BIRTH             0
SOCIAL_SECURITY_NUMBER    0
dtype: int64

In [20]:
# Preview the first ten rows of customer_df after the age fill and rounding
customer_df.head(10)

Unnamed: 0,INDIVIDUAL_ID,ADDRESS_ID,CURR_ANN_AMT,DAYS_TENURE,CUST_ORIG_DATE,AGE_IN_YEARS,DATE_OF_BIRTH,SOCIAL_SECURITY_NUMBER
0,221300000000.0,521300000000.0,818.877997,1454.0,2018-12-09,44.0,1978-06-23,608-XX-7640
1,221300100000.0,521300100000.0,974.199182,1795.0,2018-01-02,73.0,1950-05-30,342-XX-6908
2,221300700000.0,521300200000.0,967.375112,4818.0,2009-09-23,55.0,1967-07-07,240-XX-9224
3,221301600000.0,521300600000.0,992.409561,130.0,2022-07-25,54.0,1969-05-25,775-XX-6249
4,221301600000.0,521300600000.0,784.633494,5896.0,2006-10-11,50.0,1972-09-25,629-XX-7298
5,221302700000.0,521301000000.0,909.916163,484.0,2021-08-05,33.0,1990-04-20,194-XX-6050
6,221300700000.0,521300300000.0,1084.048271,2896.0,2014-12-28,71.0,1952-02-29,653-XX-7262
7,221302200000.0,521300800000.0,1277.370767,4096.0,2011-09-15,46.0,1976-12-24,163-XX-1670
8,221302900000.0,521301100000.0,917.201036,2107.0,2017-02-24,56.0,1966-07-27,621-XX-5572
9,221301200000.0,521300500000.0,1095.390747,3387.0,2013-08-24,64.0,1958-08-28,793-XX-1178


In [21]:
# Remove the DATE_OF_BIRTH and SOCIAL_SECURITY_NUMBER columns from customer_df
customer_df = customer_df.drop(['DATE_OF_BIRTH', 'SOCIAL_SECURITY_NUMBER'], axis='columns')

In [22]:
# List the address rows in which CITY and COUNTY are both null
address_df.loc[address_df[['CITY', 'COUNTY']].isna().all(axis=1)]

Unnamed: 0,ADDRESS_ID,LATITUDE,LONGITUDE,STREET_ADDRESS,CITY,STATE,COUNTY
121,5.213015e+11,32.903031,-96.793966,8889 Stephen Fall Apt. 958,,TX,
221,5.213015e+11,33.036290,-97.004109,973 Patrick Estate Suite 574,,TX,
227,5.213015e+11,32.862242,-97.137152,"PSC 4330, Box 0301",,TX,
257,5.213015e+11,32.862247,-96.885227,Unit 6008 Box 0517,,TX,
274,5.213015e+11,33.207406,-97.141951,Unit 0245 Box 6737,,TX,
...,...,...,...,...,...,...,...
1536346,5.213015e+11,32.751197,-97.325779,262 Anna Throughway Suite 150,,TX,
1536503,5.213015e+11,32.864806,-96.882668,94571 Debra Bypass Suite 491,,TX,
1536509,5.213015e+11,32.917007,-96.740901,441 Donna Village Suite 688,,TX,
1536566,5.213010e+11,32.656155,-97.174315,0468 Timothy Crossing,,TX,


Testing the latitudes and longitude for a few points suggests that those values accurately map to the correct city and county. Since the main goal for the latitude and longitude columns is potential use with Tableau where precision to within the correct county and city is the only thing required, those values can be estimated as long as that connection is preserved.

Similarly for entries without a city and county, those values can be filled using information provided in the dataframe. Using a geolocation API would provide the most accurate results (and would likely be more straightforward), but for the purposes of this analysis using the data already on hand should be suitable.

In [23]:
# Build a lookup keyed by (COUNTY, CITY) from the address rows that have no nulls:
# the min, max and mean of LATITUDE and of LONGITUDE for each pair
locations = (
    address_df.dropna()
    .groupby(['COUNTY', 'CITY'])[['LATITUDE', 'LONGITUDE']]
    .agg(['min', 'max', 'mean'])
)

locations

Unnamed: 0_level_0,Unnamed: 1_level_0,LATITUDE,LATITUDE,LATITUDE,LONGITUDE,LONGITUDE,LONGITUDE
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,min,max,mean
COUNTY,CITY,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Collin,Allen,33.024457,33.150113,33.101216,-96.742174,-96.530944,-96.657203
Collin,Anna,33.283661,33.408346,33.350362,-96.651238,-96.395428,-96.547855
Collin,Blue Ridge,33.236130,33.405859,33.308241,-96.478335,-96.295602,-96.395658
Collin,Celina,33.259998,33.407501,33.324521,-96.882502,-96.617977,-96.766285
Collin,Dallas,32.974941,33.021052,32.998869,-96.882485,-96.760770,-96.821026
...,...,...,...,...,...,...,...
Tarrant,Kennedale,32.608999,32.669161,32.645814,-97.243174,-97.180523,-97.211783
Tarrant,Mansfield,32.511006,32.625666,32.583582,-97.229940,-97.044064,-97.127472
Tarrant,Naval Air Station Jrb,32.759304,32.795340,32.775486,-97.435495,-97.412294,-97.423869
Tarrant,North Richland Hills,32.804314,32.921442,32.863415,-97.263437,-97.176821,-97.218635


In [24]:
# Fill null CITY values using the latitude/longitude bounding boxes in `locations`.
# Only rows whose CITY is still null are candidates, so where boxes overlap the
# first matching (COUNTY, CITY) row of `locations` wins.
unfilled = address_df.loc[address_df['CITY'].isna(), ['LATITUDE', 'LONGITUDE']]

for (county, city), bounds in locations.iterrows():

    # A point is inside the box when min <= value <= max on both axes
    # (`between` is inclusive on both ends; null coordinates never match).
    # Using `>=` against the longitude max would select points beyond the
    # box's edge instead of points inside it.
    in_box = (
        unfilled['LATITUDE'].between(bounds[('LATITUDE', 'min')], bounds[('LATITUDE', 'max')])
        & unfilled['LONGITUDE'].between(bounds[('LONGITUDE', 'min')], bounds[('LONGITUDE', 'max')])
    )
    matched = unfilled.index[in_box]

    address_df.loc[matched, 'CITY'] = city

    # Filled rows are no longer candidates for later boxes
    unfilled = unfilled.drop(index=matched)

In [25]:
# Re-count nulls per column in address_df after the CITY fill
address_df.isna().sum()

ADDRESS_ID             0
LATITUDE          129961
LONGITUDE         129961
STREET_ADDRESS         0
CITY                2372
STATE                  0
COUNTY             15434
dtype: int64

In [26]:
# Fill null COUNTY values using the latitude/longitude bounding boxes in `locations`.
# Only rows whose COUNTY is still null are candidates, so where boxes overlap the
# first matching (COUNTY, CITY) row of `locations` wins.
unfilled = address_df.loc[address_df['COUNTY'].isna(), ['LATITUDE', 'LONGITUDE']]

for (county, city), bounds in locations.iterrows():

    # A point is inside the box when min <= value <= max on both axes
    # (`between` is inclusive on both ends; null coordinates never match).
    # Using `>=` against the longitude max would select points beyond the
    # box's edge instead of points inside it.
    in_box = (
        unfilled['LATITUDE'].between(bounds[('LATITUDE', 'min')], bounds[('LATITUDE', 'max')])
        & unfilled['LONGITUDE'].between(bounds[('LONGITUDE', 'min')], bounds[('LONGITUDE', 'max')])
    )
    matched = unfilled.index[in_box]

    address_df.loc[matched, 'COUNTY'] = county

    # Filled rows are no longer candidates for later boxes
    unfilled = unfilled.drop(index=matched)

In [27]:
# Fill null LATITUDE/LONGITUDE with the mean coordinates of the row's
# (COUNTY, CITY) pair taken from `locations`.
# A row can match at most one (COUNTY, CITY) pair, so the null mask is computed once.
missing_coords = address_df['LATITUDE'].isna()

for (county, city), stats in locations.iterrows():

    target = missing_coords & (address_df['CITY'] == city) & (address_df['COUNTY'] == county)

    # Look the means up by label rather than by position so the fill does not
    # depend on the order of the aggregations in `locations`
    address_df.loc[target, ['LATITUDE', 'LONGITUDE']] = [
        stats[('LATITUDE', 'mean')], stats[('LONGITUDE', 'mean')]]

In [28]:
# Re-count nulls per column in address_df after the COUNTY and coordinate fills
address_df.isna().sum()

ADDRESS_ID           0
LATITUDE          2372
LONGITUDE         2372
STREET_ADDRESS       0
CITY              2372
STATE                0
COUNTY            2372
dtype: int64

In [29]:
# Discard every address row that still has a null in any column
address_df = address_df.dropna(axis='index', how='any')

In [30]:
# Remove the STREET_ADDRESS column from address_df
address_df = address_df.drop('STREET_ADDRESS', axis='columns')

In [31]:
# Preview the first ten rows of the cleaned address_df
address_df.head(10)

Unnamed: 0,ADDRESS_ID,LATITUDE,LONGITUDE,CITY,STATE,COUNTY
0,521301100000.0,32.315803,-96.627896,Ennis,TX,Ellis
1,521300000000.0,32.843968,-96.965442,Irving,TX,Dallas
2,521300200000.0,32.80629,-96.779857,Dallas,TX,Dallas
3,521301300000.0,32.825737,-96.939687,Irving,TX,Dallas
4,521301000000.0,32.867192,-96.715552,Dallas,TX,Dallas
5,521301500000.0,33.055527,-96.705288,Plano,TX,Collin
6,521300600000.0,33.406005,-96.966034,Pilot Point,TX,Denton
7,521300900000.0,32.892217,-97.083184,Grapevine,TX,Tarrant
8,521300100000.0,32.858974,-96.649463,Dallas,TX,Dallas
9,521301400000.0,32.982515,-96.575038,Sachse,TX,Dallas


In [32]:
# Among rows with a null HOME_MARKET_VALUE and HOME_OWNER == 0, count nulls per column;
# the HOME_MARKET_VALUE count shows how many of the 190205 nulls belong to non-home owners
demographic_df.loc[
    demographic_df['HOME_MARKET_VALUE'].isna() & demographic_df['HOME_OWNER'].eq(0)
].isna().sum()

INDIVIDUAL_ID               0
INCOME                      0
HAS_CHILDREN                0
LENGTH_OF_RESIDENCE         0
MARITAL_STATUS          97341
HOME_MARKET_VALUE      183660
HOME_OWNER                  0
COLLEGE_DEGREE              0
GOOD_CREDIT                 0
dtype: int64

In [33]:
# Label HOME_MARKET_VALUE as 'N/A' for rows where it is null and HOME_OWNER is 0
# NOTE(review): pandas.read_csv parses the literal 'N/A' as missing by default, so this
# label reverts to NaN if the frame is written to CSV and read back without
# keep_default_na=False - confirm that downstream loading accounts for this.
demographic_df['HOME_MARKET_VALUE'] = demographic_df['HOME_MARKET_VALUE'].mask(
    demographic_df['HOME_MARKET_VALUE'].isna() & demographic_df['HOME_OWNER'].eq(0), 'N/A')

In [34]:
# Re-count nulls per column in demographic_df after the HOME_MARKET_VALUE fill
demographic_df.isna().sum()

INDIVIDUAL_ID               0
INCOME                      0
HAS_CHILDREN                0
LENGTH_OF_RESIDENCE         0
MARITAL_STATUS         431648
HOME_MARKET_VALUE        6545
HOME_OWNER                  0
COLLEGE_DEGREE              0
GOOD_CREDIT                 0
dtype: int64

Remaining nulls in HOME_MARKET_VALUE will be examined in merged dataframe to weigh alternatives for filling.

## Merging Dataframes

In [35]:
# Left merge starting with demographic_df, then customer_df, address_df, termination_df.
# Join keys are spelled out so that a column name shared by two tables can never
# silently become part of the key.
churn_df = (
    demographic_df.merge(customer_df, how='left', on='INDIVIDUAL_ID')
        .merge(address_df, how='left', on='ADDRESS_ID')
        .merge(termination_df, how='left', on='INDIVIDUAL_ID')
)

# A left merge must not multiply rows; a mismatch means a key is duplicated
# in one of the right-hand tables
assert len(churn_df) == len(demographic_df), 'merge changed the number of rows'

churn_df.head(10)

Unnamed: 0,INDIVIDUAL_ID,INCOME,HAS_CHILDREN,LENGTH_OF_RESIDENCE,MARITAL_STATUS,HOME_MARKET_VALUE,HOME_OWNER,COLLEGE_DEGREE,GOOD_CREDIT,ADDRESS_ID,CURR_ANN_AMT,DAYS_TENURE,CUST_ORIG_DATE,AGE_IN_YEARS,LATITUDE,LONGITUDE,CITY,STATE,COUNTY,ACCT_SUSPD_DATE
0,221302800000.0,125000.0,1.0,8.0,Single,300000 - 349999,1,1,1,521301000000.0,949.447656,4767.0,2009-11-13,83.0,32.492035,-96.361291,Scurry,TX,Kaufman,
1,221303200000.0,42500.0,0.0,0.0,Single,,0,0,0,521301400000.0,732.323969,1606.0,2018-07-10,55.0,32.687588,-97.129238,Arlington,TX,Tarrant,
2,221303200000.0,27500.0,0.0,15.0,Married,75000 - 99999,1,0,1,521301400000.0,1143.329062,4774.0,2009-11-06,83.0,32.748467,-96.47544,Forney,TX,Kaufman,
3,221303200000.0,80372.176,0.0,0.0,,1000 - 24999,1,0,0,521301400000.0,1030.842884,3597.0,2013-01-26,31.0,33.084836,-96.858913,The Colony,TX,Denton,
4,221303200000.0,125000.0,0.0,0.0,,,0,0,1,521301400000.0,1224.092836,723.0,2020-12-09,55.0,32.687588,-97.129238,Arlington,TX,Tarrant,
5,221303100000.0,70000.0,1.0,14.0,Married,100000 - 124999,1,0,1,521301400000.0,896.197651,6291.0,2005-09-11,50.0,32.830436,-96.759775,Dallas,TX,Dallas,
6,221303100000.0,87500.0,1.0,3.0,Single,75000 - 99999,1,0,1,521301400000.0,1027.144843,1204.0,2019-08-16,55.0,32.755547,-97.23573,Fort Worth,TX,Tarrant,
7,221303100000.0,62500.0,1.0,5.0,Married,50000 - 74999,1,0,1,521301400000.0,813.401926,5983.0,2006-07-16,59.0,33.080878,-96.669351,Allen,TX,Collin,
8,221303200000.0,125000.0,0.0,3.0,Married,75000 - 99999,1,1,1,521301400000.0,1126.795506,5308.0,2008-05-21,66.0,32.852216,-97.213551,North Richland Hills,TX,Tarrant,
9,221303200000.0,42500.0,1.0,5.0,,75000 - 99999,0,1,1,521301400000.0,796.800046,2673.0,2015-08-08,36.0,32.661237,-97.398141,Fort Worth,TX,Tarrant,


In [36]:
# Summarise the merged churn_df: row count, per-column dtypes and memory usage
churn_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2112579 entries, 0 to 2112578
Data columns (total 20 columns):
 #   Column               Dtype  
---  ------               -----  
 0   INDIVIDUAL_ID        float64
 1   INCOME               float64
 2   HAS_CHILDREN         float64
 3   LENGTH_OF_RESIDENCE  float64
 4   MARITAL_STATUS       object 
 5   HOME_MARKET_VALUE    object 
 6   HOME_OWNER           int64  
 7   COLLEGE_DEGREE       int64  
 8   GOOD_CREDIT          int64  
 9   ADDRESS_ID           float64
 10  CURR_ANN_AMT         float64
 11  DAYS_TENURE          float64
 12  CUST_ORIG_DATE       object 
 13  AGE_IN_YEARS         float64
 14  LATITUDE             float64
 15  LONGITUDE            float64
 16  CITY                 object 
 17  STATE                object 
 18  COUNTY               object 
 19  ACCT_SUSPD_DATE      object 
dtypes: float64(10), int64(3), object(7)
memory usage: 338.5+ MB


In [37]:
# Count the null values in each column of the merged churn_df
churn_df.isna().sum()

INDIVIDUAL_ID                0
INCOME                       0
HAS_CHILDREN                 0
LENGTH_OF_RESIDENCE          0
MARITAL_STATUS          431648
HOME_MARKET_VALUE         6545
HOME_OWNER                   0
COLLEGE_DEGREE               0
GOOD_CREDIT                  0
ADDRESS_ID                  24
CURR_ANN_AMT                24
DAYS_TENURE                 24
CUST_ORIG_DATE              24
AGE_IN_YEARS                24
LATITUDE                  2621
LONGITUDE                 2621
CITY                      2621
STATE                     2621
COUNTY                    2621
ACCT_SUSPD_DATE        1863314
dtype: int64

In [38]:
# Flag churned customers: 1 when an ACCT_SUSPD_DATE is present, 0 when it is null
churn_df['churned'] = churn_df['ACCT_SUSPD_DATE'].notna().astype(int)

churn_df['churned'].value_counts()

0    1863314
1     249265
Name: churned, dtype: int64

In [39]:
# Remove the ADDRESS_ID and ACCT_SUSPD_DATE columns from churn_df
churn_df = churn_df.drop(['ADDRESS_ID', 'ACCT_SUSPD_DATE'], axis='columns')

In [41]:
# Convert every column name in churn_df to lowercase
churn_df = churn_df.rename(columns=str.lower)

churn_df.columns

Index(['individual_id', 'income', 'has_children', 'length_of_residence',
       'marital_status', 'home_market_value', 'home_owner', 'college_degree',
       'good_credit', 'curr_ann_amt', 'days_tenure', 'cust_orig_date',
       'age_in_years', 'latitude', 'longitude', 'city', 'state', 'county',
       'churned'],
      dtype='object')