In [1]:
import pandas as pd

In [2]:
df = pd.read_csv ('marketing_customer_analysis.csv')

In [3]:
# First of all, let's take a look to see our data frame

df

Unnamed: 0.1,Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Vehicle Type
0,0,DK49336,Arizona,4809.216960,No,Basic,College,2/18/11,Employed,M,...,0.0,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize,
1,1,KX64629,California,2228.525238,No,Basic,College,1/18/11,Unemployed,F,...,0.0,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,
2,2,LZ68649,Washington,14947.917300,No,Basic,Bachelor,2/10/11,Employed,M,...,0.0,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize,A
3,3,XL78013,Oregon,22332.439460,Yes,Extended,College,1/11/11,Employed,M,...,0.0,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A
4,4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,1/17/11,Medical Leave,F,...,,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10905,10905,FE99816,Nevada,15563.369440,No,Premium,Bachelor,1/19/11,Unemployed,F,...,,7,Personal Auto,Personal L1,Offer3,Web,1214.400000,Luxury Car,Medsize,A
10906,10906,KX53892,Oregon,5259.444853,No,Basic,College,1/6/11,Employed,F,...,0.0,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize,A
10907,10907,TL39050,Arizona,23893.304100,No,Extended,Bachelor,2/6/11,Employed,F,...,0.0,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize,
10908,10908,WA60547,California,11971.977650,No,Premium,College,2/13/11,Employed,F,...,4.0,6,Personal Auto,Personal L1,Offer1,Branch,618.288849,SUV,Medsize,A


In [4]:
#Show the dataframe shape.

df.shape

(10910, 26)

In [5]:
# Standardize header names.

df.columns= df.columns.str.lower() # Put all column headers in lower case
df.columns = df.columns.str.replace(' ', '_') # Eliminate blank spaces
df.columns

Index(['unnamed:_0', 'customer', 'state', 'customer_lifetime_value',
       'response', 'coverage', 'education', 'effective_to_date',
       'employmentstatus', 'gender', 'income', 'location_code',
       'marital_status', 'monthly_premium_auto', 'months_since_last_claim',
       'months_since_policy_inception', 'number_of_open_complaints',
       'number_of_policies', 'policy_type', 'policy', 'renew_offer_type',
       'sales_channel', 'total_claim_amount', 'vehicle_class', 'vehicle_size',
       'vehicle_type'],
      dtype='object')

In [6]:
# Which columns are numerical? And categorical? First we check the data types of each column.

df.dtypes

unnamed:_0                         int64
customer                          object
state                             object
customer_lifetime_value          float64
response                          object
coverage                          object
education                         object
effective_to_date                 object
employmentstatus                  object
gender                            object
income                             int64
location_code                     object
marital_status                    object
monthly_premium_auto               int64
months_since_last_claim          float64
months_since_policy_inception      int64
number_of_open_complaints        float64
number_of_policies                 int64
policy_type                       object
policy                            object
renew_offer_type                  object
sales_channel                     object
total_claim_amount               float64
vehicle_class                     object
vehicle_size    

To be properly defined, the following variables should be:

- Numerical: unnamed:_0, customer_lifetime_value, number_of_open_complaints, income, monthly premiu auto, months since last claim, months since policy inception, number of policies, total_claim_amount
- Categorical: the rest 

In [7]:
# Check and deal with NaN values.

df.isnull().sum()

unnamed:_0                          0
customer                            0
state                             631
customer_lifetime_value             0
response                          631
coverage                            0
education                           0
effective_to_date                   0
employmentstatus                    0
gender                              0
income                              0
location_code                       0
marital_status                      0
monthly_premium_auto                0
months_since_last_claim           633
months_since_policy_inception       0
number_of_open_complaints         633
number_of_policies                  0
policy_type                         0
policy                              0
renew_offer_type                    0
sales_channel                       0
total_claim_amount                  0
vehicle_class                     622
vehicle_size                      622
vehicle_type                     5482
dtype: int64

In [8]:
# Calculate the mean and median and analyze the stdev of the numerical columns that have nulls to see if it has sense to substitute them for one of the values

# number_of_open_complaints
df_nooc_median = df['number_of_open_complaints'].median()
df_nooc_mean = df['number_of_open_complaints'].mean()
df_nooc_stdev = df['number_of_open_complaints'].std()
print(df_nooc_stdev) # As stdev is low, it is logical to substitute NaN by the mean.

0.9124571814676624


In [9]:
df['number_of_open_complaints'] = df['number_of_open_complaints'].fillna(df_nooc_median)

In [10]:
# months_since_last_claim
df_mslc_median = df['months_since_last_claim'].median()
df_mslc_mean = df['months_since_last_claim'].mean()
df_mslc_stdev = df['months_since_last_claim'].std()
print(df_mslc_stdev) # As stdev is quite high, it is logical to substitute NaN by the median.

10.080348786080851


In [11]:
df['months_since_last_claim'] = df['months_since_last_claim'].fillna(df_mslc_median)

In [12]:
# The null values in categorical columns will be transformed to 'unknown'

df['state'] = df['state'].fillna('unknown') #state
df['response'] = df['response'].fillna('unknown') #response
df['vehicle_class'] = df['vehicle_class'].fillna('unknown') #vehicle_class
df['vehicle_size'] = df['vehicle_size'].fillna('unknown') #vehicle_size
df['vehicle_type'] = df['vehicle_type'].fillna('unknown') #vehicle_type

In [13]:
# Check if all null values have been removed (False means there are no null values)

df.isnull().values.any()

False

In [14]:
# Now that we have dealed with the NaNs, let's transform the wrong data types:
# 'Effective to date' should be a datetime data type
# 'Months since last claim' should be an integer
# 'Number_of_open_complaints' should be an integer

df['effective_to_date']=pd.to_datetime(df['effective_to_date'], errors='coerce') 
df['number_of_open_complaints'] = df['number_of_open_complaints'].astype('int', errors='ignore')
df['months_since_last_claim'] = df['months_since_last_claim'].astype('int', errors='ignore')

In [15]:
df.dtypes

unnamed:_0                                int64
customer                                 object
state                                    object
customer_lifetime_value                 float64
response                                 object
coverage                                 object
education                                object
effective_to_date                datetime64[ns]
employmentstatus                         object
gender                                   object
income                                    int64
location_code                            object
marital_status                           object
monthly_premium_auto                      int64
months_since_last_claim                   int64
months_since_policy_inception             int64
number_of_open_complaints                 int64
number_of_policies                        int64
policy_type                              object
policy                                   object
renew_offer_type                        

In [16]:
# I decided to eliminate the column "unnamed:_0" as it brings no value, it's a copy of the automatic index.

df.drop(['unnamed:_0'], axis = 1, inplace = True) 

In [17]:
# Extract the months from the dataset and store them in a separate column.

df['month']=pd.DatetimeIndex(df['effective_to_date']).month

In [18]:
df.columns

Index(['customer', 'state', 'customer_lifetime_value', 'response', 'coverage',
       'education', 'effective_to_date', 'employmentstatus', 'gender',
       'income', 'location_code', 'marital_status', 'monthly_premium_auto',
       'months_since_last_claim', 'months_since_policy_inception',
       'number_of_open_complaints', 'number_of_policies', 'policy_type',
       'policy', 'renew_offer_type', 'sales_channel', 'total_claim_amount',
       'vehicle_class', 'vehicle_size', 'vehicle_type', 'month'],
      dtype='object')

In [22]:
# Filter the data to show only the information for the first quarter (we look only january and february because there is no data from march)

df_1stq = df.query('month <=2')

In [25]:
df_1stq['month'].unique() # There is no data from march

array([2, 1])

In [29]:
df_1stq

Unnamed: 0,customer,state,customer_lifetime_value,response,coverage,education,effective_to_date,employmentstatus,gender,income,...,number_of_policies,policy_type,policy,renew_offer_type,sales_channel,total_claim_amount,vehicle_class,vehicle_size,vehicle_type,month
0,DK49336,Arizona,4809.216960,No,Basic,College,2011-02-18,Employed,M,48029,...,9,Corporate Auto,Corporate L3,Offer3,Agent,292.800000,Four-Door Car,Medsize,unknown,2
1,KX64629,California,2228.525238,No,Basic,College,2011-01-18,Unemployed,F,0,...,1,Personal Auto,Personal L3,Offer4,Call Center,744.924331,Four-Door Car,Medsize,unknown,1
2,LZ68649,Washington,14947.917300,No,Basic,Bachelor,2011-02-10,Employed,M,22139,...,2,Personal Auto,Personal L3,Offer3,Call Center,480.000000,SUV,Medsize,A,2
3,XL78013,Oregon,22332.439460,Yes,Extended,College,2011-01-11,Employed,M,49078,...,2,Corporate Auto,Corporate L3,Offer2,Branch,484.013411,Four-Door Car,Medsize,A,1
4,QA50777,Oregon,9025.067525,No,Premium,Bachelor,2011-01-17,Medical Leave,F,23675,...,7,Personal Auto,Personal L2,Offer1,Branch,707.925645,Four-Door Car,Medsize,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10905,FE99816,Nevada,15563.369440,No,Premium,Bachelor,2011-01-19,Unemployed,F,0,...,7,Personal Auto,Personal L1,Offer3,Web,1214.400000,Luxury Car,Medsize,A,1
10906,KX53892,Oregon,5259.444853,No,Basic,College,2011-01-06,Employed,F,61146,...,6,Personal Auto,Personal L3,Offer2,Branch,273.018929,Four-Door Car,Medsize,A,1
10907,TL39050,Arizona,23893.304100,No,Extended,Bachelor,2011-02-06,Employed,F,39837,...,2,Corporate Auto,Corporate L3,Offer1,Web,381.306996,Luxury SUV,Medsize,unknown,2
10908,WA60547,California,11971.977650,No,Premium,College,2011-02-13,Employed,F,64195,...,6,Personal Auto,Personal L1,Offer1,Branch,618.288849,SUV,Medsize,A,2


In [27]:
# In order to define a function, I will read again the file to have the original df to work on

dfF = pd.read_csv ('marketing_customer_analysis.csv')

In [30]:
def clean(filename, numcolumn1, numcolumn2, catcolumn1, catcolumn2, catcolumn3, catcolumn4, catcolumn5, datecolumn):
    
    dfF = pd.read_csv (filename) #read file
    
    shape = df.shape #show df shape
    print(shape)
    
    df.columns= df.columns.str.lower() #put all column headers in lower case
    df.columns = df.columns.str.replace(' ', '_') #eliminate blank spaces
    nulls = df.isnull().sum() #show null values
    print(nulls) 
    
    df_numcolumn1_median = df[numcolumn1].median() #calculate median
    df[numcolumn1] = df[numcolumn1].fillna(df_column1_median) #I have simplified it, it always transforms nulls to the median.
    df_numcolumn2_median = df[numcolumn2].median() #calculate median
    df[numcolumn2] = df[numcolumn2].fillna(df_column2_median) #I have simplified it, it always transforms nulls to the median.
    df[catcolumn1] = df[catcolumn1].fillna('unknown') #nan of a categorical column to "unknown"
    df[catcolumn2] = df[catcolumn2].fillna('unknown') #nan of a categorical column to "unknown"
    df[catcolumn3] = df[catcolumn3].fillna('unknown') #nan of a categorical column to "unknown"
    df[catcolumn4] = df[catcolumn4].fillna('unknown') #nan of a categorical column to "unknown"
    df[catcolumn5] = df[catcolumn5].fillna('unknown') #nan of a categorical column to "unknown"
    df.isnull().values.any() #check that there are no nulls
    
    df[datecolumn]=pd.to_datetime(df[datecolumn], errors='coerce') #change data type
    df[numcolumn1] = df[numcolumn1].astype('int', errors='ignore') #change data type
    df[numcolumn2] = df[numcolumn2].astype('int', errors='ignore') #change data type
    
    df['month']=pd.DatetimeIndex(df[datecolumn]).month # Extract the months from the dataset and store them in a separate column.
    df_1stq = df.query('month <=3') #Filter data from 1st quarter and show new df
    print(df_1stq)
    
    

In [33]:
clean('marketing_customer_analysis.csv', 'number_of_open_complaints', 'months_since_last_claim', 'state', 'response', 'vehicle_class', 'vehicle_size', 'vehicle_type', 'effective_to_date')

FileNotFoundError: [Errno 2] No such file or directory: 'filename'