#### 1. Check the data types of the columns. Get the numeric data into a dataframe called `numericals` and the categorical columns into a dataframe called `categoricals`. (You can use `np.number` and `object` to select the numerical and categorical data types respectively; the `np.object` alias mentioned in older versions of this exercise was removed in NumPy 1.24.)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency 

# Load the raw customer data and standardise the headers in one chain:
# first split the single run-together header, then lower-case every label.
df = (
    pd.read_csv('files_for_lab/csv_files/marketing_customer_analysis.csv')
    .rename(columns = {'EmploymentStatus': 'Employment Status'})
    .rename(columns = str.lower)
)

print(df.dtypes)

# Split the frame by dtype: object-dtype columns on one side,
# float/integer columns on the other.
categoricals = df.select_dtypes(include = 'object')
numericals = df.select_dtypes(include = ['float', 'integer'])

#### 2. Now we will try to check the normality of the numerical variables visually:

##### 2.1. Use seaborn library to construct distribution plots for the numerical variables

In [None]:
# Draw one seaborn distribution plot per numerical column.
# The columns are listed explicitly so the plotting order stays fixed.
displot_columns = [
    'customer lifetime value',
    'income',
    'monthly premium auto',
    'months since last claim',
    'months since policy inception',
    'number of open complaints',
    'number of policies',
    'total claim amount',
]

# sns.displot is figure-level, so every call opens its own figure.
for column in displot_columns:
    sns.displot(numericals[column])

# Close the cell with plt.show(), as the other plotting cells do; ending on a
# statement also avoids echoing a FacetGrid repr.
plt.show()

##### 2.2. Use Matplotlib to construct histograms

In [None]:
# Histogram of every numerical column, stacked vertically in a single figure.
# Each entry is (column name, number of bins); list order is the panel order.
histogram_specs = [
    ('months since last claim', 35),
    ('number of open complaints', 5),
    ('customer lifetime value', 10),
    ('income', 20),
    ('monthly premium auto', 20),
    ('number of policies', 8),
    ('total claim amount', 100),
    ('months since policy inception', 100),
]

fig, axes = plt.subplots(nrows = len(histogram_specs), ncols = 1, figsize = (5, 30))

# Draw directly on the Axes that plt.subplots already created instead of
# re-selecting each panel through plt.subplot(...).
for ax, (column, n_bins) in zip(axes, histogram_specs):
    ax.hist(numericals[column], bins = n_bins)
    ax.set_xlabel(column)
    ax.set_ylabel('frequency')

plt.show()

##### 2.3. Do the distributions for different numerical variables look like a normal distribution?

The majority of the distributions seem to be logarithmic, except for the total claim amount.

#### 3. For the numerical variables, check the multicollinearity between the features. Please note that we will use the column `total_claim_amount` later as the target variable.

In [None]:
# Check collinearity between the numerical features with a Pearson
# correlation matrix, shown as a heatmap.

# Pairwise correlation matrix of the numerical columns
corr = numericals.corr()

# Boolean mask that is True on and above the diagonal (the upper triangle),
# so each pair of features is displayed only once
mask = np.triu(np.ones(corr.shape, dtype = bool))

# Figure and axes that will hold the heatmap
fig, ax = plt.subplots(figsize = (12, 10))

# Annotated heatmap; cells where the mask is True are left blank
sns.heatmap(corr, mask = mask, annot = True, ax = ax)

plt.show()

There's no obvious strong correlation between the numerical features, so we can move forward and perform chi-square tests of independence between the `state` column and each of the other categorical columns:

In [None]:
print(categoricals.columns)

# Chi-square test of independence between `state` and each categorical
# column listed below. The list order fixes the number printed in front of
# each p-value (1 = response, 2 = coverage, ...).
state_test_columns = [
    'response',
    'coverage',
    'education',
    'employment status',
    'gender',
    'location code',
    'marital status',
    'policy type',
    'policy',
    'renew offer type',
    'sales channel',
    'vehicle size',
    'vehicle class',
]

for number, column in enumerate(state_test_columns, start = 1):
    # Contingency table of counts: `state` values as rows, `column` values as columns
    st_resp = pd.crosstab(categoricals['state'], categoricals[column])
    # chi2_contingency returns the statistic, the p-value, the degrees of
    # freedom and the table of expected frequencies; only p is reported
    chi2, p, dof, array = chi2_contingency(st_resp)
    print(f"{number}.", p)

It seems that `policy` and `policy type` are strongly associated with `state` (possibly because there are specific policies for each state), so we reject the null hypothesis of independence for those.

We can also do more targeted tests, e.g. for vehicle class & vehicle size:

In [None]:
# Targeted chi-square tests on two specific pairs of categorical columns.
# Each pair is (crosstab rows, crosstab columns); list order is print order:
#   1. vehicle class & vehicle size
#   2. policy & policy type
column_pairs = [
    ('vehicle size', 'vehicle class'),
    ('policy', 'policy type'),
]

for position, (row_column, col_column) in enumerate(column_pairs, start = 1):
    st_resp = pd.crosstab(categoricals[row_column], categoricals[col_column])
    chi2, p, dof, array = chi2_contingency(st_resp)
    print(f"{position}.", p)

#### 4. Drop one of the two features that show a high correlation between them (greater than 0.9). Write code for both the correlation matrix and for seaborn heatmap. If there is no pair of features that have a high correlation, then do not drop any features.

We can reject the null hypothesis of independence for the pairs we tested above, so we can drop one column from each of those pairs:

In [None]:
# Keep the columns with the most detail, i.e. policy type & vehicle class
# NOTE(review): confirm that `policy type` really is the more detailed column
# of the policy pair; this cell cannot show it.
redundant_columns = ['policy', 'vehicle size']

# Rebind `df` rather than dropping in place, and ignore labels that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns = redundant_columns, errors = 'ignore')

# Check columns were dropped
print(df.columns)