For the proposed project, I want to compare the insurance rates offered by different insurance providers across various categories, for example the person's age, marital status, number of dependents, and the location of the house. This project will be very helpful for a person making family plans, such as when to get married, when to have a baby, and in which state to buy a house to settle in, according to her/his financial status.

I would also like to propose a time series analysis of the insurance rates, which will be helpful for predicting the
future insurance rates of various companies in the market. I will do this using an autoregressive integrated moving average (ARIMA) model, which is well suited to this kind of forecasting.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd

# Plot defaults shared by every figure created after this point:
# font sizes for legend, axis labels, titles and ticks, a (13, 7) figure
# size, and grid lines switched on.
params = {
    'legend.fontsize': 14,
    'axes.labelsize': 16,
    'axes.titlesize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
    'figure.figsize': (13, 7),
    'axes.grid': True,
}
plt.rcParams.update(params)

# Load the insurance rate table from the CSV file.
file1 = 'Rate.csv'
df = pd.read_csv(file1)

# Show which business years the data covers. A bare expression is only
# displayed when it is the last statement of a notebook cell (and never in a
# plain script), so the values are printed explicitly.
print(df['BusinessYear'].unique())

# Split the table into one frame per business year.
df_2014 = df[df['BusinessYear'] == 2014]
df_2015 = df[df['BusinessYear'] == 2015]
df_2016 = df[df['BusinessYear'] == 2016]

# Columns needed for the comparison: the age label plus the rates for an
# individual, a couple, and a couple with one dependent.
_RATE_COLUMNS = ['Age', 'IndividualRate', 'Couple', 'CoupleAndOneDependent']

# 'Age' labels that do not belong to the 21-64 group.
_NON_21_64_LABELS = ['Family Option', '0-20', '65 and over']

# Reduce each year's frame to the columns of interest.
comp_data_2014 = df_2014[_RATE_COLUMNS]
comp_data_2015 = df_2015[_RATE_COLUMNS]
comp_data_2016 = df_2016[_RATE_COLUMNS]

# Age group 21-64: every row whose 'Age' is none of the three labels above.
df_age21_64_14 = comp_data_2014[~comp_data_2014['Age'].isin(_NON_21_64_LABELS)]
df_age21_64_15 = comp_data_2015[~comp_data_2015['Age'].isin(_NON_21_64_LABELS)]
df_age21_64_16 = comp_data_2016[~comp_data_2016['Age'].isin(_NON_21_64_LABELS)]

# Age group '0-20'.
df_age0_20_14 = comp_data_2014.loc[comp_data_2014['Age'] == '0-20']
df_age0_20_15 = comp_data_2015.loc[comp_data_2015['Age'] == '0-20']
df_age0_20_16 = comp_data_2016.loc[comp_data_2016['Age'] == '0-20']

# Age group '65 and over'.
df_age65_14 = comp_data_2014.loc[comp_data_2014['Age'] == '65 and over']
df_age65_15 = comp_data_2015.loc[comp_data_2015['Age'] == '65 and over']
df_age65_16 = comp_data_2016.loc[comp_data_2016['Age'] == '65 and over']

def _plot_individual_rate_histograms(frames, title,
                                     labels=('2014', '2015', '2016'),
                                     colors=('r', 'b', 'g'),
                                     max_rate=8.e3, bins=100):
    """Overlay step histograms of 'IndividualRate' on a new figure.

    One curve is drawn per frame so that the yearly distributions of one age
    group can be compared directly. The y axis is logarithmic.

    Parameters
    ----------
    frames : sequence of DataFrame
        One frame per curve; each must have an 'IndividualRate' column.
    title : str
        Figure title.
    labels, colors : sequence of str
        Legend label and line colour for each frame, in the same order.
    max_rate : float
        Only rates strictly below this value are histogrammed
        (presumably to exclude placeholder/outlier rates -- TODO confirm).
    bins : int
        Number of histogram bins.
    """
    kept = []
    for frame in frames:
        rates = frame['IndividualRate']
        kept.append(rates[rates < max_rate].values)

    # Bin every curve on the same edges. With per-curve edges the bin widths
    # differ between years, so the overlaid counts would not be comparable.
    edges = np.histogram(np.concatenate(kept), bins=bins)[1]

    plt.figure()
    for rates, label, color in zip(kept, labels, colors):
        counts, _ = np.histogram(rates, bins=edges)
        # where='post' holds each count from its bin's left edge to the next
        # edge; repeating the final count extends the last bin to its right
        # edge. The default where='pre' paired with left edges would shift
        # every bin one bin-width to the left.
        plt.step(edges, np.append(counts, counts[-1]), where='post',
                 color=color, label=label, lw=1.5)
    plt.yscale('log')
    plt.xlabel("Individual Rate ($/month)", fontsize=16, color='k')
    plt.ylabel("Frequency", fontsize=16, color='k')
    plt.legend(frameon=False, loc='upper right')
    plt.title(title)


# Compare the individual-rate distributions of the three years, one figure
# per age group (21-64, 0-20, 65 and over).
for _title, _frames in [
        ('Age Group [21-64]',
         (df_age21_64_14, df_age21_64_15, df_age21_64_16)),
        ('Age Group [0-20]',
         (df_age0_20_14, df_age0_20_15, df_age0_20_16)),
        ('Age Group [65 and above]',
         (df_age65_14, df_age65_15, df_age65_16)),
]:
    _plot_individual_rate_histograms(_frames, _title)
