# Data Source:
Kaggle (ongoing competition): https://www.kaggle.com/c/prudential-life-insurance-assessment

# Project goals:

**Analyze the features collected from individuals applying for life insurance at Prudential in order to predict the company's response.**

Motivation for the problem:

Once Prudential has collected the features from an applicant, it takes a long time to reach a decision. If we can build a model that accurately predicts those decisions, the company can use it to determine its response quickly.

# Notes on dataset:

**59381 rows,  127 columns**: 126 features (some of which are dummy variables) and 1 outcome column. The outcome column is called "Response". There are 8 nominal values (1,2,3,4,5,6,7,8) in the Response column.

In [None]:
# import packages and magic here
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import bokeh
% matplotlib inline
import seaborn as sns

In [None]:
# globals here

In [None]:
# Load the training data; the first CSV column is used as the row index
input_csv = "./train.csv"

df_raw = pd.read_csv(filepath_or_buffer=input_csv, index_col=0)
df_raw.head(5)

In [None]:
# Identify predictor variables and response variable and create corresponding dfs


In [None]:
# Number of missing values in each column
df_raw.isnull().sum(axis=0)

In [None]:
# Distinct values that appear in the outcome column
df_raw["Response"].unique()

In [None]:
# Look into the employment columns (Employment_Info_1 .. Employment_Info_6)
emp_info = ['Employment_Info_{}'.format(n) for n in range(1, 7)]

# `.ix` has been removed from pandas; `.loc` is the label-based replacement.
# NOTE(review): assumes the index read from the first CSV column is integer-valued,
# in which case `.ix[2:10]` was already a label slice (both ends inclusive) - confirm.
df_raw.loc[2:10, emp_info]

In [None]:
# Check for correlations: scatter every employment column against Employment_Info_1
for ii in emp_info:
    # Plain column selection replaces the removed `.ix[:, col]` indexer
    plt.scatter(df_raw[ii], df_raw['Employment_Info_1'])
    # Label the axes so the successive plots can be told apart
    plt.xlabel(ii)
    plt.ylabel('Employment_Info_1')
    plt.show()

In [None]:
# Check for correlations among the employment_info columns:
# one scatter plot per unordered pair of distinct columns
for ii, x_col in enumerate(emp_info):
    # Only pair with the columns that come later, so each pair is drawn once
    for y_col in emp_info[ii + 1:]:
        # Plain column selection replaces the removed `.ix[:, col]` indexer
        plt.scatter(df_raw[x_col], df_raw[y_col])
        plt.xlabel(x_col)
        plt.ylabel(y_col)
        plt.show()

In [None]:
# For each feature: report the number of unique values and nulls,
# then plot the distribution of its non-null values

for each_feature in df_raw.columns:
    feature_values = df_raw[each_feature]
    num_uniques = len(feature_values.unique())
    num_nulls = feature_values.isnull().sum()
    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print("{}: uniques = {}, nulls = {}".format(each_feature, num_uniques, num_nulls))
    try:
        # Series.dropna() needs no `how`; the original passed the builtin `any` by mistake
        sns.distplot(feature_values.dropna())
        plt.show()
    except Exception as err:
        # Plotting is best-effort (e.g. columns that cannot be plotted as a distribution):
        # report the failure instead of hiding it, then carry on with the next feature
        print("  could not plot {}: {}".format(each_feature, err))
        plt.close()

In [None]:
# Build df2 by dropping, feature by feature, every row that has a null in that feature.
# A real copy is taken so that df_raw stays untouched for the other cells
# (plain assignment would only create a second name for the same DataFrame).
df2 = df_raw.copy()
for each_feature in df2.columns:
    num_nulls = df2[each_feature].isnull().sum()
    print("nulls in {}: {}".format(each_feature, num_nulls))
    if num_nulls != 0:
        # Missing values are NaN, never the string "NaN", so comparing against the
        # string matched nothing; dropna(subset=...) removes the affected rows.
        df2.dropna(subset=[each_feature], inplace=True)
        new_num_nulls = df2[each_feature].isnull().sum()
        print("new nulls in {}: {}".format(each_feature, new_num_nulls))

In [None]:
# Print the Python type of one sample value (row position 2) per column and
# collect in `string_variables` the columns whose sample value is a string
string_variables = []
for position, each_feature in enumerate(df_raw.columns):
    sample_value = df_raw.iloc[2, position]
    print(type(sample_value))
    if isinstance(sample_value, str):
        string_variables.append(each_feature)

In [None]:
# Python type of the Product_Info_2 entry stored under index 2
type(df_raw["Product_Info_2"][2])

In [None]:
# Distinct values of Product_Info_2
df_raw["Product_Info_2"].unique()

In [None]:
# Encode Product_Info_2 as indicator (dummy) columns, each prefixed with the source column name
prod_info_2 = pd.get_dummies(data=df_raw.Product_Info_2, prefix="Product_Info_2")
prod_info_2

In [None]:
# Remove the original Product_Info_2 column from df_raw (modifies df_raw in place)
df_raw.drop(labels=['Product_Info_2'], axis=1, inplace=True)

In [None]:
# Place the dummy columns next to the remaining features, keeping only rows present in both frames
df_expanded = pd.concat(objs=[df_raw, prod_info_2], join="inner", axis=1)

In [None]:
# Define the predictor columns: every column of df_expanded except the outcome.
# Slicing off the last column (`columns[:-1]`) is wrong here: df_expanded is built by
# concatenating the dummy columns after df_raw, so "Response" is not the last column.

pred_variables = df_expanded.columns.drop("Response")
pred_variables

In [None]:
# Label of the last column in df_expanded
df_expanded.columns.tolist()[-1]

In [None]:
# All column labels of df_expanded other than the outcome column "Response"
predictors = [ii for ii in df_expanded.columns if ii != "Response"]

In [None]:
# Display the list of predictor column names
predictors

In [None]:
# Restrict to the predictor columns (`.loc` replaces the removed `.ix` indexer)
predictor_df = df_expanded.loc[:, predictors]
# Pairwise correlation matrix of the predictors
corr_df = predictor_df.corr()

In [None]:
# Preview the first five rows of the correlation matrix
corr_df.head(5)

In [None]:
# For every predictor, report the other predictor it is most strongly correlated with
# (largest absolute correlation). Lines starting with '**' exceed the threshold.
_corr_threshold = 0.5

for pred in predictors:
    # Predictors without a column in the correlation matrix cannot be reported
    if pred not in corr_df.columns:
        continue
    # Correlations with every *other* predictor. Excluding by exact label replaces the
    # original substring test, which also threw away columns such as "X_1" vs "X_10".
    # NaN correlations are dropped.
    others = corr_df[pred].drop(pred).dropna()
    if others.empty:
        # No usable correlation for this predictor
        continue
    max_corr = others.max()
    min_corr = others.min()
    # Pick whichever extreme is larger in magnitude; idxmin/idxmax are taken on the
    # filtered series, so the predictor itself can never be selected.
    if abs(min_corr) > max_corr:
        other_pred = others.idxmin()
        mcorr = min_corr
    else:
        other_pred = others.idxmax()
        mcorr = max_corr
    # Same output as the Python 2 `print '**',` form, but valid on Python 2 and 3
    flag = '** ' if abs(mcorr) > _corr_threshold else ''
    print(flag + 'max_corr({})={:.4f} with {}'.format(pred, mcorr, other_pred))

In [None]:
# Hedge for multicollinearity

