## Purpose and Observation

### Purpose:

   - Check for missing values
   - Create the required variables (target, age, etc.)
   - Treat missing values (SimpleImputer with the mean for continuous variables; rows with any remaining NaN values are dropped)
   - Export 'Dataset_model' for use in model building
       

### Improvements that may be made in the notebook:
- Missing values in continuous variables could instead be imputed with KNNImputer, after bucketing.
    - Currently, SimpleImputer (mean strategy) is used to treat missing values in continuous variables.
- Label encoding introduces bias when used with KNNImputer; one-hot encoding can be used instead.

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv

from datetime import datetime
from datetime import date

from sklearn.impute import SimpleImputer

## Read data and Cleaning

In [None]:
# Load the pre-cleaned merged dataset and parse RCD as datetime
# (CSV stores it as text; the final sort orders rows by RCD).
df_merged = pd.read_csv('Merged_clean.csv').assign(
    RCD=lambda frame: pd.to_datetime(frame["RCD"])
)

In [None]:
# Reference only: the raw-data cleaning steps are kept below inside a string
# literal so they do not execute. Remove the surrounding triple quotes to run
# them when the cleaned file (Merged_clean.csv) is not available.
""" Mentioning cleaning codes, remove quotations if dont have clean data file


df_merged=pd.read_csv('merged.csv')


#Marital_status, Own_Edu, Occupation_Group fix
df_merged.replace(['N', 'N.A', 'MISSING'], np.nan, inplace=True )


#STATNAME fix and Focus_region fix

objectlistm = list(df_merged.select_dtypes('object').columns)
for col in objectlistm:
    df_merged[col] = df_merged[col].str.upper()


# putting them under SOUTH
df_merged['Focus_region'].replace(['KKG','ANDHRA','TAMIL NADU'], 'SOUTH', inplace = True)

# RCD and LA_DOB fix
df_merged["LA_DOB"]= pd.to_datetime(df_merged["LA_DOB"])
df_merged["RCD"] = pd.to_datetime(df_merged["RCD"])


#Float dtype fix
floatlist = list(df_merged.select_dtypes('float').columns)
display(df_merged.loc[:, floatlist])

for col in floatlist:
    df_merged.loc[:,col] = df_merged.loc[:,col].apply(np.ceil)
    if df_merged.loc[:,col].isna().sum() == 0:
        df_merged[col] = df_merged[col].astype('int64')

#drop Unnamed:0
x= "Unnamed: 0"

df_merged.drop([x], axis=1, inplace=True)

"""

## Descriptive stats for reference

In [None]:
# Show floats with two decimals in every table rendered from here on.
pd.options.display.float_format = "{:.2f}".format

# Summary statistics of the numeric columns.
display(df_merged.describe())

# info() prints its report (dtypes, non-null counts) itself and returns None,
# so it is called directly; wrapping it in display() would render a stray "None".
df_merged.info()

## Check Missing values

In [None]:
# Flag every record that has at least one missing value, then report the
# count, the share of the dataset, and the affected records themselves.
rows_with_na = df_merged.isnull().any(axis=1)
nulls = rows_with_na.sum()

print("records with missing values: ", nulls)
print("percentage records with missing values: ", 100*nulls/len(df_merged))

display(df_merged[rows_with_na])

# Creating Dataset

- Create Target and other required variables
- Sort values by policy_owner_number, RCD
- Treat missing values


## Creating variables

#### 1. Convert LA_DOB to age

In [None]:
#function to find age
def calc_age(born, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    born : str, datetime.date, datetime.datetime or missing value
        Date of birth. Strings must be formatted as 'YYYY-MM-DD'; date and
        datetime objects (including pandas Timestamps) are used as they are.
    today : datetime.date, optional
        Reference date the age is computed against. Defaults to the current
        date, so results change over time unless this is pinned.

    Returns
    -------
    int or float
        Completed years between ``born`` and ``today``; NaN when ``born`` is
        missing (None, NaN or NaT) so that such rows can be handled by the
        missing-value treatment instead of raising.

    Raises
    ------
    ValueError
        If ``born`` is a string that does not match 'YYYY-MM-DD'.
    """
    # Missing dates of birth cannot be parsed; propagate them as NaN.
    if pd.isna(born):
        return float('nan')

    if isinstance(born, str):
        born = datetime.strptime(born, '%Y-%m-%d').date()
    elif isinstance(born, datetime):
        # Drop the time-of-day part (pandas Timestamp is a datetime subclass).
        born = born.date()

    if today is None:
        today = date.today()

    # Subtract one year when this year's birthday has not been reached yet
    # (the tuple comparison yields a bool, which counts as 0 or 1).
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [None]:
# Derive the age column from the date of birth, then discard the raw
# LA_DOB column since only the age is kept.
df_merged['age'] = df_merged['LA_DOB'].apply(calc_age)
df_merged.drop(columns=['LA_DOB'], inplace=True)

#### 2. Create Target Variable

In [None]:
# target is 1 where Freq is greater than 1; every other row
# (including Freq == 1) gets 0.
df_merged['target'] = np.where(df_merged['Freq'] > 1, 1, 0)

#### 3. Make changes to Product_brief_category

In [None]:
# Fold the 'SAFAL JEEVAN' category into 'TRADITIONAL'
category_mapping = {'SAFAL JEEVAN': 'TRADITIONAL'}
df_merged['Product_brief_category'] = df_merged['Product_brief_category'].replace(category_mapping)

In [None]:
# Sanity check of the variables created above: preview the first rows and
# the category counts after the replacement.
display(
    df_merged.head(),
    df_merged['Product_brief_category'].value_counts(),
)

## Treat Missing Values

### 1. Treat Continuous Variables: Owner_salary

In [None]:
# Fill missing Owner_salary values with the column mean.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
salary_filled = imp.fit_transform(df_merged[['Owner_salary']])
df_merged['Owner_salary'] = salary_filled

display(df_merged['Owner_salary'])

### 2. Drop remaining NaN values

In [None]:
# Report every column that still contains missing values, with the
# percentage of records affected. The threshold is "> 0" so that a column
# with exactly one missing value is listed too (its row is dropped by the
# following dropna just like any other).
missing_share = df_merged.isnull().mean()
features_with_na = [col for col in df_merged.columns if missing_share[col] > 0]

for feature in features_with_na:
    print(feature, np.round(missing_share[feature]*100, 4), ' %missing values')

In [None]:
# Remove every record that still has a missing value in any column.
df_merged.dropna(axis=0, how='any', inplace=True)

# Create and Export 'Dataset_model' to be used for Model Building

In [None]:
# Persist the full cleaned frame (all records, no index column).
df_merged.to_csv(path_or_buf='Merged_clean_and_dropped.csv', index=False)

In [None]:
# Build the modelling dataset: order records by owner and RCD, renumber the
# index, then keep only the first record of each policy_owner_number.
df_merged.sort_values(by=['policy_owner_number', 'RCD'], inplace=True, ignore_index=True)
dataset = df_merged.drop_duplicates(subset=['policy_owner_number'], keep='first')
display(dataset)

# Write the modelling dataset to disk without the index column.
dataset.to_csv('Dataset_model.csv', index=False)