# Data Cleaning 

In [1]:
#import relevant Libraries 
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 


# Apply seaborn's default theme to subsequent matplotlib plots.
sns.set()

In [2]:
# Load the raw insurance dataset; the path is relative to the notebook's working directory.
raw_data = pd.read_csv('Datasets/InsuranceData.csv')

# Preview the first rows to see which columns exist and how their values are encoded.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Customer Id,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
0,0,H14663,2013,1.0,0,N,V,V,U,290.0,1,1960.0,.,1053,0.0
1,1,H2037,2015,1.0,0,V,N,O,R,490.0,1,1850.0,4,1053,0.0
2,2,H3802,2014,1.0,0,N,V,V,U,595.0,1,1960.0,.,1053,0.0
3,3,H3834,2013,1.0,0,V,V,V,U,2840.0,1,1960.0,.,1053,0.0
4,4,H5053,2014,1.0,0,V,N,O,R,680.0,1,1800.0,3,1053,0.0


In [3]:
# Build the working frame from a copy of raw_data (which stays untouched),
# leaving out the saved index column and the customer identifier,
# and ending with a fresh 0..n-1 index.
data = (
    raw_data
    .copy()
    .drop(columns=['Unnamed: 0', 'Customer Id'])
    .reset_index(drop=True)
)

In [4]:
# Look at the last rows; in the displayed output they have no Building Dimension,
# Geo_Code or Claim value.
data.tail()

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
10224,2015,1.0,0,V,V,V,U,,4,1900.0,.,,
10225,2012,1.0,0,V,V,V,U,,2,1948.0,.,,
10226,2012,1.0,0,V,V,V,U,,2,1993.0,.,,
10227,2013,1.0,0,V,V,V,U,,1,1800.0,.,,
10228,2012,1.0,0,V,V,V,U,,2,1950.0,.,,


In [5]:
# Summary statistics for every column, numeric and non-numeric alike.
data.describe(include='all')

Unnamed: 0,YearOfObservation,Insured_Period,Residential,Building_Painted,Building_Fenced,Garden,Settlement,Building Dimension,Building_Type,Date_of_Occupancy,NumberOfWindows,Geo_Code,Claim
count,10229.0,10229.0,10229.0,10229,10229,10218,10229,10110.0,10229.0,8993.0,10229,10114.0,7160.0
unique,,,,2,2,2,2,,,,11,1525.0,
top,,,,V,V,V,U,,,,.,6088.0,
freq,,,,7066,5792,5791,5790,,,,5791,238.0,
mean,2013.646789,0.913672,0.281064,,,,,1818.152918,2.238049,1965.061715,,,0.228212
std,1.38066,0.233806,0.44954,,,,,2272.111284,0.961589,33.548619,,,0.419709
min,2012.0,0.0,0.0,,,,,1.0,1.0,1545.0,,,0.0
25%,2012.0,1.0,0.0,,,,,500.0,2.0,1960.0,,,0.0
50%,2013.0,1.0,0.0,,,,,1002.0,2.0,1970.0,,,0.0
75%,2015.0,1.0,1.0,,,,,2190.0,3.0,1980.0,,,0.0


In [6]:
# Column dtypes and non-null counts, to see which columns need imputing or type conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10229 entries, 0 to 10228
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10229 non-null  int64  
 1   Insured_Period      10229 non-null  float64
 2   Residential         10229 non-null  int64  
 3   Building_Painted    10229 non-null  object 
 4   Building_Fenced     10229 non-null  object 
 5   Garden              10218 non-null  object 
 6   Settlement          10229 non-null  object 
 7   Building Dimension  10110 non-null  float64
 8   Building_Type       10229 non-null  int64  
 9   Date_of_Occupancy   8993 non-null   float64
 10  NumberOfWindows     10229 non-null  object 
 11  Geo_Code            10114 non-null  object 
 12  Claim               7160 non-null   float64
dtypes: float64(4), int64(3), object(6)
memory usage: 1.0+ MB


In [7]:
# Count the missing values in each column.
data.isna().sum()

YearOfObservation        0
Insured_Period           0
Residential              0
Building_Painted         0
Building_Fenced          0
Garden                  11
Settlement               0
Building Dimension     119
Building_Type            0
Date_of_Occupancy     1236
NumberOfWindows          0
Geo_Code               115
Claim                 3069
dtype: int64

In [8]:
## Data Statistics
# Work out the figures first, then report them.
nan_percentages = data.isna().sum() / len(data) * 100
duplicate_rows = data.duplicated().sum()

print(f'Data Shape : \t {data.shape} \n')
print(f'% Of NAN Values :\n \n{nan_percentages}\n')
print(f'Sum Of Duplicate Values :\t {duplicate_rows}')

Data Shape : 	 (10229, 13) 

% Of NAN Values :
 
YearOfObservation      0.000000
Insured_Period         0.000000
Residential            0.000000
Building_Painted       0.000000
Building_Fenced        0.000000
Garden                 0.107537
Settlement             0.000000
Building Dimension     1.163359
Building_Type          0.000000
Date_of_Occupancy     12.083293
NumberOfWindows        0.000000
Geo_Code               1.124255
Claim                 30.002933
dtype: float64

Sum Of Duplicate Values :	 34


Note: Dealing with missing values
1. Remove duplicate rows.
2. Replace the null values with the mode for categorical columns (and for `Date_of_Occupancy` and `Claim`), and with the mean for `Building Dimension`.


In [9]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
# The surviving rows keep their original index labels.
data = data.drop_duplicates()

In [19]:
# Most frequent value of the 'Garden' column.
data['Garden'].mode()

0    V
Name: Garden, dtype: object

In [20]:
# Mean of the 'Building Dimension' column (this numeric column uses the mean, not the mode).
data['Building Dimension'].mean()

1819.6649141950145

In [21]:
# Most frequent value of the 'Date_of_Occupancy' column.
data['Date_of_Occupancy'].mode()

0    1960.0
Name: Date_of_Occupancy, dtype: float64

In [23]:
# Most frequent value of the 'Geo_Code' column (the column is object-typed, so this is a string).
data["Geo_Code"].mode()

0    6088
Name: Geo_Code, dtype: object

In [22]:
# Most frequent value of the 'Claim' column.
data['Claim'].mode()

0    0.0
Name: Claim, dtype: float64

In [28]:
# Per-column fill values for missing data, computed from `data` instead of being
# pasted in from earlier cell outputs, so they stay correct when the notebook is re-run.
# Categorical columns (plus 'Date_of_Occupancy' and 'Claim') use the mode;
# 'Building Dimension' uses the mean.
values = {
    "Garden": data["Garden"].mode()[0],
    "Building Dimension": data["Building Dimension"].mean(),
    "Date_of_Occupancy": data["Date_of_Occupancy"].mode()[0],
    # 'Geo_Code' holds strings (e.g. '1053', '2B037'); taking the mode from the column
    # keeps the fill value a string rather than mixing an int into it.
    "Geo_Code": data["Geo_Code"].mode()[0],
    "Claim": data["Claim"].mode()[0],
}

In [29]:
# Impute every missing value in one pass using the per-column entries of `values`:
# the mode for 'Garden', 'Date_of_Occupancy', 'Geo_Code' and 'Claim',
# and the mean for 'Building Dimension'.
# NOTE(review): 'Claim' looks like the target column and about 30% of it is missing;
# confirm that filling it with the mode is intended.

data = data.fillna(value=values)

Check for inconsistency in the data 

In [31]:
# Re-check dtypes and non-null counts now that missing values have been filled.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10195 entries, 0 to 10227
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   YearOfObservation   10195 non-null  int64  
 1   Insured_Period      10195 non-null  float64
 2   Residential         10195 non-null  int64  
 3   Building_Painted    10195 non-null  object 
 4   Building_Fenced     10195 non-null  object 
 5   Garden              10195 non-null  object 
 6   Settlement          10195 non-null  object 
 7   Building Dimension  10195 non-null  float64
 8   Building_Type       10195 non-null  int64  
 9   Date_of_Occupancy   10195 non-null  float64
 10  NumberOfWindows     10195 non-null  object 
 11  Geo_Code            10195 non-null  object 
 12  Claim               10195 non-null  float64
dtypes: float64(4), int64(3), object(6)
memory usage: 1.1+ MB


In [32]:
# Distinct observation years (the output lists 2012 through 2016).
data['YearOfObservation'].unique()

array([2013, 2015, 2014, 2012, 2016], dtype=int64)

In [33]:
# Distinct insured-period values; the output shows many fractional values alongside 0 and 1.
data['Insured_Period'].unique()

array([1.        , 0.24109589, 0.84383562, 0.95628415, 0.99726776,
       0.08493151, 0.85753425, 0.29589041, 0.        , 0.78142077,
       0.91506849, 0.98630137, 0.16164384, 0.99726027, 0.09589041,
       0.66575343, 0.89315069, 0.02459016, 0.58082192, 0.83287671,
       0.26027397, 0.7479452 , 0.41803279, 0.10410959, 0.76986301,
       0.02465753, 0.47671233, 0.86575342, 0.55464481, 0.16393443,
       0.44808743, 0.50273224, 0.75136612, 0.08196721, 0.46849315,
       0.79726027, 0.31506849, 0.59452055, 0.74863388, 0.17534247,
       0.11780822, 0.12054795, 0.49589041, 0.91530055, 0.48087432,
       0.78082192, 0.02191781, 0.49453552, 0.2431694 , 0.24863388,
       0.69589041, 0.94794521, 0.91256831, 0.50410959, 0.82739726,
       0.52054795, 0.33333333, 0.58196721, 0.86885246, 0.57923497,
       0.83606557, 0.66393443, 0.98356164, 0.24657534, 0.99453552,
       0.74590164, 0.79178082, 0.19452055, 0.96721311, 0.41643836,
       0.95342466, 0.41369863, 0.25205479, 0.80273973, 0.98907

In [34]:
# Distinct values of 'Residential' (a 0/1 flag in the output).
data['Residential'].unique()

array([0, 1], dtype=int64)

In [35]:
# Distinct codes used in 'Building_Painted' ('N' and 'V' in the output).
data['Building_Painted'].unique()

array(['N', 'V'], dtype=object)

In [36]:
# Distinct codes used in 'Building_Fenced' ('V' and 'N' in the output).
data['Building_Fenced'].unique()

array(['V', 'N'], dtype=object)

In [37]:
# Distinct codes used in 'Garden' ('V' and 'O' in the output).
data['Garden'].unique()

array(['V', 'O'], dtype=object)

In [38]:
# Distinct codes used in 'Settlement' ('U' and 'R' in the output).
data['Settlement'].unique()

array(['U', 'R'], dtype=object)

In [39]:
# Distinct values of 'Building Dimension' (numeric; the output is abbreviated).
data['Building Dimension'].unique()

array([ 290.,  490.,  595., ..., 3530., 1888., 2575.])

In [40]:
# Distinct building-type codes (1 to 4 in the output).
data['Building_Type'].unique()

array([1, 2, 4, 3], dtype=int64)

In [41]:
# Distinct occupancy years; they are stored as floats and go back as far as 1545 in the output.
data['Date_of_Occupancy'].unique()

array([1960., 1850., 1800., 1980., 1988., 2013., 2011., 1550., 1900.,
       2007., 1970., 1950., 1700., 1976., 1940., 1920., 1982., 1972.,
       2010., 1971., 1995., 2006., 1999., 1969., 1985., 1965., 1981.,
       1975., 1997., 2003., 1990., 1930., 1973., 1991., 1974., 1978.,
       1967., 1927., 1952., 1957., 2009., 1977., 1890., 1949., 1951.,
       2008., 1987., 2000., 1962., 1956., 1993., 1912., 1983., 1954.,
       1968., 2004., 1936., 1958., 1966., 1953., 1979., 1946., 1955.,
       2001., 1870., 1961., 2015., 1613., 1964., 1984., 2014., 2002.,
       1986., 1992., 2005., 2012., 1910., 1945., 1864., 1942., 1939.,
       1934., 1998., 1908., 1948., 1860., 1926., 1938., 1895., 1545.,
       1959., 1989., 1996., 1963., 1840., 2016., 1718., 1600., 1875.,
       1898., 1880., 1915., 1994., 1810., 1937., 1907., 1931., 1925.,
       1935., 1824., 1914., 1750., 1846., 1903., 1905., 1906., 1830.,
       1923., 1924., 1928., 1947., 1911., 1901., 1902., 1904., 1919.,
       1896., 1913.,

In [42]:
# Distinct values of 'NumberOfWindows'; the output shows strings, including the
# non-numeric entries '   .' and '>=10' that block a direct integer cast.
data['NumberOfWindows'].unique()

array(['   .', '4', '3', '2', '5', '>=10', '6', '7', '9', '8', '1'],
      dtype=object)

In [43]:
# Distinct geo codes; the output shows strings, some of them alphanumeric (e.g. '2B037').
data['Geo_Code'].unique()

array(['1053', '1143', '1160', ..., '2B037', '2B298', '2B309'],
      dtype=object)

In [44]:
# Distinct values of 'Claim' (0 and 1 in the output).
data['Claim'].unique()

array([0., 1.])

Dealing with inconsistencies

In [45]:
# Type Conversion
data['Insured_Period'] = pd.to_numeric(data['Insured_Period'])

# Filtering: one boolean mask per bucket (a missing period would match neither),
# plus the filtered frames under their original names.
short_period = data['Insured_Period'] <= 0.50
long_period = data['Insured_Period'] > 0.50
dfLess50Percent = data[short_period]
dfOver50Percent = data[long_period]

# Values Replacement: label each row straight from its mask. The labels are written
# through data.loc so the frame itself is updated; calling replace(..., inplace=True)
# on a selected column is not guaranteed to write back to the frame, and matching
# every float value one by one is unnecessary when the condition is already known.
data['Insured_Period'] = data['Insured_Period'].astype(object)
data.loc[short_period, 'Insured_Period'] = "0 - 5 Months"
data.loc[long_period, 'Insured_Period'] = "6 Months - 1 Year"

# Unique Values
data['Insured_Period'].unique()

array(['6 Months - 1 Year', '0 - 5 Months'], dtype=object)

In [58]:
# Replace the inconsistent '   .' entries (presumably a placeholder for an unrecorded
# count) with 4, the most frequent real value, and the open-ended '>=10' bucket with 10,
# the highest value recorded; then convert the column to int64.
# The result is assigned back to the frame: replace(..., inplace=True) on a selected
# column is not guaranteed to update `data`, in which case the integer cast would fail.
data['NumberOfWindows'] = (
    data['NumberOfWindows']
    .replace({'   .': 4, '>=10': 10})
    .astype('int64')
)

In [62]:
# Change dtype to 'int64' for the columns that hold whole numbers but were read as floats.
# Casting truncates any fractional part, so the imputed 'Building Dimension' mean is
# stored as 1819.
# (The stray `data['']` lookup that used to end this cell raised a KeyError and was removed.)
data = data.astype({
    'Date_of_Occupancy': 'int64',
    'Claim': 'int64',
    'Building Dimension': 'int64',
})

In [64]:
# Readable labels for the single-letter codes, keyed by column:
#   Building_Painted: N - painted,    V - not painted
#   Garden:           V - has garden, O - no garden
#   Building_Fenced:  N - fenced,     V - not fenced
#   Settlement:       R - rural area, U - urban area
code_labels = {
    'Building_Painted': {'N': 'Painted', 'V': 'Not Painted'},
    'Garden': {'V': 'Has Garden', 'O': 'No Garden'},
    'Building_Fenced': {'N': 'Fenced', 'V': 'Not Fenced'},
    'Settlement': {'R': 'Rural area', 'U': 'Urban area'},
}

# Assign each relabelled column back to the frame; replace(..., inplace=True) on a
# selected column is not guaranteed to update `data`.
for column, labels in code_labels.items():
    data[column] = data[column].replace(labels)

In [68]:
# Give the columns space-separated display names.
display_names = {
    'Insured_Period': 'Insurance Period',
    'Building_Type': 'Building Type',
    'Building_Painted': 'Building Painted',
    'Building_Fenced': 'Building Fenced',
    'NumberOfWindows': 'Number of Windows',
    'Geo_Code': 'Geo Code',
    'Date_of_Occupancy': 'Date Of Occupancy',
    'YearOfObservation': 'Year of Observation',
}
data = data.rename(columns=display_names)

In [74]:
# Snapshot the cleaned frame under its own name so later edits to `data` do not affect it.
Insurance_cleaned = data.copy()

In [76]:
# Write the cleaned data without the row index; otherwise the index is saved as an
# extra unnamed column that shows up as 'Unnamed: 0' when the file is read back.
Insurance_cleaned.to_csv('Insurance_cleaned.csv', index=False)

In [77]:
# Final look at the cleaned frame (the notebook display abbreviates long frames).
data

Unnamed: 0,Year of Observation,Insurance Period,Residential,Building Painted,Building Fenced,Garden,Settlement,Building Dimension,Building Type,Date Of Occupancy,Number of Windows,Geo Code,Claim
0,2013,6 Months - 1 Year,0,Painted,Not Fenced,Has Garden,Urban area,290,1,1960,4,1053,0
1,2015,6 Months - 1 Year,0,Not Painted,Fenced,No Garden,Rural area,490,1,1850,4,1053,0
2,2014,6 Months - 1 Year,0,Painted,Not Fenced,Has Garden,Urban area,595,1,1960,4,1053,0
3,2013,6 Months - 1 Year,0,Not Painted,Not Fenced,Has Garden,Urban area,2840,1,1960,4,1053,0
4,2014,6 Months - 1 Year,0,Not Painted,Fenced,No Garden,Rural area,680,1,1800,3,1053,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10223,2012,6 Months - 1 Year,0,Not Painted,Not Fenced,Has Garden,Urban area,1819,4,1900,4,6088,0
10224,2015,6 Months - 1 Year,0,Not Painted,Not Fenced,Has Garden,Urban area,1819,4,1900,4,6088,0
10225,2012,6 Months - 1 Year,0,Not Painted,Not Fenced,Has Garden,Urban area,1819,2,1948,4,6088,0
10226,2012,6 Months - 1 Year,0,Not Painted,Not Fenced,Has Garden,Urban area,1819,2,1993,4,6088,0
