In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox
pd.options.display.max_rows = 100
## Install xlrd package to load Excel files
#!conda install openpyxl
#!conda install xlrd

In [4]:
# Load the file created during activity 1 and call info() to list each column's
# dtype and non-null count.

df = pd.read_csv('ca_df.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9135 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      9134 non-null   object 
 1   Gender                     9012 non-null   object 
 2   Education                  9134 non-null   object 
 3   Customer Lifetime Value    9127 non-null   float64
 4   Income                     9134 non-null   float64
 5   Monthly Premium Auto       9134 non-null   float64
 6   Number of Open Complaints  9134 non-null   float64
 7   Policy Type                9134 non-null   object 
 8   Vehicle Class              9134 non-null   object 
 9   Total Claim Amount         9134 non-null   float64
dtypes: float64(5), object(5)
memory usage: 713.8+ KB


#### Replacing null values – Replace missing values with the mean of the column (numerical columns only). Note that some features, such as Income, contain 0s that are equivalent to null values. (We assume that no customer genuinely has an income of 0, so a 0 denotes a missing value.) Hint: numpy.nan is of float64 data type.

In [5]:
df

Unnamed: 0,State,Gender,Education,Customer Lifetime Value,Income,Monthly Premium Auto,Number of Open Complaints,Policy Type,Vehicle Class,Total Claim Amount
0,Washington,,Master,,0.0,1000.0,0.0,Personal Auto,Four-Door Car,2.704934
1,Arizona,Female,Bachelor,697954.0,0.0,94.0,0.0,Personal Auto,Four-Door Car,1131.464935
2,Nevada,Female,Bachelor,1288743.0,48767.0,108.0,0.0,Personal Auto,Two-Door Car,566.472247
3,California,Male,Bachelor,764586.0,0.0,106.0,0.0,Corporate Auto,SUV,529.881344
4,Washington,Male,High School or Below,536308.0,36357.0,68.0,0.0,Personal Auto,Four-Door Car,17.269323
...,...,...,...,...,...,...,...,...,...,...
9130,California,Male,Bachelor,23406.0,71941.0,73.0,0.0,Personal Auto,Four-Door Car,198.234764
9131,California,Female,College,3097.0,21604.0,79.0,0.0,Corporate Auto,Four-Door Car,379.200000
9132,California,Male,Bachelor,8164.0,0.0,85.0,3.0,Corporate Auto,Four-Door Car,790.784983
9133,California,Male,College,7524.0,21941.0,96.0,0.0,Personal Auto,Four-Door Car,691.200000


In [6]:
# let's set up a function that replaces null or 0 values with the mean of the column

def Replace_null_values(Column_name):
    """Fill missing values in one numeric column of the global ``df`` with the column mean.

    Zeros are treated as missing: they are converted to NaN first, so they
    neither contribute to the mean nor survive in the result.

    Parameters
    ----------
    Column_name : str
        Label of the numeric column in ``df`` to clean.

    Returns
    -------
    pandas.Series
        The cleaned column, as stored back in ``df``.
    """
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df[Column_name]: that chained in-place form acts on an intermediate Series
    # and leaves df untouched when pandas copy-on-write is active.
    without_zeros = df[Column_name].replace(0, np.nan)
    # mean() skips NaN, so the fill value is the mean of the real observations only.
    df[Column_name] = without_zeros.fillna(without_zeros.mean())
    return df[Column_name]

In [7]:
# 'Number of Open Complaints' is left out on purpose: 0 is a legit value there.
for zero_as_missing_column in ('Customer Lifetime Value', 'Income', 'Monthly Premium Auto'):
    Replace_null_values(zero_as_missing_column)

# Kept as the cell's final expression so the cleaned column is still displayed.
Replace_null_values('Total Claim Amount')

0          2.704934
1       1131.464935
2        566.472247
3        529.881344
4         17.269323
           ...     
9130     198.234764
9131     379.200000
9132     790.784983
9133     691.200000
9134     369.600000
Name: Total Claim Amount, Length: 9135, dtype: float64

In [8]:
# Round every numeric value to the nearest whole number for readability.
# Note: round() rounds to nearest (0 decimals by default); it does not round up.

df = df.round()

#### Bucketing the data – Write a function that replaces the values of the column "State" with zones: California becomes West Region, Oregon becomes North West, Washington becomes East, and Arizona and Nevada become Central.


In [9]:
df.State.unique()

array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',
       nan], dtype=object)

In [11]:
# function that replaces an old value with a new one in a specific column

def df_old_new(to_replace, value, column_name):
    """Replace occurrences of ``to_replace`` with ``value`` in one column of the global ``df``.

    Parameters
    ----------
    to_replace : scalar, list or dict
        Existing value(s) to look for; passed straight to ``Series.replace``.
    value : scalar
        Replacement written in their place.
    column_name : str
        Label of the column in ``df`` to update.

    Returns
    -------
    None
        ``df`` is updated as a side effect.
    """
    # Write the replaced column back explicitly. Calling replace(..., inplace=True)
    # on df[column_name] acts on an intermediate Series and does not reach df
    # when pandas copy-on-write is active.
    df[column_name] = df[column_name].replace(to_replace, value)

In [12]:
# Bucket every state label (including the 'Cali' spelling variant) into its zone.
zone_by_state = {
    'California': 'West Region',
    'Cali': 'West Region',
    'Oregon': 'North West',
    'Washington': 'East',
    'Arizona': 'Central',
    'Nevada': 'Central',
}
for state_label, zone_label in zone_by_state.items():
    df_old_new(state_label, zone_label, 'State')



In [13]:
df.State.unique()

array(['East', 'Central', 'West Region', 'North West', nan], dtype=object)

#### (Optional) In the column Vehicle Class, merge the two categories Luxury SUV and Luxury Car into one category named Luxury Vehicle

In [14]:
df['Vehicle Class'].unique()

array(['Four-Door Car', 'Two-Door Car', 'SUV', 'Luxury SUV', 'Sports Car',
       'Luxury Car', nan], dtype=object)

In [15]:
# Reuse df_old_new to collapse both luxury categories of the Vehicle Class column
# into a single label.
for luxury_class in ('Luxury SUV', 'Luxury Car'):
    df_old_new(luxury_class, 'Luxury Vehicle', 'Vehicle Class')

In [16]:
df['Vehicle Class'].unique()


array(['Four-Door Car', 'Two-Door Car', 'SUV', 'Luxury Vehicle',
       'Sports Car', nan], dtype=object)

#### (Optional) Removing outliers using 1.5*IQR technique for all numerical columns.

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# This function establishes lower and upper bounds using the 1.5*IQR technique.
# The bounds are then applied to the numeric column to identify outliers.
# The outliers are subsequently dropped from the dataframe.

def outlier_treatment(datacolumn, dataframe):
    """Drop, in place, the rows of ``dataframe`` whose ``datacolumn`` value is a 1.5*IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR``, where Q1/Q3 are the 25th/75th percentiles of
    ``datacolumn`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    datacolumn : pandas.Series
        Numeric column used to compute the bounds and flag rows; it is used as
        a boolean row selector on ``dataframe``.
    dataframe : pandas.DataFrame
        Frame the flagged rows are removed from (modified in place).

    Returns
    -------
    None
        The result is the mutated ``dataframe``; nothing is returned.
    """
    # nanpercentile ignores missing values. Plain np.percentile yields NaN bounds
    # as soon as the column contains a NaN, which silently flags no row at all.
    q1, q3 = np.nanpercentile(datacolumn, [25, 75])
    iqr = q3 - q1
    lower_range = q1 - (1.5 * iqr)
    upper_range = q3 + (1.5 * iqr)
    # Comparisons against NaN are False, so rows with a missing value are kept.
    is_outlier = (datacolumn < lower_range) | (datacolumn > upper_range)
    dataframe.drop(dataframe[is_outlier].index, inplace=True)


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8688 entries, 0 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      8687 non-null   object 
 1   Gender                     8567 non-null   object 
 2   Education                  8687 non-null   object 
 3   Customer Lifetime Value    8688 non-null   float64
 4   Income                     8688 non-null   float64
 5   Monthly Premium Auto       8688 non-null   float64
 6   Number of Open Complaints  8687 non-null   float64
 7   Policy Type                8687 non-null   object 
 8   Vehicle Class              8687 non-null   object 
 9   Total Claim Amount         8688 non-null   float64
dtypes: float64(5), object(5)
memory usage: 746.6+ KB


In [None]:
# Apply outlier_treatment to the numeric columns one after another; each pass
# reads the column from df as already trimmed by the previous passes.
# 'Number of Open Complaints' is skipped (its call was commented out).
for numeric_column in ('Customer Lifetime Value', 'Income', 'Monthly Premium Auto', 'Total Claim Amount'):
    outlier_treatment(df[numeric_column], df)

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8466 entries, 2 to 9134
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      8465 non-null   object 
 1   Gender                     8351 non-null   object 
 2   Education                  8465 non-null   object 
 3   Customer Lifetime Value    8466 non-null   float64
 4   Income                     8466 non-null   float64
 5   Monthly Premium Auto       8466 non-null   float64
 6   Number of Open Complaints  8465 non-null   float64
 7   Policy Type                8465 non-null   object 
 8   Vehicle Class              8465 non-null   object 
 9   Total Claim Amount         8466 non-null   float64
dtypes: float64(5), object(5)
memory usage: 727.5+ KB


In [None]:
# Dropping rows left gaps in the index, so renumber it from 0 and discard the old labels.

df = df.reset_index(drop=True)

#### (Optional) Standardizing the data – Use string functions to standardize the text data (lower case)

In [None]:
# the function goes through the dataframe and makes all str objects lowercase.

def lower_case_text(dataframe):
    """Return a copy of ``dataframe`` in which every string value is lower-cased.

    Non-string values (numbers, NaN, ...) are passed through unchanged. The
    input frame is not modified; callers must keep the returned frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose string cells should be standardized.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``dataframe``.
    """

    def _lower_if_str(value):
        # Only str instances have a meaningful .lower(); leave everything else as is.
        return value.lower() if isinstance(value, str) else value

    # Map column by column with Series.map rather than DataFrame.applymap, which
    # is deprecated in recent pandas releases; Series.map exists in old and new versions.
    return dataframe.apply(lambda column: column.map(_lower_if_str))

# lower_case_text returns a new frame and leaves its argument untouched, so bind
# the result back to df; otherwise the lowercase data is only displayed and lost.
df = lower_case_text(df)
df.head()