In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

In [2]:
# Load the raw customer dataset straight from GitHub (requires network access).
df= pd.read_csv('https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv')

In [3]:
column_names = df.columns
df.shape

(4008, 11)

In [8]:
def cleaning(df):
    """Return a cleaned copy of the raw customer DataFrame.

    The caller's frame is never modified, so the result must be used via the
    return value. The function can safely be run again on its own output.

    Steps, in order:
      1. Drop rows in which every field is missing.
      2. Lower-case the column names, replace spaces with underscores and
         rename ``st`` to ``state``.
      3. Normalise the labels in ``gender``, ``state``, ``education`` and
         ``vehicle_class``.
      4. Convert ``customer_lifetime_value`` (strings ending in ``%``) and
         ``number_of_open_complaints`` (strings such as ``1/0/00``, of which
         the middle field is kept) to floats.
      5. Fill remaining gaps: column mean for numeric columns, first mode for
         object columns.
      6. Drop duplicate rows (keeping the first) and reset the index.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing the columns named above (any letter case,
        spaces allowed in the names).

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame with a fresh 0..n-1 index.
    """
    # Fully empty rows carry no information. Dropping them before imputation
    # stops them from being filled with means/modes and surviving as one
    # fabricated customer. The explicit copy keeps the caller's frame intact.
    df = df.dropna(how='all').copy()

    # Normalise the headers.
    df.columns = [name.lower().replace(' ', '_') for name in df.columns]
    df = df.rename(columns={'st': 'state'})

    # Gender: match whole values case-insensitively. Chained substring
    # replacement is order-dependent (e.g. 'Female' would end up as 'FM').
    gender_map = {'f': 'F', 'femal': 'F', 'female': 'F', 'm': 'M', 'male': 'M'}
    gender_key = df['gender'].str.strip().str.lower()
    # Labels that are not in the map are kept unchanged.
    df['gender'] = gender_key.map(gender_map).fillna(df['gender'])

    # Whole-value replacement: 'Cali' must not rewrite the inside of 'California'.
    df['state'] = df['state'].replace(
        {'AZ': 'Arizona', 'Cali': 'California', 'WA': 'Washington'}
    )
    df['education'] = df['education'].replace({'Bachelors': 'Bachelor'})
    df['vehicle_class'] = df['vehicle_class'].replace(
        {'Sports Car': 'Luxury', 'Luxury SUV': 'Luxury', 'Luxury Car': 'Luxury'}
    )

    # Customer lifetime value arrives as text with a trailing '%'.
    # The dtype guard lets already-numeric (previously cleaned) data through.
    lifetime_value = df['customer_lifetime_value']
    if not pd.api.types.is_numeric_dtype(lifetime_value):
        lifetime_value = lifetime_value.str.rstrip('%')
    df['customer_lifetime_value'] = pd.to_numeric(
        lifetime_value, errors='coerce'
    ).astype(float)

    # Open complaints arrive as '1/<count>/00'; keep the middle field.
    # Entries without a '/' are kept as they are instead of being nulled.
    complaints = df['number_of_open_complaints']
    if not pd.api.types.is_numeric_dtype(complaints):
        complaints = complaints.str.split('/').str[1].fillna(complaints)
    df['number_of_open_complaints'] = pd.to_numeric(
        complaints, errors='coerce'
    ).astype(float)

    # Impute what is still missing: mean for numbers, first mode for text.
    numeric_cols = df.select_dtypes(include=['number']).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
    object_cols = df.select_dtypes(include=['object']).columns
    modes = df[object_cols].mode()
    if not modes.empty:  # mode() has no rows when there is nothing to summarise
        df[object_cols] = df[object_cols].fillna(modes.iloc[0])

    # Keep the first occurrence of each duplicated row so no extra data is lost.
    return df.drop_duplicates(keep='first').reset_index(drop=True)
# cleaning() returns the cleaned frame, so bind it back to `df`; every cell
# below reads `df`. (Previously the return value was discarded and the CSV
# was written before cleaning, so the file contained the raw data.)
df = cleaning(df)

# Step 6: Save the cleaned dataset to a new CSV file
df.to_csv('cleaned_dataset.csv', index=False)

# Show a preview of the cleaned DataFrame
print("Cleaned DataFrame:")
df.head()

Cleaned DataFrame:


number_of_open_complaints
1/0/00    830
1/1/00    138
1/2/00     50
1/3/00     34
1/4/00     13
1/5/00      6
Name: count, dtype: int64

In [None]:
def roundforme(x):
    """Round ``x`` to two decimal places."""
    return round(x, 2)


# Transposed describe() table with every statistic rounded to two decimals.
summary = df.describe().T.apply(lambda column: column.map(roundforme))
summary

In [None]:
df.describe().T

In [None]:
# Replace implausibly large monthly premiums with a fixed fallback value.
# A boolean mask with .loc selects rows by label; the previous loop fed
# enumerate() positions to df.at, which is only correct for a 0..n-1 index.
PREMIUM_OUTLIER_THRESHOLD = 400
PREMIUM_REPLACEMENT = 193.23436  # NOTE(review): presumably the column mean - confirm

outlier_mask = df['monthly_premium_auto'] > PREMIUM_OUTLIER_THRESHOLD
df.loc[outlier_mask, 'monthly_premium_auto'] = PREMIUM_REPLACEMENT
df['monthly_premium_auto'].unique()

In [None]:
df.describe(include='object').T

In [None]:
import scipy.stats as stats

In [None]:
df['gender'].value_counts()

In [None]:
#!pip3 install seaborn if you need to
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
# Task 3: Show a plot of the Gender breakdown
gender_counts = df['gender'].value_counts()
gender_counts.plot(kind='bar', color=['blue', 'pink'])
plt.title('Gender Breakdown')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
#The number of females is higher than males with over 620 F comparing to 452 M

In [None]:
# Bar chart of income per state.
ax = sns.barplot(data=df, x="state", y="income")

# Use a smaller font for the state tick labels.
plt.xticks(fontsize=8)
ax.set_title('Income by state')
ax.set_xlabel('State')
ax.set_ylabel('Income')
plt.show()
# Customers in Washington have the highest income and customers in Nevada the lowest.

In [None]:
df['state'].value_counts()

In [None]:
# Bar chart of total claim amount per policy type.
ax = sns.barplot(data=df, x="policy_type", y="total_claim_amount")

# Set the font size of the policy-type tick labels.
plt.xticks(fontsize=10)
ax.set_title('Policy type by total claim amount')
ax.set_xlabel('Policy type')
ax.set_ylabel('Total claim amount')
plt.show()
# Corporate Auto policies have the highest total claim amount and Special Auto the lowest.

In [None]:
# Count of customers per education level, split by vehicle class.
fig, ax = plt.subplots(figsize=(10, 6))  # adjust the figure size as needed
sns.countplot(data=df, x="education", hue="vehicle_class", ax=ax)
ax.set_title('Education by Vehicle Class')
ax.set_xlabel('Education')
ax.set_ylabel('Count')
plt.xticks(rotation=45)  # rotate the x-axis labels for better readability

# Place the legend outside the axes and keep labels from being clipped.
ax.legend(title='Vehicle Class', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

# Whatever their education level, most customers drive four-door cars: Bachelor
# holders have the highest count (over 175) and Doctors the lowest (about 20).
# Luxury cars are rare at every education level.

In [None]:
# What other plots would be beneficial?
# state vs total_claim_amount to see which states claim the most
# education vs income to see which education levels earn more
# gender vs income to look at the income gap
# gender vs vehicle_class to see how vehicle choice differs by gender
# gender vs total_claim_amount to see which gender claims the most

In [None]:
# Split the columns by dtype. `numerical` feeds the histogram loops and the
# correlation matrix below; `categorical` holds the object (string) columns.
numerical = df.select_dtypes(include=np.number)
categorical = df.select_dtypes(include=object)

In [None]:
numerical.columns

In [None]:
# One seaborn histogram (with a KDE curve) per numeric column.
for column_name in numerical.columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=df, x=column_name, kde=True, color='skyblue', bins=30, ax=ax)
    ax.set_title(f'Distribution of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# The same distributions again, drawn with plain matplotlib histograms.
for column_name in numerical.columns:
    fig, ax = plt.subplots(figsize=(8, 6))  # adjust figure size as needed
    ax.hist(df[column_name], bins=30, color='skyblue', edgecolor='black')
    ax.set_title(f'Histogram of {column_name}')
    ax.set_xlabel(column_name)
    ax.set_ylabel('Count')
    ax.grid(axis='y', alpha=0.75)
    plt.show()

==> Total claim amount and monthly premium auto both look roughly bell-shaped but skewed to the right (a long tail of large values), so they are not strictly normal.

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated
# heatmap (annot=True prints each coefficient inside its cell).
correlations_matrix = numerical.corr()
sns.heatmap(correlations_matrix, annot=True)
plt.show()

==> it looks fine to me


In [None]:
# Separate the target from the features.
# NOTE(review): the X and y defined here are not used by the cells below;
# both are rebuilt from X1 before the train/test split.
y = df['total_claim_amount']
X = df.drop(['total_claim_amount'], axis=1)

# NOTE(review): both selections read from df, not X, so X_num still contains
# the target column total_claim_amount (assuming it is numeric in df - confirm).
# The later cells rely on that when they take y from the scaled frame.
X_num = df.select_dtypes(include = np.number)
X_cat = df.select_dtypes(include = object)

In [None]:
X_num

In [None]:
# Categorical features for one-hot encoding, without the "customer" column
# (presumably a per-row identifier - confirm). Rebuilding X_cat from df,
# not dropping in place, keeps this cell safe to re-run: an in-place drop
# raises KeyError the second time because the column is already gone.
X_cat = df.select_dtypes(include=object).drop(columns="customer")
X_cat.head()

In [None]:
# Standard scaling: rescale every numeric column to mean 0 and std 1.
# NOTE(review): the scaler is fitted on all rows of X_num before the
# train/test split further down - confirm this is acceptable for the exercise.
from sklearn.preprocessing import StandardScaler
transformer = StandardScaler()
scaled_values = transformer.fit_transform(X_num)
print(scaled_values.shape)
X_standardized = pd.DataFrame(data=scaled_values, columns=X_num.columns)

In [None]:
# One-hot encoding turns each categorical variable into several 0/1 columns.
# drop='first' omits the first category of every variable.
encoder = OneHotEncoder(drop='first').fit(X_cat)
print(encoder.categories_)
encoded = encoder.transform(X_cat).toarray()
print(encoded.shape)

# Name the columns after the categories that were kept. Deriving them from
# the fitted encoder (everything after the dropped first category) avoids a
# hard-coded list that raises as soon as any category appears or disappears.
onehot_columns = [
    category
    for feature_categories in encoder.categories_
    for category in feature_categories[1:]
]
onehot_encoded = pd.DataFrame(encoded, columns=onehot_columns)
onehot_encoded.head(20)

In [None]:
# let's merge all this information together into a single dataset with all features
# Both frames were built from NumPy arrays without an explicit index, so they
# share a default 0..n-1 index and concat(axis=1) pairs them up row by row.
X1 = pd.concat([X_standardized, onehot_encoded ], axis=1)  # np.concatenate()
X1

In [None]:
# Target: the scaled total_claim_amount column of X1; features: everything else.
y = X1['total_claim_amount']
X = X1.drop(columns=['total_claim_amount'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape

In [None]:
# Shapes of the four pieces, in the order X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
#we train/fit our model 
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [None]:
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

In [None]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

In [None]:
mse=mean_squared_error(y_test,predictions_test)
mse

In [None]:
# Root mean squared error on the test set, in the units of the scaled target.
# Computed once, directly from the predictions, so this cell does not depend
# on `mse` from another cell.
rmse = math.sqrt(mean_squared_error(y_test, predictions_test))
rmse

In [None]:
mae = mean_absolute_error(y_test, predictions_test)
print(mae)

In [None]:
lm.score(X_test, y_test)

In [None]:
r2 = r2_score(y_test, predictions_test)
r2

In [None]:
# Some approaches you can try in this exercise:
# use the concept of multicollinearity and remove insignificant variables
# use a different method of scaling the numerical variables
# use a different ratio of train test split
# use the transformation on numerical columns which align it more towards a normal distribution

In [None]:
# Second experiment: same target, but number_of_open_complaints is left out
# of the features as well.
y = X1['total_claim_amount']
X = X1.drop(columns=['total_claim_amount', 'number_of_open_complaints'])

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape

In [None]:
#we train/fit our model 
lm = linear_model.LinearRegression()
lm.fit(X_train,y_train)

In [None]:
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

In [None]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

In [None]:
# Change of strategy to improve the model: min-max scaling instead of
# standard scaling. This rebinds `transformer` to the new scaler.
transformer = MinMaxScaler()
X_norm = transformer.fit_transform(X_num)
print(X_norm.shape)
X_num_scale = pd.DataFrame(data=X_norm, columns=X_num.columns)
X_num_scale.head()

In [None]:
# let's merge all this information together into a single dataset with all features
# NOTE(review): this rebinds X1 (previously the standard-scaled frame), so
# re-running an earlier cell that reads X1 after this one silently uses the
# min-max-scaled data instead.
X1 = pd.concat([X_num_scale, onehot_encoded ], axis=1)  # np.concatenate()
X1

In [None]:
# Same feature selection as the previous experiment, applied to the current X1
# (the min-max-scaled frame when the notebook is run top to bottom).
y = X1['total_claim_amount']
X = X1.drop(columns=['total_claim_amount', 'number_of_open_complaints'])

# NOTE(review): test_size=0.8 holds out 80% of the rows, so the model is
# fitted on only 20% of the data - confirm this ratio is intended and not a
# swapped 0.2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=90
)
lm = linear_model.LinearRegression().fit(X_train, y_train)

# R2 on the training rows.
predictions = lm.predict(X_train)
r2_score(y_train, predictions)

In [None]:
predictions_test = lm.predict(X_test)
r2_score(y_test, predictions_test)

In [None]:
mse=mean_squared_error(y_test,predictions_test)
mse

In [None]:
# Root mean squared error on the test set, in the units of the scaled target.
# Computed once, directly from the predictions, so this cell does not depend
# on `mse` from another cell.
rmse = math.sqrt(mean_squared_error(y_test, predictions_test))
rmse

In [None]:
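# Final results (min-max-scaled data, test_size=0.8): R2 is 0.58 on the training set and 0.44 on the test set.
# On the test set MSE = 0.006 and RMSE = 0.077; both are in units of the min-max-scaled target, not the original claim amounts.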