In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, plot_tree
from sklearn.metrics import mean_squared_error

In [None]:
# Load the training data from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)

In [None]:
# Remove exact duplicate rows, printing the frame's shape on either side of the step.
shape_with_duplicates = df.shape
print("before:", shape_with_duplicates)
df = df.drop_duplicates(keep='first')
shape_deduplicated = df.shape
print("After:", shape_deduplicated)

In [None]:
# Parse the two ISO-8601 timestamp columns from object dtype into datetimes,
# then print the resulting dtypes and preview the frame.
for timestamp_column in ('DateReported', 'DateTimeOfAccident'):
    df[timestamp_column] = pd.to_datetime(df[timestamp_column], format='ISO8601')
print(df['DateReported'].dtype, df['DateTimeOfAccident'].dtype)
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum(axis=0)

In [None]:
# Distinct values present in MaritalStatus (missing entries included).
df['MaritalStatus'].unique()

In [None]:
# Drop rows that have a missing value in any column other than MaritalStatus.
# MaritalStatus is excluded on purpose: a blanket dropna() would throw away every
# row with an unknown marital status, leaving nothing for the later
# fillna('Unknown') step to relabel.
print("before:", df.shape)
columns_required = [name for name in df.columns if name != 'MaritalStatus']
# Reassign instead of inplace=True so the cell does not mutate the frame it was given.
df = df.dropna(axis='index', subset=columns_required)
print("After:", df.shape)

In [None]:
# Label missing marital-status entries as 'Unknown', then list the distinct values.
df['MaritalStatus'] = df['MaritalStatus'].fillna(value='Unknown')
df['MaritalStatus'].unique()

In [None]:
# Skewness of each numeric column.
df.skew(axis=0, numeric_only=True)

In [None]:
# Grid of per-column histograms for the frame.
overview_figsize = (15, 10)
df.hist(figsize=overview_figsize)
plt.show()

In [None]:
# All columns side by side in one boxplot, sharing a single y-axis.
overview_figsize = (15, 10)
df.boxplot(figsize=overview_figsize)
plt.show()

In [38]:
def cap_outliers(series, lower_percentile=0.05, upper_percentile=0.95):
    """Clip a series to the range spanned by two of its own quantiles.

    Args:
        series: pandas Series whose values are to be capped.
        lower_percentile: Quantile (as a fraction in [0, 1]) used as the floor.
        upper_percentile: Quantile (as a fraction in [0, 1]) used as the ceiling.

    Returns:
        A new Series in which values below the lower quantile are raised to it
        and values above the upper quantile are lowered to it.
    """
    # One quantile call yields both bounds, in the order requested.
    floor, ceiling = series.quantile([lower_percentile, upper_percentile])
    return series.clip(lower=floor, upper=ceiling)

In [39]:
# Cap every numeric column with cap_outliers' default quantile bounds.
columns_to_cap = df.select_dtypes(include=['number']).columns.tolist()

for numeric_column in columns_to_cap:
    capped_values = cap_outliers(df[numeric_column])
    df[numeric_column] = capped_values

In [None]:
# Redraw the per-column histogram grid for comparison with the earlier one.
overview_figsize = (15, 10)
df.hist(figsize=overview_figsize)
plt.show()

In [None]:
# Redraw the combined boxplot for comparison with the earlier one.
overview_figsize = (15, 10)
df.boxplot(figsize=overview_figsize)
plt.show()

In [None]:
# Draw one standalone boxplot per capped column so each gets its own y-scale.
for col in columns_to_cap:
    fig, ax = plt.subplots(figsize=(8, 5))  # Adjust figsize as needed
    df[col].plot.box(ax=ax)
    ax.set_title(col)  # Title each boxplot with its column name
    plt.show()
    # Release the figure explicitly. Calling plt.clf() at this point would act on
    # a figure that has already been shown and, under the inline backend, would
    # just open a new blank one on every iteration.
    plt.close(fig)

In [None]:
# Skewness of each numeric column, recomputed for comparison with the earlier figures.
df.skew(axis=0, numeric_only=True)

In [None]:
# Histogram of accident timestamps across 100 equal-width time bins.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['DateTimeOfAccident'], bins=100, edgecolor='black')
ax.set_xlabel('Date and Time of Accident')
ax.set_ylabel('Frequency')
ax.set_title('Accident Date and Time Distribution')
# ax.grid(True)  # Optional: add grid lines
plt.show()

In [None]:
# Count rows whose report timestamp is earlier than the accident timestamp.
reporting_delay = df['DateReported'] - df['DateTimeOfAccident']
(reporting_delay.dt.days < 0).sum()

In [None]:
# Show the rows in which the claim was reported before the recorded accident time.
reported_before_accident = (df['DateReported'] - df['DateTimeOfAccident']).dt.days < 0
df.loc[reported_before_accident]

In [12]:
# Split the accident timestamp into a calendar-date column and a time-of-day
# column, then move both so they sit directly after DateTimeOfAccident.
accident_timestamp = df['DateTimeOfAccident']
df['DateOfAccident'] = accident_timestamp.dt.date
df['TimeOfAccident'] = accident_timestamp.dt.time

datetime_col_index = df.columns.get_loc('DateTimeOfAccident')

# get_loc can return a slice or boolean mask when the label is not unique;
# the positional inserts below need a plain integer.
if not isinstance(datetime_col_index, int):
    raise ValueError("`datetime_col_index` must be an integer.")

for offset, derived_column in enumerate(('DateOfAccident', 'TimeOfAccident'), start=1):
    df.insert(datetime_col_index + offset, derived_column, df.pop(derived_column))

In [None]:
# Inspect the dtype of every column in the frame.
df.dtypes

In [None]:
# Convert DateOfAccident to a UTC-aware datetime column, print its dtype next to
# TimeOfAccident's, and list all column dtypes.
date_as_utc = pd.to_datetime(df['DateOfAccident'], format='ISO8601', utc=True)
df['DateOfAccident'] = date_as_utc
print(df['DateOfAccident'].dtype, df['TimeOfAccident'].dtype)
df.dtypes

In [None]:
# Same check as before but against the date-only column: DateOfAccident carries
# no time of day, so same-day reports are no longer counted as negative.
delay_from_accident_date = df['DateReported'] - df['DateOfAccident']
(delay_from_accident_date.dt.days < 0).sum()

In [None]:
# Distribution of accidents over the hours of the day.
# The hour is taken from the timestamp column with the vectorised .dt accessor
# rather than by looping over time objects in Python.
hour_values = df['DateTimeOfAccident'].dt.hour

plt.figure(figsize=(10, 6))
# Explicit edges 0..24 give exactly one bin per clock hour. A bare bins=24 would
# instead cut the observed min-max range into 24 equal slices that do not line
# up with hour boundaries.
plt.hist(hour_values, bins=range(25), edgecolor='black')
plt.xlabel('Hour of Accident')
plt.ylabel('Frequency')
plt.title('Accident Hour Distribution')
plt.show()

In [None]:
# Summary statistics using pandas' default column selection.
df.describe()

In [None]:
# Summary statistics restricted to object-dtype columns.
df.describe(include=['object'])

In [None]:
# Distinct values present in the Gender column.
df['Gender'].unique()

In [None]:
# Mean claimant age for each distinct time of accident.
grouped_data = df.groupby(by='TimeOfAccident')['Age'].mean()

# Bar chart of those means, one bar per time value.
ax = grouped_data.plot.bar(title='Average Age by Time of Accident')
ax.set_xlabel('Time of Accident')
ax.set_ylabel('Average Age')
plt.show()

In [None]:
# List the column labels currently in the frame.
df.columns

In [22]:
# Keep only the numeric columns; this subset feeds the correlation matrix.
df_numeric = df.select_dtypes(include=['number'])

In [None]:
# Pairwise correlations between the numeric columns.
correlation_matrix = df_numeric.corr()

# Boolean mask that is True on and above the diagonal, so the heatmap shows only
# the strictly-lower triangle (each pair once, no self-correlations).
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Annotated heatmap of the unmasked cells.
ax = sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", mask=mask)

ax.set_title("Correlation Matrix")
ax.set_xlabel("Features")
ax.set_ylabel("Features")
plt.show()

In [None]:
# List the column labels again before selecting the modelling columns.
df.columns

In [None]:
# Columns passed to the encoder. The date/time columns (DateTimeOfAccident,
# DateOfAccident, DateReported) are deliberately left out.
# NOTE(review): 'InitialIncurredCalimsCost' presumably mirrors the spelling of the
# CSV header - confirm before "correcting" it.
model_columns = [
    "Age",
    "Gender",
    "MaritalStatus",
    "DependentChildren",
    "DependentsOther",
    "WeeklyWages",
    "PartTimeFullTime",
    "HoursWorkedPerWeek",
    "DaysWorkedPerWeek",
    "InitialIncurredCalimsCost",
    "UltimateIncurredClaimCost",
]

# One-hot encode with get_dummies' default column handling.
df_encoded = pd.get_dummies(df[model_columns])
df_encoded.head()

In [159]:
# Separate the target from the features, then hold out 20% of the rows for
# testing with a fixed seed so the split is repeatable.
target_column = 'UltimateIncurredClaimCost'
y = df_encoded[target_column]
X = df_encoded.drop(columns=[target_column])

split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train an XGBoost regressor with the library's default hyperparameters.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

In [None]:
# Render the first tree of the fitted ensemble.
# plot_tree creates its own default-sized figure unless it is handed an Axes, so
# the axes of the sized figure are passed in explicitly; otherwise the figsize
# would apply to an empty figure and the tree would be drawn at default size.
fig, ax = plt.subplots(figsize=(20, 8))  # Adjust the width and height as needed
plot_tree(model, num_trees=0, ax=ax)
plt.show()

In [None]:
# Score the fitted model on the held-out split.
# NOTE(review): for scikit-learn-style regressors score() is R^2 - confirm for this estimator.
model.score(X_test, y_test)

In [None]:
# Predict on the held-out split and report the root-mean-squared error.
y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(test_mse)
print("RMSE:", rmse)

In [None]:
# Refit while recording the evaluation metric on the held-out split after every
# boosting round.
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# Access training and evaluation results
results = model.evals_result()
# The history holds one value per boosting round. Index 0 would be the score
# after only the first round, so take the last entry for the finished model.
rmse = results['validation_0']['rmse'][-1]
print("RMSE:", rmse)

In [None]:
# Scatter of held-out targets against the model's predictions for them.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set_xlabel("Actual Values")
ax.set_ylabel("Predicted Values")
ax.set_title("Actual vs. Predicted Values")
plt.show()

In [None]:
# Target value of every row plotted against its index label.
fig, ax = plt.subplots()
ax.scatter(df_encoded.index, df_encoded['UltimateIncurredClaimCost'])
# Plain tick labels: no scientific notation and no offset on the y-axis.
ax.ticklabel_format(axis='y', style='plain', useOffset=False)
ax.set_xlabel("Index")
ax.set_ylabel("Ultimate Incurred Claim Cost")
ax.set_title("Ultimate Incurred Claim Cost")
plt.show()

In [None]:
# Keep only the rows whose ultimate claim cost is below 3.5 million.
below_threshold = df_encoded['UltimateIncurredClaimCost'].lt(3_500_000)
filtered_df = df_encoded.loc[below_threshold]

# Index labels and target values of the retained rows.
filtered_index = filtered_df.index
filtered_cost = filtered_df["UltimateIncurredClaimCost"]

fig, ax = plt.subplots()
ax.scatter(filtered_index, filtered_cost)
# Plain tick labels: no scientific notation and no offset on the y-axis.
ax.ticklabel_format(axis='y', style='plain', useOffset=False)
ax.set_xlabel("Index")
ax.set_ylabel("Ultimate Incurred Claim Cost")
ax.set_title("Ultimate Incurred Claim Cost (Below 3.5 Million)")
plt.show()

In [None]:
# Importance scores from the fitted model, in the training frame's column order.
feature_importance = model.feature_importances_

# One "name: score" line per feature, scores rounded to two decimals.
for position, feature in enumerate(X_train.columns):
    importance = feature_importance[position]
    print(f"{feature}: {importance:.2f}")

In [None]:
# Bar chart of the importance scores, one bar per training column.
fig, ax = plt.subplots()
ax.bar(X_train.columns, feature_importance)
ax.set_xlabel("Features")
ax.set_ylabel("Importance")
ax.set_title("Feature Importance")
# Feature names are long, so turn the x tick labels vertical.
ax.tick_params(axis='x', labelrotation=90)
plt.show()