In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import quantile_transform
from sklearn.preprocessing import minmax_scale
from sklearn.preprocessing import OneHotEncoder  ##. better to use dummy from pandas 
from sklearn.preprocessing import PowerTransformer
from scipy.stats import boxcox
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from scipy.stats import boxcox
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
pd.options.display.max_rows = 50

Activity
(Tuesday)

# Linear Regression

- X-y split (y is the target variable, i.e. the total claim amount).

- Train-test split.

- Standardize the data (after the data split).

- Apply linear regression.

- Model interpretation.

In [None]:
# Load the raw customer/marketing dataset from the project's data folder and display it.
auto_customer_df = pd.read_csv(
    filepath_or_buffer="data/Data_Marketing_Customer_Analysis_Round3.csv"
)
auto_customer_df

In [None]:
# Scatter-matrix overview of the raw frame (pairwise relationships between its columns).
sns.pairplot(data=auto_customer_df)

In [None]:
# Keep only the numeric columns of the raw frame.
# The explicit .copy() makes this an independent DataFrame: later cells add columns to it,
# and without the copy pandas may emit SettingWithCopyWarning for those assignments.
auto_numerical_df = auto_customer_df.select_dtypes(include=np.number).copy()

In [None]:
# Pairwise correlations of the numeric columns, drawn as a lower-triangle heatmap.
corr = auto_numerical_df.corr()

# Non-zero entries mark cells to hide: the upper triangle, main diagonal included.
mask = np.triu(np.ones_like(corr))

with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(9, 7))
    ax = sns.heatmap(
        corr,
        mask=mask,
        ax=ax,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        annot=True,
        square=True,
    )

In [None]:
# One histogram per numeric column, to inspect each distribution before transforming.
auto_numerical_df.hist(figsize=(12,12))

In [None]:
# Power-transform customer_lifetime_value (PowerTransformer with its default settings) and
# plot the distribution of the transformed values.
#
# The column is read from the raw frame rather than from auto_numerical_df: a later cell
# drops it from auto_numerical_df, so reading it there makes this cell raise KeyError
# whenever it is re-run. The values are the same in both frames.
pt = PowerTransformer()
customer_lifetime_value_transformed = pt.fit_transform(
    auto_customer_df['customer_lifetime_value'].to_numpy().reshape(-1, 1)  # 2-D, one column
)
sns.displot(customer_lifetime_value_transformed)

In [None]:
# Swap the raw column for its power-transformed version in the numeric frame.
auto_numerical_df["customer_lifetime_value_transformed"] = customer_lifetime_value_transformed

# errors="ignore" keeps this cell re-runnable: on a second run the raw column is already
# gone, and a plain drop would raise KeyError.
auto_numerical_df = auto_numerical_df.drop(
    columns=["customer_lifetime_value"], errors="ignore"
)

In [None]:
# Power-transform monthly_premium_auto and plot the distribution of the transformed values.
#
# `pt` is re-fitted here, so from this point on its learned parameters describe
# monthly_premium_auto, not the column it was fitted on before.
#
# The column is read from the raw frame rather than from auto_numerical_df: a later cell
# drops it from auto_numerical_df, so reading it there makes this cell raise KeyError
# whenever it is re-run. The values are the same in both frames.
monthly_premium_auto_transformed = pt.fit_transform(
    auto_customer_df['monthly_premium_auto'].to_numpy().reshape(-1, 1)  # 2-D, one column
)

# Plot the variable computed in this cell (the original plotted the customer-lifetime-value
# array again, a copy-paste slip).
sns.displot(monthly_premium_auto_transformed)

In [None]:
# Swap the raw column for its power-transformed version in the numeric frame.
auto_numerical_df["monthly_premium_auto_transformed"] = monthly_premium_auto_transformed

# errors="ignore" keeps this cell re-runnable: on a second run the raw column is already
# gone, and a plain drop would raise KeyError.
auto_numerical_df = auto_numerical_df.drop(
    columns=["monthly_premium_auto"], errors="ignore"
)

In [None]:
# Show the raw frame again for reference before choosing which columns to encode and scale.
auto_customer_df


In [None]:
# Column names handed to the OneHotEncoder step of the ColumnTransformer.
cols_to_hot_encode = [
    "region",
    "coverage",
    "education",
    "employment_status",
    "gender",
]

In [None]:
# Column names handed to the StandardScaler step of the ColumnTransformer.
cols_to_standardize = [
    "customer_lifetime_value",
    "monthly_premium_auto",
]

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Column-wise preprocessing: standardize the listed numeric columns, one-hot encode the
# listed categorical columns, and pass every other column through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), cols_to_standardize),
        ('encode', OneHotEncoder(), cols_to_hot_encode),
    ],
    remainder='passthrough',
    verbose_feature_names_out=True,
)

# Deliberately NOT fitted in this cell: X_train is only created by the train/test split
# further down, so calling transformer.fit(X_train) here raises NameError on a fresh kernel.
# Fit it on the training frame after the split, while that frame still contains the
# columns named in cols_to_standardize and cols_to_hot_encode.
transformer

In [None]:
# X-y split: the target is total_claim_amount, the features are every other column.
# The split is taken from the DataFrame; the ColumnTransformer object has neither .drop()
# nor a total_claim_amount attribute, so using it here raised AttributeError.
# NOTE(review): auto_customer_df is assumed to be the intended source frame — confirm.
X = auto_customer_df.drop(columns='total_claim_amount')
y = auto_customer_df['total_claim_amount']

In [None]:
# Keep only the numeric feature columns. Uses the public select_dtypes API (as the
# auto_numerical_df cell does) instead of the private DataFrame._get_numeric_data().
X = X.select_dtypes(include=np.number)

In [None]:
# One histogram per feature column of X, to check the distributions going into the model.
X.hist(figsize=(14,14))

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=123,
)


In [None]:
# Learn the scaling parameters (per-feature mean and variance) from the training rows only.
std_scaler = StandardScaler()
std_scaler.fit(X_train)

# Apply those training-set parameters to the training features.
X_train_scaled = std_scaler.transform(X_train)

In [None]:
# (rows, columns) of the training feature frame.
X_train.shape

In [None]:
# Scale the test features with the scaler that was fitted on X_train (no re-fitting on test data).
X_test_scaled=std_scaler.transform(X_test)

In [None]:
# Ordinary least squares with statsmodels, used for its coefficient summary table.
#
# The scaled arrays are wrapped back into DataFrames that carry the original column names,
# so the summary lists features by name instead of x1..xn. The index is copied from the
# unscaled frames so that it matches the index of y_train / y_test.
#
# has_constant="add" always adds the intercept column. The default ("skip") silently omits
# it when some column already looks constant, which would leave train and test with a
# different number of columns.
X_train_const_scaled = sm.add_constant(
    pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index),
    has_constant="add",
)

# Note: the name `model` is rebound to a scikit-learn LinearRegression in the next cell.
model = sm.OLS(y_train, X_train_const_scaled).fit()
predictions_train = model.predict(X_train_const_scaled)

X_test_const_scaled = sm.add_constant(
    pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index),
    has_constant="add",
)
predictions_test = model.predict(X_test_const_scaled)

# The statsmodels summary is a plain-text table, so print() is the right way to show it.
print_model = model.summary()
print(print_model)

In [None]:
# scikit-learn linear regression fitted on the standardized training features.
# This rebinds `model`, replacing the statsmodels OLS result created in the previous cell.
model = LinearRegression()
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Predictions on the rows the model was fitted on ...
y_pred_train = model.predict(X_train_scaled)

# ... and on the held-out test rows.
y_pred = model.predict(X_test_scaled)

In [None]:
# Predicted target values for the test rows.
y_pred

In [None]:
# Actual target values for the test rows, for comparison with y_pred.
y_test

# Model Validation

## Model Evaluation:

- MSE.

- RMSE.

- MAE.

- R2.

- Adjusted R2.

- Feature importance.


In [None]:
# Actual vs. predicted target values for the test rows, side by side.
result = pd.DataFrame(
    data={
        "y_test": y_test,
        "y_pred": y_pred,
    }
)
result

In [None]:
# Predicted (x) against actual (y) values: red points with a black fitted regression line.
sns.regplot(
    data=result,
    x='y_pred',
    y='y_test',
    scatter_kws=dict(color="red"),
    line_kws=dict(color="black"),
)

In [None]:
# Mean squared error on the test set, plus its square root. RMSE is on the evaluation
# checklist for this notebook but was never computed; it is the square root of the MSE and
# is therefore expressed in the same units as the target.
mse_test = mse(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
pd.Series({"MSE": mse_test, "RMSE": rmse_test})

In [None]:
# Mean absolute error of the test-set predictions.
mae(y_true=y_test, y_pred=y_pred)

In [None]:
# Coefficient of determination of the test-set predictions.
R2 = r2_score(y_true=y_test, y_pred=y_pred)
R2

In [None]:
# R2 as reported by the fitted estimator itself, on the training rows and on the test rows;
# comparing the two shows how well the fit generalizes.
R2_train = model.score(X=X_train_scaled, y=y_train)
R2_test = model.score(X=X_test_scaled, y=y_test)

In [None]:
# R2 on the held-out test rows.
R2_test

In [None]:
# R2 on the training rows.
R2_train

In [None]:
# Adjusted R2 on the test set: 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n = number of test rows and p = number of feature columns.
n_obs = len(y_test)
n_features = X_test.shape[1]
Adj_R2 = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_features - 1)
Adj_R2

In [None]:
# Rank the features by the absolute size of their fitted coefficient, largest first.
# The model was fitted on standardized features, so the magnitudes share one scale.
features_importances = pd.DataFrame(
    {
        'Attribute': X_train.columns,
        'Importance': np.abs(model.coef_),
    }
).sort_values(by='Importance', ascending=False)

features_importances