# Importing libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the vehicle-insurance CSV (path is relative to the working directory) into a DataFrame.
df= pd.read_csv(r'../input/vehicle-insurance-data/VehicleInsuranceData.csv')

In [None]:
# Preview the first rows of the frame (head() shows five rows by default).
df.head()

# Checking for missing values

In [None]:
# Colour every cell of the frame by whether it is null, to spot missing values at a glance.
sns.heatmap(df.isna(), cmap='viridis', cbar=False, yticklabels=False)

# Check for outliers

In [None]:
# Distribution of the clv column. The series is passed as x= explicitly because
# the meaning of the first positional argument of boxplot differs between
# seaborn releases; the keyword gives the same horizontal plot everywhere.
sns.boxplot(x=df.clv)

In [None]:
# Keep only rows whose clv lies strictly between 2500 and 15000.
# The cut-offs were chosen from the boxplot: points lying beyond the whiskers
# (by default 1.5 * IQR below Q1 or above Q3) are treated as outliers -- not
# every value that merely falls outside the Q1..Q3 box.
# .copy() makes the result an independent frame, so the column assignments done
# later modify it directly rather than a slice of the original data.
df = df[(df.clv > 2500) & (df.clv < 15000)].copy()

In [None]:
# Row/column count of the frame after the clv range filter.
df.shape

# using encoder
 Convert categorical (text) features into numeric codes. This step is necessary because the statistical tools,
 transformations and models used below require numeric input.

In [None]:
from sklearn import preprocessing

# A single encoder instance is re-fitted for each text column, so only the
# classes of the last encoded column remain on the object after the loop.
encoder = preprocessing.LabelEncoder()

for col in df.columns:
    # .iloc[0] reads the first row by position. df[col][0] would look up the
    # index *label* 0, which may no longer exist once rows have been filtered
    # out of the frame, and would then raise a KeyError.
    if isinstance(df[col].iloc[0], str):
        df[col] = encoder.fit_transform(df[col])

In [None]:
# Inspect the first two rows after the text columns have been label-encoded.
df.head(2)

# Calculating VIF and Correlation coefficients

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# add_constant augments the frame with a constant column before the VIFs are computed.
X_vif = add_constant(df)

# One variance inflation factor per column of the augmented matrix,
# labelled with the corresponding column name.
design = X_vif.values
vif_scores = [variance_inflation_factor(design, col_pos)
              for col_pos in range(design.shape[1])]
pd.Series(vif_scores, index=X_vif.columns)

In [None]:
# Pairwise correlations between all columns of the dataset.
corrmat = df.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the heat map straight from the matrix computed above:
# df[top_corr_features].corr() would only recompute the same correlations.
g = sns.heatmap(corrmat, annot=True, cmap="RdYlGn")

# Features selection and recognizing the target(label)
We selected the features based on the VIF values and correlation coefficients. For more details, see the README file.

In [None]:

# Predictor columns retained for modelling.
X = df.loc[:, [
    'Coverage',
    'Monthly.Premium.Auto',
    'Number.of.Policies',
    'Renew.Offer.Type',
    'Total.Claim.Amount',
    'Vehicle.Class',
]]

# Regression target: the clv column.
y = df.loc[:, 'clv']

#  log transformation and normalization

In [None]:
# Log-transform the predictors; the +1 shift keeps zero-valued entries finite (log(1) = 0).
drake= np.log(X+1)

In [None]:
from sklearn.preprocessing import StandardScaler 
  
# Standardise the log-transformed predictors with a scaler fitted on those same rows.
# NOTE(review): the scaler is fitted on all rows before any train/test split is made.
scalar = StandardScaler()
scaled_data = scalar.fit(drake).transform(drake)

In [None]:
# Log-transform the target; the models below are trained and scored on this log-scale series.
kiki = np.log(y)

In [None]:
# Wrap the scaled array back into a DataFrame. Column labels and the row index
# are taken from X, so they stay in sync with the selected features and with the
# target's index, instead of relying on a hand-copied name list and a fresh
# 0..n-1 index that no longer matches the filtered rows.
scaled_data = pd.DataFrame(data=scaled_data, columns=X.columns, index=X.index)

# Test train split

In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 30% of the rows as a test set for the plain linear model.
# NOTE(review): the later splits in this file use different test_size / random_state
# values, so the models are not scored on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(scaled_data , kiki, test_size=0.3, random_state=200)

# Applying different regression techniques


## linear regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Linear regression estimator; the same object is fitted again further down on the polynomial features.
lm = LinearRegression()

In [None]:
# Fit on the scaled predictors against the log-scale target.
lm.fit(X_train,y_train)

In [None]:
# In-sample score of the fitted model, shown as a percentage (the value comes from lm.score, not a classification accuracy).
print('Training accuracy=',lm.score(X_train,y_train)*100)

In [None]:
# Predictions for the held-out rows, on the same log scale as the target.
pred = lm.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
# Explained-variance score on the held-out set, as a percentage (printed under the label 'Prediction accuracy').
print('Prediction accuracy =',metrics.explained_variance_score(y_test, pred)*100)

In [None]:
# Error metrics on the held-out set (log-scale target); RMSE is derived from the MSE computed once.
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Compare every 10th entry of the first 100 test rows: predictions in blue, targets in yellow.
fig = plt.figure(figsize=(10, 6))
sample_positions = np.arange(1, 100, 10)
plt.scatter(sample_positions, pred[0:100:10], color='blue')
plt.scatter(sample_positions, y_test[0:100:10], color='yellow')

plt.legend(['prediction', 'test'])

In [None]:
# Fitted linear coefficients, one row per selected feature.
cdf = pd.DataFrame(data=lm.coef_, index=X.columns, columns=['coeff'])
cdf

# Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Degree-3 polynomial expansion of the first scaled feature, fitted with its own LinearRegression.
# X_poly is reassigned by a later cell, so this array is only used for the fit below.
poly = PolynomialFeatures(degree = 3) 
X_poly = poly.fit_transform(scaled_data.iloc[:,0].values.reshape(-1,1)) 
  

# NOTE(review): this fit targets the raw y, whereas the other models in this file
# are trained on the log-transformed kiki -- confirm this is intended.
lin2 = LinearRegression() 
lin2.fit(X_poly, y)

In [None]:
# Dimensions of the expanded single-feature matrix.
X_poly.shape

In [None]:
from sklearn.preprocessing import PolynomialFeatures 

def check_exp(inp,degree,out):
    """Fit a polynomial regression of the given degree and return its in-sample score.

    Args:
        inp: 2-D feature array that is expanded with ``PolynomialFeatures``.
        degree: Polynomial degree used for the expansion.
        out: Target values passed to ``LinearRegression.fit``.

    Returns:
        The value of ``LinearRegression.score`` evaluated on the same data the
        model was fitted on.
    """
    expanded = PolynomialFeatures(degree=degree).fit_transform(inp)
    model = LinearRegression().fit(expanded, out)
    return model.score(expanded, out)

import matplotlib.pyplot as plt

n_features = X.shape[1]

# One tall figure shared by all features. Creating the figure inside the loop
# opened a new figure per feature, each holding a single panel of the n x 1 grid.
plt.figure(figsize=(4, n_features*2))

for a in range(n_features):
    # In-sample score of a single-feature polynomial fit for degrees 0..9.
    column = X.iloc[:, a].values.reshape(-1, 1)
    acc = [check_exp(column, i, y) for i in range(10)]

    # The three-argument form addresses the same (rows, 1, position) panel as the
    # packed three-digit code, but stays valid for more than nine features.
    plt.subplot(n_features, 1, a+1)
    plt.title('column : '+str(a))
    plt.xlabel('degrees')
    plt.ylabel('accuracy')
    plt.plot(acc)

# Keep titles and axis labels of the stacked panels from overlapping.
plt.tight_layout()

In [None]:
# Degree-1 expansion of feature column 0 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=1)
X_poly = poly.fit_transform(X.iloc[:, [0]].values)

In [None]:
# Degree-1 expansion of feature column 1 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=1)
X_poly1 = poly.fit_transform(X.iloc[:, [1]].values)

In [None]:
# Degree-4 expansion of feature column 2 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=4)
X_poly2 = poly.fit_transform(X.iloc[:, [2]].values)

In [None]:
# Degree-2 expansion of feature column 3 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=2)
X_poly3 = poly.fit_transform(X.iloc[:, [3]].values)

In [None]:
# Degree-1 expansion of feature column 4 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=1)
X_poly4 = poly.fit_transform(X.iloc[:, [4]].values)

In [None]:
# Degree-2 expansion of feature column 5 of X, passed as a single-column 2-D array.
poly = PolynomialFeatures(degree=2)
X_poly5 = poly.fit_transform(X.iloc[:, [5]].values)

In [None]:
# Stack the six per-feature expansions side by side into one design matrix.
# NOTE(review): each expansion presumably carries its own constant (bias) column,
# which would leave several identical columns in Xo -- confirm.
Xo = np.hstack([X_poly, X_poly1, X_poly2, X_poly3, X_poly4, X_poly5])

In [None]:
# Dimensions of the combined polynomial design matrix.
Xo.shape

In [None]:
# Display the combined polynomial design matrix.
Xo

In [None]:
# Re-split using the concatenated polynomial features; this overwrites the
# X_train/X_test/y_train/y_test produced for the plain linear model.
# NOTE(review): test_size=0.33 / random_state=42 differ from the first split (0.3 / 200).
X_train, X_test, y_train, y_test = train_test_split(Xo, kiki, test_size=0.33, random_state=42)

In [None]:
# Refit the existing lm object on the polynomial features; this replaces the
# coefficients learned earlier for the plain linear model.
lm.fit(X_train,y_train)
# In-sample score of the refitted model, shown as a percentage.
print('Training score =',lm.score(X_train,y_train)*100,'%')

In [None]:
# Predictions of the polynomial model for the held-out rows (log-scale target).
pred = lm.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Explained-variance score on the held-out set, as a percentage (printed under the label 'Prediction accuracy').
print ('Prediction accuracy =',metrics.explained_variance_score(y_test, pred)*100,'%')

In [None]:
# Coefficients of the refitted model, one row per column of the polynomial design matrix.
cdf = pd.DataFrame({'coeff': lm.coef_})
cdf

In [None]:
# Held-out error metrics for the polynomial model; the MSE is computed once and reused for the RMSE.
abs_err = metrics.mean_absolute_error(y_test, pred)
sq_err = metrics.mean_squared_error(y_test, pred)
print('MAE:', abs_err)
print('MSE:', sq_err)
print('RMSE:', np.sqrt(sq_err))

In [None]:
# Every 10th entry of the first 100 test rows: polynomial-model predictions (blue) vs targets (yellow).
fig = plt.figure(figsize=(10, 6))
x_positions = np.arange(1, 100, 10)
for values, colour in ((pred[0:100:10], 'blue'), (y_test[0:100:10], 'yellow')):
    plt.scatter(x_positions, values, color=colour)

plt.legend(['prediction', 'test'])

# XGBoost Regressor 

In [None]:
# Split the scaled predictors again for the XGBoost model; this overwrites the previous split variables.
# NOTE(review): random_state=42 here differs from the first linear-model split (200),
# so the two models are evaluated on different held-out rows.
X_train, X_test, y_train, y_test = train_test_split(scaled_data, kiki, test_size=0.3, random_state=42)

In [None]:
from xgboost import XGBRegressor

In [None]:
# Hold out part of the training data purely for early stopping. Monitoring the
# training set itself gives a loss that keeps improving as trees are added, so
# the stopping rule has nothing meaningful to react to.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

my_model = XGBRegressor(n_estimators=1000, learning_rate=0.1)
# Stop adding trees once the validation metric has not improved for 5 rounds.
# NOTE(review): recent xgboost releases expect early_stopping_rounds on the
# estimator constructor rather than in fit() -- confirm against the installed version.
my_model.fit(X_fit, y_fit, early_stopping_rounds = 5,
             eval_set=[(X_val, y_val)], verbose=False)

In [None]:
# Score of the boosted model on the training rows, scaled to a percentage.
my_model.score(X_train, y_train)*100

In [None]:
# Predictions of the boosted model for the held-out rows (log-scale target).
pred = my_model.predict(X_test)


In [None]:
# Explained-variance score on the held-out set, as a percentage (printed under the label 'Prediction accuracy').
print('Prediction accuracy =',metrics.explained_variance_score(y_test, pred)*100)

In [None]:
# Held-out error metrics for the boosted model, gathered first and then printed in a fixed order.
test_mse = metrics.mean_squared_error(y_test, pred)
report = (
    ('MAE:', metrics.mean_absolute_error(y_test, pred)),
    ('MSE:', test_mse),
    ('RMSE:', np.sqrt(test_mse)),
)
for label, value in report:
    print(label, value)

In [None]:
# Every 10th entry of the first 100 test rows: boosted-model predictions (blue) vs targets (yellow).
fig = plt.figure(figsize=(10, 6))
ticks = np.arange(1, 100, 10)
sampled_pred = pred[0:100:10]
sampled_true = y_test[0:100:10]
plt.scatter(ticks, sampled_pred, color='blue')
plt.scatter(ticks, sampled_true, color='yellow')

plt.legend(['prediction', 'test'])