In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test CSV files from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Lower-triangle correlation heatmap of the training frame.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on string
# columns instead of silently dropping them.
corr = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, annot=True)
plt.show()

In [None]:
## Heatmap of the null mask of df, to show which features have missing values
sns.heatmap(
    df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Bar chart of how many rows each insurance company has in the training data.
company_counts = df['Insurance_company'].value_counts()
x = list(company_counts.keys())
y = company_counts.values
sns.barplot(x=x, y=y)

In [None]:
# Kernel density estimate of the Cost_of_vehicle column.
df['Cost_of_vehicle'].plot.kde()

In [None]:
# Kernel density estimate of the Min_coverage column.
df['Min_coverage'].plot.kde()


In [None]:
# Kernel density estimate of the Max_coverage column.
df['Max_coverage'].plot.kde()

In [None]:
## Scatter plot of 'Amount' against 'Max_coverage', coloured by 'Condition'
sns.scatterplot(
    data=df,
    x='Max_coverage',
    y='Amount',
    hue='Condition',
)

In [None]:
# Number of training rows per value of the Condition column.
df['Condition'].value_counts()

In [None]:
# Distribution of Amount for each Condition value.
sns.boxplot(data=df, x='Condition', y='Amount')

In [None]:
# Box plot of Amount before any outlier removal.
df['Amount'].plot.box()

In [None]:
## Removing Outliers

# Rows whose Amount exceeds this value are treated as outliers and removed.
AMOUNT_OUTLIER_THRESHOLD = 12000

out = df['Amount'] > AMOUNT_OUTLIER_THRESHOLD
print(int(out.sum()), 'outlier rows removed')
# Filter with the boolean mask itself rather than feeding positional indices
# from np.where into the label-based DataFrame.drop: the two only coincide
# while the index is the default 0..n-1 range. Rows with a missing Amount
# compare as False and are therefore kept.
df = df.loc[~out].reset_index(drop=True)

In [None]:
# Box plot of Amount again, to inspect the distribution after outlier removal.
df['Amount'].plot.box()

In [None]:
# Show the rows whose Amount is negative.
df.loc[df['Amount'] < 0]

In [None]:
# Drop every row with a negative Amount by condition instead of by the
# hard-coded label 641: this keeps working if the data (or the preceding
# filtering) changes, and re-running the cell no longer raises KeyError.
df.drop(index=df.index[df['Amount'] < 0], inplace=True)

In [None]:
# Pairwise relationships between the columns of df, coloured by Condition,
# with KDE curves on the diagonal.
sns.pairplot(data=df, hue='Condition', diag_kind='kde')

In [None]:
# Column dtypes and non-null counts of the training frame after the row removals above.
df.info()

In [None]:
# Missing-value count per column, restricted to rows whose Amount is not 0
# (rows with a missing Amount also pass this filter, since NaN != 0).
df.loc[df['Amount'] != 0].isna().sum()

# From here on we work only with the data of damaged vehicles and fit a regression model to predict Amount for them.

In [None]:
## Separating damaged vehicles data

# Keep only the rows with Condition == 1 (the damaged vehicles).
# reset_index(drop=True) renumbers the rows 0..n-1 and returns a new,
# independent frame, so the later column fills on target_df do not act on a
# filtered slice of df (which triggers SettingWithCopyWarning).
target_df = df.loc[df['Condition'] == 1].reset_index(drop=True)
target_df

In [None]:
# Column dtypes and non-null counts of the damaged-vehicle subset.
target_df.info()

In [None]:
# Heatmap of the null mask of target_df, to show which features have missing values.
sns.heatmap(
    target_df.isna(),
    cmap='viridis',
    cbar=False,
    yticklabels=False,
)

In [None]:
# Pairwise relationships between the columns of the damaged-vehicle subset.
sns.pairplot(data=target_df)

In [None]:
# Lower-triangle correlation heatmap of the damaged-vehicle subset.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on string
# columns instead of silently dropping them.
corr = target_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, annot=True)
plt.show()

# The features Cost_of_vehicle, Min_coverage and Max_coverage are highly correlated with each other (correlation of 1), so we will keep only one of them in the end.

In [None]:
# Absolute correlation of each feature with Amount, strongest first.
# Restricting to numeric/bool columns avoids the error pandas >= 2.0 raises
# when .corr() meets string columns.
numeric_corr = target_df.select_dtypes(include=['number', 'bool']).corr()
numeric_corr['Amount'].abs().sort_values(ascending=False)

In [None]:
# Side-by-side density estimates of the three mutually correlated features.
plt.figure()
ax0, ax1, ax2 = (plt.subplot(1, 3, position) for position in (1, 2, 3))
panels = zip((ax0, ax1, ax2), ('Cost_of_vehicle', 'Min_coverage', 'Max_coverage'))
for axis, column in panels:
    target_df[column].plot.kde(ax=axis)
    axis.set_xlabel(column)
plt.show()

In [None]:
# Bar chart of how many damaged-vehicle rows each insurance company has.
company_counts = target_df['Insurance_company'].value_counts()
x = list(company_counts.keys())
y = company_counts.values
sns.barplot(x=x, y=y)

In [None]:
## Filling the missing values using mean
# Fill each listed column with its own mean and bind the result back to
# target_df. Calling Series.fillna(inplace=True) on target_df['col'] is a
# chained in-place update: it warns on a filtered frame and, with pandas
# Copy-on-Write, does not modify target_df at all.
fill_cols = ['Amount', 'Cost_of_vehicle', 'Min_coverage', 'Max_coverage']
target_df = target_df.fillna(target_df[fill_cols].mean())

In [None]:
## Selecting the required features only

cols = ['Min_coverage', 'Amount', 'Insurance_company']
target_df = target_df.loc[:, cols]

In [None]:
# Display the reduced frame (Min_coverage, Amount, Insurance_company).
target_df

In [None]:
# One-hot encode Insurance_company and append the indicator columns to target_df.
df_dum = pd.get_dummies(target_df['Insurance_company'])
target_df = pd.concat((target_df, df_dum), axis='columns')
target_df

In [None]:
# Lower-triangle correlation heatmap of the selected features plus the
# one-hot company indicators. The string Insurance_company column is still
# present here, so restrict to numeric/bool columns: pandas >= 2.0 raises on
# string columns instead of silently dropping them.
corr = target_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(16,10))
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, annot=True)
plt.show()

In [None]:
# Box plots of the model input (Min_coverage) and the target (Amount), side by side.
plt.figure()
ax0, ax1 = plt.subplot(1, 2, 1), plt.subplot(1, 2, 2)
target_df['Min_coverage'].plot.box(ax=ax0)
target_df['Amount'].plot.box(ax=ax1)

In [None]:
# The raw company column is no longer needed once its one-hot indicators exist.
target_df = target_df.drop(columns=['Insurance_company'])

In [None]:
# Y is kept as a single-column DataFrame (2-D); X is every other column.
Y = target_df.loc[:, ['Amount']]
X = target_df.drop(columns=['Amount'])

In [None]:
## Standardising features and target with separate StandardScaler instances
# NOTE(review): both scalers are fitted on the full data before the
# train/test split below, so test-set statistics leak into training -
# consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
scaler_x, scaler_y = StandardScaler(), StandardScaler()
X_scaled = scaler_x.fit_transform(X)
Y_scaled = scaler_y.fit_transform(Y)

In [None]:
# Hold out 20% of the scaled data for evaluation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled,
    Y_scaled,
    train_size=0.8,
    random_state=42,
)
for features, targets in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, targets.shape)

In [None]:
# Baseline: random forest regressor, scored with R^2 on both splits.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
# Fix the seed so the printed scores are reproducible across runs.
rf = RandomForestRegressor(random_state=42)
# y_train is an (n, 1) column vector; scikit-learn expects a 1-D target here
# and otherwise emits a DataConversionWarning.
rf.fit(x_train, y_train.ravel())
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

In [None]:
# Gradient-boosted trees, scored with R^2 on both splits.
from xgboost import XGBRegressor
xgb = XGBRegressor(
    booster='gbtree',
    n_estimators=1000,
    max_depth=5,
    learning_rate=0.1,
    reg_alpha=0.3,
    reg_lambda=0.01,
    n_jobs=-1,
)
xgb.fit(x_train, y_train)
y_train_pred, y_test_pred = xgb.predict(x_train), xgb.predict(x_test)
for actual, fitted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(actual, fitted))

# Artificial Neural Network (ANN)

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tensorflow.keras.layers import BatchNormalization

In [None]:
def plotHistory(history):
    """Print the lowest validation MSE and plot every tracked metric per epoch.

    `history` is the object returned by `fit`; its `.history` mapping must
    contain a "val_mse" entry.
    """
    per_epoch = history.history
    print("Min. Validation MSE", min(per_epoch["val_mse"]))
    pd.DataFrame(per_epoch).plot(figsize=(12, 6))
    plt.show()
# Stop once val_loss has not improved for 10 epochs, and save the full model
# (not only the weights) each time val_loss reaches a new best.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='auto')
best_checkpoint = keras.callbacks.ModelCheckpoint(
    './reg_model_{val_loss:.3f}.h5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=False,
)
callbacks_list = [early_stopping, best_checkpoint]

In [None]:
# Fully connected regression network: four hidden ReLU layers of decreasing
# width (with dropout, and batch normalisation on the first three) followed
# by a single linear output unit.
epochs = 50
reg_model = Sequential([
    Dense(1024, activation='relu', input_shape=(x_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(128, activation='relu'),
    Dropout(0.2),

    Dense(1),
])
# summary() prints the table itself and returns None, so it is not wrapped in print().
reg_model.summary()

# Train on MAE and track MSE; `metrics` is passed as a list because newer
# Keras versions reject a bare string.
reg_model.compile(optimizer='RMSProp',
              loss='mae',
              metrics=['mse']
)
# callbacks_list is already a list of callbacks, so it is passed directly
# rather than nested inside another list.
# NOTE(review): the held-out test split doubles as validation data for early
# stopping and checkpointing, so the reported validation scores are optimistic.
history = reg_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=epochs,
                 callbacks=callbacks_list, batch_size=512)

In [None]:
# Print the best validation MSE and plot the recorded training curves.
plotHistory(history)

In [None]:
# XGBoost predictions mapped back to the original Amount scale.
# predict() returns a 1-D vector, whereas scaler_y was fitted on a
# single-column 2-D target and rejects 1-D input, so reshape to
# (n_samples, 1) before undoing the scaling.
scaler_y.inverse_transform(xgb.predict(x_test).reshape(-1, 1))

In [None]:
# Rebind df_test to 'Condition_out.csv'; this replaces the frame loaded from
# 'test.csv' earlier. The cells below read a Condition column from this file.
df_test = pd.read_csv('Condition_out.csv')

In [None]:
# Display the freshly loaded test frame.
df_test

In [None]:
# Lower-triangle correlation heatmap of the test frame.
# Only numeric/bool columns are correlated: pandas >= 2.0 raises on string
# columns instead of silently dropping them.
corr = df_test.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed in NumPy 1.24; the builtin `bool` is the supported dtype.
mask = np.zeros_like(corr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, annot=True)
plt.show()

In [None]:
# Keep the identifier, the two model inputs and the Condition column.
test_columns = ['Image_path', 'Insurance_company', 'Min_coverage', 'Condition']
df_test = df_test.loc[:, test_columns]
df_test

In [None]:
# One-hot encode Insurance_company and append the indicator columns to df_test.
df_dum = pd.get_dummies(df_test['Insurance_company'])
df_test = pd.concat((df_test, df_dum), axis='columns')
df_test

In [None]:
# Build the test design matrix from Min_coverage plus the company indicators.
X_test = df_test.drop(['Image_path','Insurance_company','Condition'],axis=1)
# Align the columns with the training design matrix X: get_dummies only
# creates indicators for companies present in this file, so a company missing
# from (or unique to) the test data would otherwise give scaler_x a different
# column set than it was fitted on. Missing indicators are filled with 0.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
X_test

In [None]:
# Apply the feature scaler fitted on the training features.
# NOTE(review): this rebinds X_test to its scaled version, so re-running the
# cell on its own would scale the data a second time.
X_test = scaler_x.transform(X_test)

In [None]:
# Predict with the neural network, then map the outputs back through scaler_y.
scaled_predictions = reg_model.predict(X_test)
predicted = scaler_y.inverse_transform(scaled_predictions)

In [None]:
# Store the model's predictions as the Amount column of the test frame.
df_test['Amount'] = predicted

In [None]:
# Display the test frame with the predicted Amount column.
df_test

In [None]:
## The Amount is zero for non-damaged vehicles, so overwrite their predictions with 0.
# A single .loc assignment replaces the row-by-row loop. The chained form
# df_test['Amount'][i] = 0 raises SettingWithCopyWarning and, with pandas
# Copy-on-Write, leaves df_test unchanged.
df_test.loc[df_test['Condition'] == 0, 'Amount'] = 0
        

In [None]:
# Display the test frame after zeroing Amount for the Condition == 0 rows.
df_test

In [None]:
# Submission frame: identifier, Condition and the final Amount.
submission_columns = ['Image_path', 'Condition', 'Amount']
sub = df_test.loc[:, submission_columns]
sub

In [None]:
# Write the submission with a header row and without the index column.
sub.to_csv('Submission.csv', index=False, header=True)