Disclaimer: I used images from the Stanford Cars Dataset to help train the classification model; without them, the dataset would be far too imbalanced to train on.

# **Import Libraries**

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import pandas as pd
import time
import glob
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import os
import cv2
import shutil
import seaborn as sns
import xgboost
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce
from sklearn.metrics import mean_absolute_error

# **Dataframes**

In [None]:
# Read the competition CSVs, then hold out 5% of the labelled rows for validation.
csv_dir = '../input/fast-furious-and-insured/Fast_Furious_Insured'
labelled_df = pd.read_csv(f'{csv_dir}/train.csv')
test_df = pd.read_csv(f'{csv_dir}/test.csv')
train_df, valid_df = train_test_split(labelled_df, train_size=0.95, random_state=0)

In [None]:
# Preview the first rows of the training split as loaded from the CSV.
train_df.head()

In [None]:
# Use generic column names: image path -> file_path, damage flag -> labels.
column_aliases = {'Image_path': 'file_path', 'Condition': 'labels'}
train_df = train_df.rename(columns=column_aliases)

In [None]:
# Apply the same column renaming to the validation and test frames.
column_aliases = {'Image_path': 'file_path', 'Condition': 'labels'}
valid_df, test_df = (
    frame.rename(columns=column_aliases) for frame in (valid_df, test_df)
)

In [None]:
# Directories that hold the competition images (trailing slash kept so they
# can be prepended directly to a file name).
image_root = '../input/fast-furious-and-insured/Fast_Furious_Insured/'
train_path = image_root + 'trainImages/'
test_path = image_root + 'testImages/'

In [None]:
# Prefix each training file name with its directory to get a usable path.
# Running this cell twice would prepend the directory twice.
train_df['file_path'] = train_path + train_df['file_path']

In [None]:
# Check that file_path now carries the image directory prefix.
train_df.head()

In [None]:
# Validation rows come from the training CSV, so their images live in the
# training image directory; test rows use the test image directory.
valid_df['file_path'] = train_path + valid_df['file_path']
test_df['file_path'] = test_path + test_df['file_path']

# **Balancing Classes in Train and Valid Dataframes**

In [None]:
# Class distribution of the training split.
train_df.labels.value_counts()

In [None]:
# Class distribution of the validation split.
valid_df.labels.value_counts()

The dataset is extremely imbalanced: damaged cars (label 1) far outnumber undamaged ones (label 0). To combat this, we add car images from the Stanford Cars Dataset, labelled as undamaged (0).

In [None]:
# Number of extra label-0 images each split needs so that label 0 matches
# label 1 in size. Derived from the data instead of hard-coded counts, so it
# stays correct if the CSV or the train/validation split changes.
# Clamped at 0 in case label 0 is not the minority class.
train_label_counts = train_df.labels.value_counts()
valid_label_counts = valid_df.labels.value_counts()
train_class_diff = max(int(train_label_counts.get(1, 0) - train_label_counts.get(0, 0)), 0)
valid_class_diff = max(int(valid_label_counts.get(1, 0) - valid_label_counts.get(0, 0)), 0)

In [None]:
# Collect every Stanford Cars training image and give it label 0.
# glob returns matches in arbitrary order, so sort the listing: any seeded
# sampling done on it afterwards is then reproducible across machines.
file_path = sorted(glob.glob('../input/stanford-cars-dataset/cars_train/cars_train/*.jpg'))
labels = [0] * len(file_path)
print ('file_path: ', len(file_path), '   labels: ', len(labels))

In [None]:
# Build a two-column frame (file_path, labels) for the Stanford images.
# Concatenating the two Series already yields the desired DataFrame; the
# former round-trip through a NumPy array hard-coded the row count (8144) and
# turned the integer labels into generic objects.
Fseries = pd.Series(file_path, name='file_path')
Lseries = pd.Series(labels, name='labels')
stanford_df = pd.concat([Fseries, Lseries], axis=1)
print(stanford_df['labels'].value_counts())

In [None]:
# Draw (with a fixed seed) exactly as many Stanford images as the training
# and validation splits need in total, then preview them.
n_balance_images = train_class_diff + valid_class_diff
balance_images = stanford_df.sample(n=n_balance_images, random_state=0)
balance_images.head()

In [None]:
# Split the sampled images: the first `train_class_diff` rows go to training,
# the following `valid_class_diff` rows to validation.
# Positional slicing from the front avoids the `[-0:]` pitfall, which would
# return the whole frame when no validation images are needed.
train_balance = balance_images.iloc[:train_class_diff]
valid_balance = balance_images.iloc[train_class_diff:train_class_diff + valid_class_diff]
# Report row counts (`.size` would report rows x columns).
print(len(train_balance), len(valid_balance))

In [None]:
# Append the extra label-0 rows to the training split. Columns that exist only
# in train_df are left missing for the appended rows.
train_parts = [train_df, train_balance]
balanced_class_train_df = pd.concat(train_parts)

In [None]:
# Renumber rows 0..n-1 and discard the old index, so rows can later be looked
# up with .loc using a random integer.
balanced_class_train_df = balanced_class_train_df.reset_index(drop=True)

In [None]:
# First rows of the balanced training frame.
balanced_class_train_df.head()

In [None]:
# Last rows of the balanced training frame (the appended rows end up here).
balanced_class_train_df.tail()

In [None]:
# Append the extra label-0 rows to the validation split. Columns that exist
# only in valid_df are left missing for the appended rows.
valid_parts = [valid_df, valid_balance]
balanced_class_valid_df = pd.concat(valid_parts)

In [None]:
# Renumber rows 0..n-1 and discard the old index after the concatenation.
balanced_class_valid_df = balanced_class_valid_df.reset_index(drop=True)

In [None]:
# First rows of the balanced validation frame.
balanced_class_valid_df.head()

In [None]:
# Integer label -> human-readable class name (position in the tuple is the label).
class_dict = dict(enumerate(('Undamaged', 'Damaged')))

In [None]:
# Replace the integer labels with class names. Run once only: values that are
# not keys of class_dict (such as already-mapped names) become NaN.
balanced_class_train_df['labels'] = balanced_class_train_df['labels'].map(class_dict)

In [None]:
# Check that the labels column now holds class names.
balanced_class_train_df.head()

In [None]:
# Same integer -> class-name conversion for the validation frame, then preview it.
balanced_class_valid_df['labels'] = balanced_class_valid_df['labels'].map(class_dict)
balanced_class_valid_df.head()

In [None]:
# Show the per-class counts of both balanced frames.
train_label_totals = balanced_class_train_df.labels.value_counts()
valid_label_totals = balanced_class_valid_df.labels.value_counts()
print(train_label_totals, valid_label_totals)

# **Visualize Images**

In [None]:
# Show a 4x5 grid of randomly chosen training images titled with their class.
plt.figure(figsize=(14, 10))
for i in range(20):
    # randint's upper bound is exclusive; starting at 0 makes every row
    # (including the first) eligible.
    row = np.random.randint(0, len(balanced_class_train_df))
    plt.subplot(4, 5, i + 1)
    # OpenCV loads images in BGR channel order while matplotlib expects RGB,
    # so convert before displaying to get true colours.
    image_bgr = cv2.imread(balanced_class_train_df.loc[row, "file_path"])
    plt.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    plt.title(balanced_class_train_df.loc[row, "labels"], size=10, color="black")
    plt.xticks([])
    plt.yticks([])

plt.show()

# **Image Data Generator**

In [None]:
# Images are resized to a square of this side length; batches hold 64 images.
image_side = 299
target_size = (image_side, image_side)
batch_size = 64

In [None]:
# Both generators apply the InceptionResNetV2 input preprocessing; only the
# training generator augments (horizontal flips and up to 10% zoom).
preprocess_input = tf.keras.applications.inception_resnet_v2.preprocess_input
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, horizontal_flip=True, zoom_range=0.1)
test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Fix the class order explicitly so the binary target keeps the CSV encoding
# (0 = Undamaged, 1 = Damaged). Left to infer the classes, the generator
# orders the names alphanumerically, which would encode Damaged as 0.
class_names = ['Undamaged', 'Damaged']
flow_kwargs = dict(
    x_col='file_path',
    y_col='labels',
    classes=class_names,
    target_size=target_size,
    batch_size=batch_size,
    color_mode='rgb',
    class_mode='binary',
)
train_gen = train_datagen.flow_from_dataframe(balanced_class_train_df, **flow_kwargs)
# Validation batches keep the dataframe's row order so evaluation is
# deterministic and predictions line up with rows.
valid_gen = test_datagen.flow_from_dataframe(balanced_class_valid_df, shuffle=False, **flow_kwargs)

# **Classification Model**

In [None]:
# InceptionResNetV2 backbone without its classification head, taking
# 299x299 RGB inputs.
base_model = tf.keras.applications.InceptionResNetV2(
    input_shape=(299, 299, 3),
    include_top=False,
)

In [None]:
# Binary classifier: backbone -> global average pooling -> small dense head
# -> single sigmoid unit.
classification_model = tf.keras.Sequential()
head_layers = (
    base_model,
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
)
for layer in head_layers:
    classification_model.add(layer)

In [None]:
# Compile for binary classification with Adam at the initial learning rate.
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras versions.
lr = 0.001
classification_model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=lr), metrics=['accuracy'])

# **Callbacks for Classification Model**

In [None]:
# Callback settings: epochs without improvement before the learning rate is
# reduced (patience) or training stops (stop_patience), and the factor the
# learning rate is multiplied by on each reduction.
patience = 1
stop_patience = 3
factor = 0.5

# Keep only the best model seen so far on disk.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint("classify_model.h5", save_best_only=True, verbose=0)
# Stop once val_loss has not improved for `stop_patience` epochs.
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=stop_patience, verbose=1)
# Multiply the learning rate by `factor` when val_loss stalls for `patience` epochs.
reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=factor, patience=patience, verbose=1)

callbacks = [checkpoint_cb, early_stop_cb, reduce_lr_cb]

# **Classification Model Training**

In [None]:
# Train for at most 30 epochs; the callbacks may stop training earlier.
epochs = 30
history = classification_model.fit(
    train_gen,
    epochs=epochs,
    validation_data=valid_gen,
    callbacks=callbacks,
    verbose=1,
)

# **Model Training Graphs**

In [None]:
def plot_training_curves(history, metric, title, ylabel, xlabel):
    """Plot one metric's training and validation curves in its own figure.

    Args:
        history: object returned by ``model.fit``; its ``history`` dict must
            contain ``metric`` and ``'val_' + metric``.
        metric: name of the training metric, e.g. ``'loss'``.
        title: figure title.
        ylabel: y-axis label.
        xlabel: x-axis label.
    """
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    # The legend entries are given explicitly, in plotting order.
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()


plot_training_curves(history, 'loss', 'Loss for Training', 'Loss', 'No. epoch')
plot_training_curves(history, 'accuracy', 'model accuracy', 'accuracy', 'epoch')

# **Best Classification Model Predictions on Valid Set**

In [None]:
# Note: this is a second name for the same model object, not a copy, so
# loading the checkpoint also overwrites classification_model's weights.
best_class_model = classification_model
checkpoint_file = './classify_model.h5'
best_class_model.load_weights(checkpoint_file)
# Loss and accuracy of the restored weights on the validation generator.
best_class_model.evaluate(valid_gen)

Pretty decent accuracy. 

# **XG Boost Model**

Dataframe Preprocessing

In [None]:
# Back to the original (un-augmented) training split, which carries the
# tabular columns used for regression.
train_df.head()

We need to encode the `Insurance_company` labels as integers, since the model can't take strings as input. I took this part from [this notebook.](https://www.kaggle.com/sohommajumder21/resnet50v2-and-randomforestregresion)

In [None]:
# Fit ONE encoder on the companies from both frames so a given company gets
# the same integer code in train and test. Two independently fitted encoders
# only agree when both frames contain exactly the same set of companies.
all_companies = pd.concat([train_df['Insurance_company'], test_df['Insurance_company']])
company_encoder = LabelEncoder().fit(all_companies)
train_df['Insurance_company'] = company_encoder.transform(train_df['Insurance_company'])
test_df['Insurance_company'] = company_encoder.transform(test_df['Insurance_company'])

In [None]:
# Check that Insurance_company is now integer-encoded.
train_df.head()

In [None]:
def _expand_expiry_date(df):
    """Return a copy of ``df`` with ``Expiry_date`` split into date parts.

    The ``Expiry_date`` column is parsed as datetimes, numeric ``month``,
    ``day`` and ``year`` columns are appended (in that order), and the
    original ``Expiry_date`` column is dropped. The input frame is not
    modified.
    """
    expiry = pd.to_datetime(df['Expiry_date'])
    return df.assign(
        month=expiry.dt.month,
        day=expiry.dt.day,
        year=expiry.dt.year,
    # `columns=` instead of a positional axis: pandas >= 2.0 no longer
    # accepts `drop(labels, 1)`.
    ).drop(columns=['Expiry_date'])


train_df = _expand_expiry_date(train_df)
test_df = _expand_expiry_date(test_df)

In [None]:
# Check the new month/day/year columns and that Expiry_date is gone.
train_df.head()

Finding important features

In [None]:
# Candidate numeric predictors to inspect.
candidate_columns = [
    'Cost_of_vehicle',
    'Insurance_company',
    'Min_coverage',
    'Max_coverage',
    'month',
    'day',
    'year',
]
features = train_df[candidate_columns]

In [None]:
# Plot pairwise correlations, including the `Amount` regression target,
# instead of the raw feature values: a heatmap of the raw rows says nothing
# about which features relate to the target.
correlations = features.join(train_df['Amount']).corr()
ax = sns.heatmap(correlations, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)

In [None]:
# Regression inputs (three numeric columns) and target (claim Amount).
predictor_columns = ['Cost_of_vehicle', 'Max_coverage', 'Min_coverage']
X = train_df[predictor_columns]
y = train_df['Amount']

In [None]:
# Hold out 5% of the rows (fixed seed) to validate the regressor.
regression_split = train_test_split(X, y, train_size=0.95, random_state=0)
X_train, X_valid, y_train, y_valid = regression_split

# **Training XGBoost Model**

In [None]:
# Gradient-boosted regressor: up to 1000 trees with a learning rate of 0.001.
# NOTE(review): with so small a learning rate, 1000 trees may not be enough
# for the model to converge — worth checking against a larger rate.
regression_model = XGBRegressor(n_estimators=1000, learning_rate=0.001)
# Fit on the training rows, evaluating on the held-out rows; early stopping
# is requested with a window of 5 rounds.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` as a
# constructor argument rather than a fit() argument — confirm against the
# installed version.
regression_model.fit(X_train, y_train, early_stopping_rounds=5, eval_set=[(X_valid, y_valid)], verbose=False)

In [None]:
# Score the regressor on the held-out rows with mean absolute error.
predictions = regression_model.predict(X_valid)
validation_mae = mean_absolute_error(y_valid, predictions)
print(f"Mean Absolute Error: {validation_mae}")

First time using machine learning in a project, so bad results as expected :)