In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Task 1 - Importing libraries and dataset 

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Global plotting defaults used by every figure in the notebook.
sns.set_style('whitegrid')
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8-*' and the
# old names were later removed, so apply whichever spelling this installation provides.
for _style_name in ('seaborn-v0_8-deep', 'seaborn-deep'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.monospace': 'Ubuntu Mono',   # was misspelled 'Ubunto Mono'
    'font.size': 10,
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 14,
    'figure.figsize': (16, 10),
})

import warnings
# Silence every warning category and configure how pandas behaves/renders in cell outputs.
warnings.simplefilter('ignore')
for _option, _value in (
    ('mode.chained_assignment', None),            # no SettingWithCopy warnings
    ('display.float_format', '{:.2f}'.format),    # two decimals for floats
    ('display.max_columns', 200),
    ('display.width', 400),
):
    pd.set_option(_option, _value)

In [None]:
df = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/train.csv')
df

In [None]:
df_test = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/test.csv')
df_test

# Task 2 - Exploratory Data Analysis (EDA) 

## Dealing with classification subtask

In [None]:
## Let's have a look at some of the damaged vehicles

# Directory holding the training images; `path` is reused by later cells.
path = '../input/fast-furious-and-insured/Fast_Furious_Insured/trainImages/'

# Show the first ten images labelled Condition == 1 in a 2 x 5 grid, without axis ticks.
damaged_names = df.loc[df['Condition'] == 1, 'Image_path'].head(10)
for slot, image_name in enumerate(damaged_names, start=1):
    plt.subplot(2, 5, slot)
    plt.imshow(plt.imread(path + str(image_name)))
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
## Let's have a look at some of the Non-damaged vehicles

# Directory holding the training images; `path` is reused by later cells.
path = '../input/fast-furious-and-insured/Fast_Furious_Insured/trainImages/'

# Show the first ten images labelled Condition == 0 in a 2 x 5 grid, without axis ticks.
undamaged_names = df.loc[df['Condition'] == 0, 'Image_path'].head(10)
for slot, image_name in enumerate(undamaged_names, start=1):
    plt.subplot(2, 5, slot)
    plt.imshow(plt.imread(path + str(image_name)))
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
df['Condition'].value_counts()

From the above cell, we can see that there is an imbalance between the instances of label 0 and label 1. To counter this problem, we will first augment the images labelled 0 into a separate directory so that both classes end up with a similar number of examples.

In [None]:
# Working directories for the image pipeline. `exist_ok=True` keeps the cell
# re-runnable: without it a second execution raises FileExistsError.
os.makedirs('./data/augmented_class_0', exist_ok=True)  ## Directory that will contain the augmented images of label 0
os.makedirs('./data/class_1', exist_ok=True)            ## Directory that will contain the original images of label 1
os.makedirs('./test/test_data', exist_ok=True)          ## Directory that will contain the given test images

In [None]:
## Saving the images labelled 1 in new directory as mentioned above

import cv2
# Read every training image labelled Condition == 1 with OpenCV and write it,
# under the same file name, into ./data/class_1/.
damaged_image_names = df.loc[df['Condition'] == 1, 'Image_path']
for image_name in damaged_image_names.astype(str):
    cv2.imwrite('./data/class_1/' + image_name, cv2.imread(path + image_name))

In [None]:
## Saving the test images in new directory as mentioned above

# Source directory of the competition's test images.
test = '../input/fast-furious-and-insured/Fast_Furious_Insured/testImages/'

# Read each test image listed in df_test with OpenCV and write it, under the
# same file name, into ./test/test_data/.
for image_name in df_test['Image_path'].astype(str):
    cv2.imwrite('./test/test_data/' + image_name, cv2.imread(test + image_name))

In [None]:
## Preparing to equalize the images between class 0 and class 1 by using data augmentation

import keras
from keras.preprocessing.image import ImageDataGenerator,img_to_array,load_img

# Number of augmented copies written for every Condition == 0 image.
AUGMENTATIONS_PER_IMAGE = 14

# Random geometric / colour perturbations used to synthesise extra class-0 images.
# The original also passed `featurewise_std_normalization=0.3`; that option is an
# on/off switch whose statistics come from `datagen.fit(...)`, which this cell never
# calls, so it is omitted here.
datagen = ImageDataGenerator(
    rotation_range = 20,
    shear_range = 0.15,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    zoom_range = 0.2,
    channel_shift_range = 0.3,
    fill_mode = 'nearest',
    horizontal_flip = True,
    vertical_flip = True
)

for j, im in enumerate(df[df['Condition'] == 0]['Image_path']):
    # No `None` check here: Keras' load_img is documented to return a PIL image
    # and raises on an unreadable file, so the old `img == None` test never fired.
    img = load_img(path + str(im))
    x = img_to_array(img)
    x = x.reshape((1,) + x.shape)          # add a leading batch axis of size 1
    augmented = datagen.flow(x, batch_size=1, shuffle=True,
                             save_to_dir='./data/augmented_class_0/',
                             save_prefix=str(j), save_format='jpg')
    # The iterator is endless; each `next` writes one augmented image to disk.
    for _ in range(AUGMENTATIONS_PER_IMAGE):
        next(augmented)


In [None]:
train_path = './data/'
test_path = './test/'
import keras
from keras.preprocessing.image import ImageDataGenerator,img_to_array,load_img

# Training generator: rescales pixels to [0, 1] and applies random augmentation.
datagentrain = ImageDataGenerator(
        rescale = 1/255.0,
        rotation_range=45,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.15,
        horizontal_flip=True,
        vertical_flip = True,
        fill_mode='nearest',
        validation_split = 0.25
)
# Validation generator: same rescaling and the same `validation_split` (so that
# subset="validation" picks the files held out from training) but NO random
# augmentation, so validation metrics are measured on unperturbed images.
datagenval = ImageDataGenerator(rescale=1/255.0, validation_split=0.25)
# Test generator: rescaling only.
datagentest = ImageDataGenerator(rescale=1/255.0)

# NOTE(review): ./data/augmented_class_0 holds several augmented copies of each source
# image, so copies of one original can land on both sides of the split — confirm
# whether a split by source image is needed.
train_set = datagentrain.flow_from_directory(directory=train_path,
                                             target_size=(224, 224),
                                             batch_size=96,
                                             class_mode='binary',
                                             subset="training", shuffle=True)

validation_set = datagenval.flow_from_directory(directory=train_path,
                                                target_size=(224, 224),
                                                batch_size=32,
                                                class_mode='binary',
                                                subset="validation", shuffle=True)

# class_mode=None yields images only; shuffle=False keeps a fixed file order.
test_set = datagentest.flow_from_directory(directory=test_path, target_size=(224, 224),
                                           batch_size=1, class_mode=None,
                                           shuffle=False)

Now the data has been set up, as you can see from the output of the above cell.

## Using transfer learning to classify the images

In [None]:
from keras import Sequential
from keras.layers import Input,Flatten,Dense,Activation,Dropout,BatchNormalization,GlobalAveragePooling2D
import tensorflow as tf
from keras.models import Model
import tensorflow_hub as hub

### Loading resnet_50 pretrained model based on Resnet V1 50 architecture published by tensorflow

In [None]:
# Alternative backbone (MobileNet V2 classifier), kept commented out for reference:
#classifier = tf.keras.Sequential([
#    hub.KerasLayer("https://tfhub.dev/google/tf2-preview/mobilenet_v2/classification/4", input_shape=(224,224,3))
#])
# Wrap the TensorFlow Hub ResNet-50 classification model as a Keras model that takes
# 224x224 RGB inputs.
# NOTE(review): `classifier` is not referenced again in the visible notebook; the model
# trained below is built from the separate feature-vector handle — confirm this
# download is still needed.
classifier = tf.keras.Sequential([
    hub.KerasLayer("https://tfhub.dev/tensorflow/resnet_50/classification/1",input_shape=(224,224,3))
])

### Model definition and summary

In [None]:
# TF-Hub handle of the ResNet-50 feature extractor (no classification head).
feature_extractor_model = "https://tfhub.dev/tensorflow/resnet_50/feature_vector/1"
#feature_extractor_model = "https://tfhub.dev/google/imagenet/inception_v3/feature_vector/5"

# Frozen backbone: trainable=False, so only the small head added below is trained.
pretrained_model_without_top_layer = hub.KerasLayer(
    feature_extractor_model, trainable=False, input_shape=(224, 224, 3))
num_class = 2

# Backbone -> dropout -> 8-unit ReLU layer -> single sigmoid output.
classification_model = tf.keras.Sequential()
classification_model.add(pretrained_model_without_top_layer)
classification_model.add(Dropout(0.2))
classification_model.add(Dense(8, activation='relu'))
classification_model.add(Dense(1, activation='sigmoid'))

classification_model.summary()

In [None]:
# Compile with an explicitly named AUC metric: the auto-generated name gets a numeric
# suffix ('auc_1', ...) when this cell is re-run, which would break the
# h.history['auc'] lookups in the plotting cell.
classification_model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')])

# `fit` accepts generators directly; `fit_generator` is deprecated.
# restore_best_weights=True makes the model hold its best-val_loss weights when
# early stopping triggers, so 'model.h5' below is not simply the last epoch.
h = classification_model.fit(
    train_set,
    validation_data=validation_set,
    epochs=80,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='auto',
                                         restore_best_weights=True),
        # Writes a new file each time val_loss improves; the value is part of the name.
        tf.keras.callbacks.ModelCheckpoint('./classification_model_{val_loss:.3f}.h5',
                                           save_best_only=True, save_weights_only=False,
                                           monitor='val_loss'),
    ],
)
classification_model.save('model.h5')
## Author's note: if the checkpoint fails with an error mentioning the 'val_loss' file path, re-running the cell was the workaround.

In [None]:
## Model auc score and loss visualization
# Per-epoch curves recorded by the classifier's training run.
train_auc = h.history['auc']
val_auc = h.history['val_auc']
train_loss = h.history['loss']
val_loss = h.history['val_loss']

epoch_axis = range(len(train_auc))  # one point per completed epoch

# The metric is AUC measured on the validation subset, so label it as such
# (the curves were previously labelled "Accuracy" / "Testing").
fig_auc, ax_auc = plt.subplots(figsize=(8, 5))
ax_auc.plot(epoch_axis, train_auc, 'r', label='Training AUC score')
ax_auc.plot(epoch_axis, val_auc, 'g', label='Validation AUC score')
ax_auc.set_xlabel('No. of epochs')
ax_auc.set_ylabel('AUC score')
ax_auc.legend()

fig_loss, ax_loss = plt.subplots(figsize=(8, 5))
ax_loss.plot(epoch_axis, train_loss, 'r', label='Training Loss')
ax_loss.plot(epoch_axis, val_loss, 'g', label='Validation Loss')
ax_loss.set_xlabel('No. of epochs')
ax_loss.set_ylabel('Loss score')
ax_loss.legend()
plt.show()

In [None]:
classification_model.evaluate(validation_set)

Now that we are done with the classification task, we will use this classification_model to predict the Condition of the given test set, and then predict the Amount using regression_model.

## Dealing with regression task

Here is the plan.
We know that the Amount is zero for every vehicle image whose Condition is labelled zero (i.e. the vehicle is not damaged), so we will fit regression_model ONLY on the data of the positively labelled class and predict the Amount for those vehicles only. The Amount for the negatively labelled class will simply be zero. We will therefore separate the rows labelled one from the combined data. Before moving to that part, let's explore the whole dataset.

In [None]:
df_test = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/test.csv')
df = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/train.csv')

In [None]:
# Correlation heatmap of the numeric training columns (lower triangle only).
# Selecting numeric/bool columns explicitly keeps this working on pandas versions
# where DataFrame.corr() no longer drops string columns silently.
corr = df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed from NumPy; the builtin `bool` is the supported dtype.
# The mask hides the upper triangle (and diagonal), which is redundant.
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

From the above correlation plot, we can see that the feature pairs (Cost_of_vehicle, Min_coverage) and (Max_coverage, Condition) are highly correlated, so when preparing data for regression_model we will select only one feature from each pair.

In [None]:
## Showing which feature got missing values
sns.heatmap(df.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.nunique()

#### Feature "Insurance_company"

In [None]:
df['Insurance_company'].value_counts()

In [None]:
# Bar chart of the number of training rows per insurance company.
company_counts = df['Insurance_company'].value_counts()
x = company_counts.index.tolist()
y = company_counts.to_numpy()
sns.barplot(x=x, y=y)


### Feature "Cost_of_vehicle"

In [None]:
df['Cost_of_vehicle'].plot(kind='kde')

Roughly normally distributed, to some extent.

### Feature "Min_coverage"

In [None]:
df['Min_coverage'].plot(kind='kde')

From the correlation plot, we concluded that the features Cost_of_vehicle and Min_coverage have a correlation of 1; the two KDE plots above have the same shape, which is consistent with that.

### Feature "Max_coverage"

In [None]:
df['Max_coverage'].plot(kind='kde')

Fairly normally distributed.

In [None]:
## scatterplot between feature 'Max_coverage' ,and feature 'Amount' with label of feature 'Condition'
sns.scatterplot(x ='Max_coverage' ,y='Amount',hue='Condition',data=df)

As mentioned earlier, the Amount for rows whose Condition is zero is also zero, and the scatterplot shows this as well. For damaged vehicles, some Amount values lie clearly outside the main cluster of points, so they will be treated as outliers. We will analyse the data of damaged vehicles separately.

In [None]:
df['Condition'].value_counts()

In [None]:
sns.boxplot(y='Amount',x='Condition',data=df)

In [None]:
df['Amount'].plot(kind='box')

In [None]:
## Removing Outliers

# Drop rows whose Amount exceeds 12000 (the outliers seen in the box plot above).
# Filtering with a boolean mask avoids passing positional indices to a label-based
# drop, and makes the cell safe to re-run. Rows with a missing Amount compare as
# False and are therefore kept, as before.
is_outlier = df['Amount'] > 12000
print('Outliers removed:', int(is_outlier.sum()))
df = df.loc[~is_outlier].reset_index(drop=True)

In [None]:
df['Amount'].plot(kind='box')

In [None]:
df[df['Amount']<0]

In [None]:
# Remove rows with a negative Amount (shown by the previous cell). Selecting them
# by condition instead of the hard-coded row label 641 keeps the cell correct if the
# data or earlier filtering changes, and lets it be re-run without a KeyError.
# Rows with a missing Amount are kept; the index is deliberately left un-renumbered.
df = df.loc[~(df['Amount'] < 0)]

In [None]:
sns.pairplot(df,diag_kind='kde',hue='Condition')

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df[df['Amount'] != 0].isnull().sum()

From here on we work only with the data of damaged vehicles and fit a regression_model to predict the Amount for them.

In [None]:
## Separating damaged vehicles data



# Rows for damaged vehicles only (Condition == 1), renumbered from 0.
target_df = df.loc[df['Condition'] == 1].reset_index(drop=True)
target_df

In [None]:
target_df.info()

In [None]:
target_df.describe()

In [None]:
## Visulaizing the missing values

sns.heatmap(target_df.isnull(),cbar=False,yticklabels=False,cmap = 'viridis')

In [None]:
sns.pairplot(target_df)

In [None]:
# Correlation heatmap of the numeric columns of the damaged-vehicle subset.
# Numeric/bool columns are selected explicitly because DataFrame.corr() raises on
# string columns in recent pandas versions.
corr = target_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed from NumPy; use the builtin `bool`. The mask hides the
# redundant upper triangle (and diagonal).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

Now the features (Cost_of_vehicle, Min_coverage, Max_coverage) are highly correlated with each other (correlation of 1), so in the end we will choose only one of them.

In [None]:
# Side-by-side KDE plots of the three cost/coverage features for damaged vehicles.
_, (ax0, ax1, ax2) = plt.subplots(1, 3)
for _axis, _column in zip((ax0, ax1, ax2),
                          ('Cost_of_vehicle', 'Min_coverage', 'Max_coverage')):
    target_df[_column].plot(kind='kde', ax=_axis)
    _axis.set_xlabel(_column)
plt.show()

The three density plots above have the same shape, which is consistent with the correlation of 1 between these three features seen in the heatmap.

In [None]:
# Bar chart of the number of damaged-vehicle rows per insurance company.
company_counts = target_df['Insurance_company'].value_counts()
x = company_counts.index.tolist()
y = company_counts.to_numpy()
sns.barplot(x=x, y=y)


# Task 3 - Data Preparation for model evaluation

In [None]:
## Filling the missing values using mean
# Replace missing values in each numeric column with that column's mean.
# Assigning the result back (instead of `inplace=True` on a column selection)
# guarantees the frame is actually updated; the chained in-place form operates on
# a temporary under pandas' copy-on-write behaviour.
# NOTE(review): 'Amount' is the regression target — confirm that imputing it,
# rather than dropping those rows, is intended.
for _col in ['Amount', 'Cost_of_vehicle', 'Min_coverage', 'Max_coverage']:
    target_df[_col] = target_df[_col].fillna(target_df[_col].mean())

In [None]:
## Seleting the required features only

# Keep only the one coverage feature, the target and the categorical company column.
cols = ['Min_coverage','Amount','Insurance_company']
target_df = target_df.loc[:, cols]

In [None]:
target_df

In [None]:
target_df.info()

We don't know whether Insurance_company is ordinal or not, so we will treat it as nominal data and use its one-hot encoding (dummy variables).

In [None]:
# One indicator column per insurance company, appended to the right of the frame.
df_dum = pd.get_dummies(target_df['Insurance_company'])
target_df = pd.concat((target_df, df_dum), axis='columns')
target_df

In [None]:
# Correlation heatmap after one-hot encoding. Numeric and boolean columns are
# selected explicitly: the raw 'Insurance_company' strings would make
# DataFrame.corr() raise on recent pandas, and dummy columns may be bool-typed.
corr = target_df.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(16,10))
# `np.bool` was removed from NumPy; use the builtin `bool`. The mask hides the
# redundant upper triangle (and diagonal).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

In [None]:
target_df.info()

In [None]:
plt.figure()
ax0 = plt.subplot(1,2,1)
ax1 = plt.subplot(1,2,2)
target_df['Min_coverage'].plot(kind='box',ax=ax0)
target_df['Amount'].plot(kind='box',ax=ax1)


In [None]:
target_df.isnull().sum()

In [None]:
target_df.describe()

In [None]:
# Drop the raw categorical column now that its dummy columns exist. Re-assigning
# with errors='ignore' (instead of an in-place drop) lets the cell be re-run
# without a KeyError once the column is already gone.
target_df = target_df.drop(columns=['Insurance_company'], errors='ignore')

# Task 4 - Data modelling 

In [None]:
# Target as a one-column frame; features are every remaining column.
Y = target_df[['Amount']]
X = target_df.drop(columns=['Amount'])

In [None]:
## Scaling the data using StandardScaler
from sklearn.preprocessing import StandardScaler
# Separate standardisers for features and target; both are reused later to
# transform the test features and to map predictions back to the Amount scale.
# NOTE(review): the scalers are fitted on all rows before the train/test split
# in the next cell — confirm whether fitting on the training part only is wanted.
scaler_x, scaler_y = StandardScaler(), StandardScaler()
X_scaled = scaler_x.fit(X).transform(X)
Y_scaled = scaler_y.fit(Y).transform(Y)


In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the scaled data with a fixed seed, then report the shapes.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, Y_scaled, train_size=0.8, random_state=42)
for _features, _targets in ((x_train, y_train), (x_test, y_test)):
    print(_features.shape, _targets.shape)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score
# Ordinary least squares baseline; prints R2 on the train split, then the test split.
lr = LinearRegression()
lr.fit(x_train, y_train)
y_train_pred, y_test_pred = (lr.predict(_part) for _part in (x_train, x_test))
for _actual, _fitted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(y_true=_actual, y_pred=_fitted))

### Lasso Regression

In [None]:
from sklearn.linear_model import LassoCV
# Lasso with the regularisation strength chosen by 5-fold cross-validation
# from the listed candidates; prints R2 on train, then test.
candidate_alphas = [0.0001,0.0005,0.001,0.005,0.01,0.1,1.0,10]
lasso_model = LassoCV(alphas=candidate_alphas, cv=5)
lasso_model.fit(x_train, y_train)
y_train_pred, y_test_pred = (lasso_model.predict(_part) for _part in (x_train, x_test))
for _actual, _fitted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(y_true=_actual, y_pred=_fitted))

### Extra Tree Regressor

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extremely randomised trees. The split criterion is left at its default, which is
# mean squared error; the explicit spelling 'mse' was deprecated and then removed
# from scikit-learn, so passing it fails on current versions.
extra_model = ExtraTreesRegressor(random_state=0, n_jobs=-1,
                                  min_samples_leaf=1, max_depth=8,
                                  min_samples_split=3, n_estimators=1000)

# y_train is a column vector; flatten it to the 1-D shape the estimator expects.
extra_model.fit(x_train, y_train.ravel())

# predict and report R2 on train, then test
y_train_pred = extra_model.predict(x_train)
y_test_pred = extra_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with default hyper-parameters. A fixed random_state (the same
# seed the other ensemble cells use) makes the reported scores reproducible.
rf = RandomForestRegressor(random_state=0)
# y_train is a column vector; flatten it to the 1-D shape the estimator expects.
rf.fit(x_train, y_train.ravel())
y_train_pred = rf.predict(x_train)
y_test_pred = rf.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### Gradient Boosting Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
import sklearn

# scikit-learn 1.0 renamed the 'mse' criterion to 'squared_error' and later removed
# the old spelling; pick the name the installed version understands.
_sk_major_minor = tuple(int(_part) for _part in sklearn.__version__.split('.')[:2])
_mse_criterion = 'squared_error' if _sk_major_minor >= (1, 0) else 'mse'

gb_model = GradientBoostingRegressor(criterion=_mse_criterion, random_state=0, max_depth=5,
                                     n_estimators=500, min_samples_split=2, min_samples_leaf=2)
# y_train is a column vector; flatten it to the 1-D shape the estimator expects.
gb_model.fit(x_train, y_train.ravel())
y_train_pred = gb_model.predict(x_train)
y_test_pred = gb_model.predict(x_test)
print(r2_score(y_train,y_train_pred))
print(r2_score(y_test,y_test_pred))

### XGB Regressor

In [None]:
from xgboost import XGBRegressor
# Gradient-boosted trees via XGBoost; prints R2 on train, then test.
xgb = XGBRegressor(
    n_estimators=500,
    max_depth=5,
    booster='gbtree',
    n_jobs=-1,
    learning_rate=0.1,
    reg_lambda=0.01,
    reg_alpha=0.3,
)
xgb.fit(x_train, y_train)
y_train_pred, y_test_pred = (xgb.predict(_part) for _part in (x_train, x_test))
for _actual, _fitted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(_actual, _fitted))

### SVR 

In [None]:
from sklearn.svm import SVR
# Support vector regression with an RBF kernel; prints R2 on train, then test.
regressor = SVR(kernel='rbf')
regressor.fit(x_train, y_train)
y_train_pred, y_test_pred = (regressor.predict(_part) for _part in (x_train, x_test))
for _actual, _fitted in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(r2_score(_actual, _fitted))

You might be wondering why all these models have a negative or low R2 score. Here is why:

In [None]:
plt.plot(X['Min_coverage'],Y,'o')
plt.xlabel('Min_coverage')
plt.ylabel('Amount')
plt.show()

I believe the negative or low R2 score is justified: I suspect no model can fit such noisy data. The same holds for the other features — Amount does not show any clear correlation with them. If anyone gets a good R2 score with another model or with a transformed version of the data, please let me know in the comment section.

### Artificial Neural Network (ANN)

In [None]:
def plotHistory(history):
    """Print the lowest validation MSE of a training run and plot all its curves.

    Parameters
    ----------
    history : object with a ``history`` attribute
        The mapping ``history.history`` must contain a "val_mse" entry; every
        entry of the mapping is drawn as one line, indexed by position.
    """
    metrics_by_epoch = history.history
    best_val_mse = min(metrics_by_epoch["val_mse"])
    print("Min. Validation MSE", best_val_mse)
    pd.DataFrame(metrics_by_epoch).plot(figsize=(12,6))
    plt.show()
# Callbacks for the regression network: stop once val_loss has not improved for 10
# epochs, and save the whole model whenever val_loss improves (value in the file name).
_early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='auto')
_checkpoint = keras.callbacks.ModelCheckpoint('./reg_model_{val_loss:.3f}.h5',
                                              monitor='val_loss',
                                              save_best_only=True,
                                              save_weights_only=False)
callbacks_list = [_early_stop, _checkpoint]

In [None]:
epochs = 50

# Fully connected regression network: four ReLU layers of decreasing width with
# dropout (batch normalisation after the first three), ending in one linear output.
reg_model = Sequential([
    Dense(1024, activation='relu', input_shape=(x_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),

    Dense(128, activation='relu'),
    Dropout(0.2),

    Dense(1),
])
# summary() prints the table itself and returns None, so it is not wrapped in print().
reg_model.summary()

# Optimise mean absolute error; track MSE as well (plotHistory reads "val_mse").
reg_model.compile(optimizer='RMSProp',
                  loss='mae',
                  metrics=['mse'])

# `callbacks_list` is already a list, so pass it as-is rather than nesting it in another list.
history = reg_model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=epochs,
                        callbacks=callbacks_list, batch_size=512)

In [None]:
plotHistory(history)

# Task 5 - Model evaluation and submission

In [None]:
# Map the XGBoost predictions back to the original Amount scale. The scaler was
# fitted on a one-column frame, so the flat prediction vector is reshaped to a
# single column first; current scikit-learn rejects 1-D input here.
scaler_y.inverse_transform(xgb.predict(x_test).reshape(-1, 1))

In [None]:
df_test = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/test.csv')
df_test

In [None]:
# Correlation heatmap of the numeric test-set columns (lower triangle only).
# Numeric/bool columns are selected explicitly because DataFrame.corr() raises on
# string columns in recent pandas versions.
corr = df_test.select_dtypes(include=['number', 'bool']).corr()
plt.figure(figsize=(10,10))
# `np.bool` was removed from NumPy; use the builtin `bool`. The mask hides the
# redundant upper triangle (and diagonal).
mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr,mask=mask,annot=True)
plt.show()

In [None]:
## Predicting the Condition label for test data using our classification_model

# Sigmoid scores for every test image, thresholded at 0.5 into 0/1 labels.
arr = classification_model.predict(test_set)
pred = [1 if score >= 0.5 else 0 for score in arr]
            

In [None]:
np.shape(pred)

In [None]:
# Attach each prediction to its row by file name rather than by position: the
# predictions follow the generator's own file order (`test_set.filenames`), which
# need not match the row order of test.csv. The images were saved under their
# `Image_path` names, so the base name is the join key.
pred_by_file = pd.Series(pred, index=[os.path.basename(_name) for _name in test_set.filenames])
_condition = df_test['Image_path'].astype(str).map(pred_by_file)
if _condition.isnull().any():
    raise ValueError('Some test rows have no matching image prediction')
df_test['Condition'] = _condition.astype(int).to_numpy()

In [None]:
df_test

In [None]:
df_test['Condition'].value_counts()

### Preparing the test data for regression_model

In [None]:
df_test = df_test[['Image_path','Insurance_company','Min_coverage','Condition']]
df_test

In [None]:
df_dum = pd.get_dummies(df_test['Insurance_company'])
df_test = pd.concat([df_test,df_dum],axis=1)
df_test

In [None]:
# Build the regression features for the test set and align them to the training
# feature columns `X` (same names, same order). A company that does not occur in the
# test data gets an all-zero dummy column, and any extra column is dropped, so the
# scaler and model fitted on `X` always receive the layout they were trained on.
X_test = (
    df_test
    .drop(['Image_path','Insurance_company','Condition'],axis=1)
    .reindex(columns=X.columns, fill_value=0)
)
X_test

In [None]:
X_test = scaler_x.transform(X_test)

In [None]:
predicted = scaler_y.inverse_transform(reg_model.predict(X_test))

In [None]:
df_test['Amount'] = predicted

In [None]:
df_test

In [None]:
## We know that the Amount value is zero for Non damaged vehicles and thus doing so.
# Set Amount to 0 for every row predicted as not damaged. A single .loc assignment
# replaces the row-by-row chained indexing (`df_test['Amount'][i] = 0`), which
# pandas does not guarantee to write back to the frame.
df_test.loc[df_test['Condition'] == 0, 'Amount'] = 0
        

In [None]:
df_test

In [None]:
sub = df_test[['Image_path','Condition','Amount']]
sub

In [None]:
sub.to_csv('./Submission.csv',header=True,index=False)

## Last Notes

The data was highly imbalanced and the images of damaged vehicles were also not of good quality; nevertheless, transfer learning works well. Since the data was imbalanced, I first augmented the images of the minority class to equalize the number of instances between the classes. This is done because the data augmentation generator used at training time does not address the imbalance problem: it just produces data in the same ratio as in the original, imbalanced classes.
So we need to equalize the instances before using the data augmentation generator at training time.
I then used data augmentation on the whole dataset while feeding it to the model, as the classes were by then balanced with an equal number of examples. Accuracy is not a preferable metric for classification_model, since it fails to reflect the true performance of the model when the data is imbalanced, so the AUC metric has been used, which works well when the two classes are imbalanced. Accuracy can be used as well, but ONLY when the number of instances is approximately equal between the classes.
For the regression part, I first separated the data with Condition label 0 from the data with label 1, and then fitted regression_model on the data of damaged vehicles after excluding some highly correlated features. The R2 score was low, for the reason explained above. If anyone gets a good R2 score with another model or with a transformed version of the data, please let me know in the comment section.

This is the best intuition I have. If anyone has a better approach or a better analysis, please let me know.
Any queries, improvements or feedback are most welcome.

If you like my work, please show your appreciation by upvoting the notebook. Thank you...............!!!!!!!