In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import make_column_transformer
import tensorflow as tf
import albumentations
import os
import cv2
from sklearn.utils import resample

In [None]:
# Read the train and test tables of the competition dataset.
train, test = (
    pd.read_csv(f"../input/fast-furious-and-insured/Fast_Furious_Insured/{split}.csv")
    for split in ("train", "test")
)

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
train.info()

* Image_path:- Path of Image in Fast_Furious_Insured Folder.

* Insurance_company:- Represents masked values of some insurance companies.

* Cost_of_vehicle:- Represents the cost of the vehicle present in the image.

* Min_coverage:- Represents the minimum coverage provided by an insurance company.

* Expiry_date:- Represents the expiry date of the insurance.

* Max_coverage:- Represents the maximum coverage provided by an insurance company.

* Condition:- Represents whether a vehicle is damaged.

* Amount:- Represents the insurance amount of a vehicle

In [None]:
# Summary statistics of the training table.
train.describe()

In [None]:
# Pairwise relationships between the columns of the training table (seaborn pairplot).
sns.pairplot(train)

* The Condition column is highly imbalanced.

* There is a clear decision boundary on Max_coverage that separates the 2 classes.

In [None]:
# Rows whose insurance Amount is larger than the cost of the vehicle.
train[train['Amount'].gt(train['Cost_of_vehicle'])]

This Amount is clearly a mistake, because the insurance Amount can never be greater than Cost_of_vehicle.

In [None]:
# Overwrite every Amount that exceeds the vehicle cost with the column median
# (the median is taken over the column as it stands before the overwrite).
amount_exceeds_cost = train['Amount'].gt(train['Cost_of_vehicle'])
train.loc[amount_exceeds_cost, 'Amount'] = train['Amount'].median()

In [None]:
# Correlate the numeric columns only. The frame also holds text columns
# (e.g. Image_path), and DataFrame.corr() raises on those with pandas >= 2.0;
# older versions silently dropped them, so the result is unchanged there.
numeric_corr = train.select_dtypes(include="number").corr()

plt.figure(figsize=(8, 8))
ax = sns.heatmap(numeric_corr, annot=True)
ax.set_title("Correlation between numeric training columns");

In [None]:
# Distribution of Condition among training rows with Max_coverage above 13460.
train.loc[train['Max_coverage'].gt(13460), 'Condition'].value_counts()

In [None]:
# Rule-based Condition for the test set: 0 when Max_coverage is above 13460,
# otherwise 1 (a missing Max_coverage compares as "not above", so it maps to 1).
test['Condition'] = (~test['Max_coverage'].gt(13460)).astype('int64')

In [None]:
# Class counts of the rule-based Condition assigned to the test set.
test['Condition'].value_counts()

In [None]:
# --- Tabular baseline: predict Amount with a random forest -------------------
features_num = ['Cost_of_vehicle', 'Min_coverage', 'Max_coverage']
features_cat = ['Insurance_company']
feature_cols = features_num + features_cat

# Back-fill missing values, then zero-fill anything still missing at the tail.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
# NOTE(review): this is applied to the whole frame, so any missing Amount
# targets are back-filled too -- consider dropping those rows instead.
train = train.bfill(axis=0).fillna(0)

# Encode the insurer as integer codes; the encoder is fit on train and reused
# for test so both share the same mapping.
le = LabelEncoder()
train['Insurance_company'] = le.fit_transform(train['Insurance_company'])
test['Insurance_company'] = le.transform(test['Insurance_company'])

# Scale the numeric features. remainder='passthrough' is required here: the
# default ('drop') discards every column not listed in a transformer, which
# silently removed the encoded Insurance_company from the model inputs.
preprocessor = make_column_transformer(
    (StandardScaler(), features_num),
    remainder='passthrough',
)

y = train['Amount']
train_imputed = train.loc[:, feature_cols]
X = preprocessor.fit_transform(train_imputed)

# Apply the same imputation to the test features (a no-op when nothing is missing).
test_imputed = test.loc[:, feature_cols].bfill(axis=0).fillna(0)
test_X = preprocessor.transform(test_imputed)

# Hold out 20% of the training rows for validation.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1, test_size=0.2)

rf_model = RandomForestRegressor(random_state=1, n_estimators=1000, max_depth=3)
rf_model.fit(train_X, train_y)

val_preds = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, val_preds)

print("Validation MAE {}".format(rf_val_mae))

In [None]:
# Predict Amount for the preprocessed test features with the fitted random forest.
test_Y=rf_model.predict(test_X)

In [None]:
# Read the submission file and set both target columns: the rule-based
# Condition and the Amount predicted by the random forest.
sample_submission = pd.read_csv("../input/thesese/submission.csv").assign(
    Condition=test['Condition'],
    Amount=test_Y,
)

In [None]:
# Write the submission without the index column.
# Note: the last cell of the notebook writes to this same "submission.csv" path again.
sample_submission.to_csv("submission.csv",index=False)

# DEEP LEARNING  MODEL

In [None]:
# Re-read both tables (earlier cells modified `train` and `test` in place)
# and define the image folders and input settings for the image model.
data_root = "../input/fast-furious-and-insured/Fast_Furious_Insured"

train = pd.read_csv(f"{data_root}/train.csv")
test = pd.read_csv(f"{data_root}/test.csv")

train_folder = f"{data_root}/trainImages/"
test_folder = f"{data_root}/testImages/"

img_size = 224
batch_size = 32


In [None]:
def _read_image(path):
    """Read an image with OpenCV, failing loudly if it cannot be loaded.

    cv2.imread returns None (instead of raising) for a missing or unreadable
    file, which would otherwise surface later as an obscure error.
    """
    image = cv2.imread(path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    return image


def _to_model_input(image):
    """Resize to (img_size, img_size) and return float32 pixels divided by 255."""
    resized = cv2.resize(image, (img_size, img_size))
    return resized.astype("float32") / 255.0


# Balance the classes: resample class 0 (with replacement) to the size of class 1.
# NOTE(review): resampling happens before train_test_split below, so copies of
# the same class-0 image can land in both the training and validation sets.
class_0 = train[train['Condition'] == 0]
class_1 = train[train['Condition'] == 1]
class_0 = resample(class_0, n_samples=len(class_1), random_state=42)
df = pd.concat((class_0, class_1))
df = df.sample(frac=1, random_state=42)  # seeded shuffle for reproducible row order

# Augmentations applied to class-0 images only (see the loop below).
augment = albumentations.Compose([
    albumentations.HorizontalFlip(),
    albumentations.VerticalFlip(),
    albumentations.ShiftScaleRotate(),
    albumentations.HueSaturationValue(hue_shift_limit=10, sat_shift_limit=10, val_shift_limit=10),
    albumentations.RandomBrightnessContrast(),
    albumentations.OneOf([
        albumentations.GaussNoise(),
        albumentations.GaussianBlur(),
        albumentations.MotionBlur()
    ], p=0.3)
])

# float32 matches the dtype of the scaled images; the float64 default of
# np.zeros would double the memory of these arrays for no gain in precision.
X = np.zeros((len(df), img_size, img_size, 3), dtype="float32")
y = np.zeros((len(df), 1))
X_test = np.zeros((len(test), img_size, img_size, 3), dtype="float32")

for row, (rel_path, condition) in enumerate(zip(df['Image_path'], df['Condition'])):
    image = _read_image(os.path.join(train_folder, rel_path))
    if condition == 0:
        # Augment before resizing so repeated class-0 samples are not identical.
        image = augment(image=image)['image']
    X[row] = _to_model_input(image)
    y[row] = condition

for row, rel_path in enumerate(test['Image_path']):
    X_test[row] = _to_model_input(_read_image(os.path.join(test_folder, rel_path)))

# Hold out 20% of the balanced set for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
   

In [None]:
# Class balance of the training split: count of label 1 first, then label 0.
for label in (1, 0):
    print(len(y_train[y_train == label]))

In [None]:
# Learning-rate schedule settings: linear warm-up, optional plateau, exponential decay.
LR_START = 1e-5          # rate at epoch 0
LR_MAX = 5e-5            # rate reached at the end of the warm-up
LR_MIN = 1e-5            # floor the decay converges to
LR_RAMPUP_EPOCHS = 5     # length of the linear warm-up
LR_SUSTAIN_EPOCHS = 0    # epochs to hold LR_MAX before decaying
LR_DECAY = 0.8           # per-epoch multiplicative decay factor


def lr_schedule(epoch):
    """Return the learning rate for a given (0-based) epoch.

    - epoch < LR_RAMPUP_EPOCHS: linear ramp from LR_START towards LR_MAX.
    - next LR_SUSTAIN_EPOCHS epochs: constant LR_MAX.
    - afterwards: exponential decay from LR_MAX towards LR_MIN by LR_DECAY per epoch.
    """
    if epoch < LR_RAMPUP_EPOCHS:
        return (LR_MAX - LR_START) / LR_RAMPUP_EPOCHS * epoch + LR_START
    if epoch < LR_RAMPUP_EPOCHS + LR_SUSTAIN_EPOCHS:
        return LR_MAX
    return (LR_MAX - LR_MIN) * LR_DECAY**(epoch - LR_RAMPUP_EPOCHS - LR_SUSTAIN_EPOCHS) + LR_MIN


# Set the learning rate from lr_schedule each epoch; verbose output reports the chosen rate.
lr_callback = tf.keras.callbacks.LearningRateScheduler(lr_schedule, verbose=1)

# Stop once the monitored quantity (val_loss by default) has not improved for
# 3 epochs. restore_best_weights=True rolls the model back to its best epoch
# when training is stopped early; without it the model used for prediction
# keeps the weights from 3 epochs past the best one.
es = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

In [None]:
# DenseNet169 backbone without its classification head (weights argument left
# at the Keras default), with all layers trainable.
Model = tf.keras.applications.DenseNet169(
    include_top=False,
    input_shape=(img_size, img_size, 3),
)
Model.trainable = True

# Binary classifier head: pooled features -> 256-unit ReLU -> dropout -> sigmoid.
model = tf.keras.models.Sequential()
model.add(Model)
model.add(tf.keras.layers.GlobalAveragePooling2D())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)

In [None]:
# Train for up to 50 epochs; early stopping may end the run sooner.
# batch_size is passed explicitly so the notebook-level `batch_size` setting
# actually controls training (it was previously defined but never used; its
# value of 32 equals the Keras default, so results are unchanged).
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=50,
    validation_data=(X_val, y_val),
    callbacks=[lr_callback, es],
)

In [None]:
# Per-epoch metrics recorded by the last model.fit call.
history = pd.DataFrame(model.history.history)

# Side-by-side loss and accuracy curves. DataFrame.plot adds a legend from the
# column names, so the training and validation curves can be told apart.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))

history[['loss', 'val_loss']].plot(ax=loss_ax)
loss_ax.set(title='Loss', xlabel='Epoch', ylabel='Binary cross-entropy')

history[['binary_accuracy', 'val_binary_accuracy']].plot(ax=acc_ax)
acc_ax.set(title='Accuracy', xlabel='Epoch', ylabel='Binary accuracy')

fig.tight_layout()

In [None]:
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 Condition labels,
# store them in the submission frame and write it to disk.
predictions = np.where(model.predict(X_test) > 0.5, 1, 0)
sample_submission['Condition'] = predictions
sample_submission.to_csv("submission.csv", index=False)