In [None]:
# Fast, Furious and Insured - HackerEarth ML Competition
## This competition is quite special: to crack the top 1% of the leaderboard you need a small trick
## that does not involve any fancy model or very strong feature generation. It is more of a "crack" in the dataset. 
## Everything is documented using comments and markdown cells. 
### If this helps you in learning, an upvote would be huge!

In [None]:
import pandas as pd
import numpy as np
import cv2
import os
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import datetime


# Global plotting look-and-feel: seaborn grid, then the fivethirtyeight
# style sheet, then explicit font and figure sizes layered on top.
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'font.size': 10,
    'axes.labelsize': 12,
    'axes.titlesize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 14,
    'figure.figsize': (16, 10),
})

In [None]:
# Tabular training data for the competition.
train_csv_path = "../input/fast-furious-and-insured/Fast_Furious_Insured/train.csv"
train = pd.read_csv(train_csv_path)

### Few train pictures

In [None]:
# Preview up to 20 training images labelled Condition == 0 on a 4x5 grid.
path = '../input/fast-furious-and-insured/Fast_Furious_Insured/trainImages/'

# .head(20) caps the preview at the 20 cells the grid can hold, so no manual
# counter or break is needed inside the loop.
undamaged_paths = train.loc[train['Condition'] == 0, 'Image_path'].head(20)

for i, im in enumerate(undamaged_paths):
    plt.subplot(4, 5, i + 1)
    img = plt.imread(path + str(im))
    # Every image here was selected with Condition == 0, so state that
    # directly instead of embedding the repr of a whole pandas Series.
    plt.title(str(im) + ' | Condition: 0')
    plt.imshow(img)
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
# Tabular test data for the competition.
# The relative `path = 'testImages/'` assignment that used to follow was
# dropped: nothing read it before `path` is rebound to the full test-image
# directory in the cells that display test images.
test = pd.read_csv('../input/fast-furious-and-insured/Fast_Furious_Insured/test.csv')

## After picking apart the train data, I found that the pictures are not very much related to the condition. Moreover, copies of the same picture present in the train data have different Conditions (0=undamaged, 1=damaged).
## Candidly, the dataset doesn't make sense at all, with same pictures having different target values. Most of the pictures labeled as "damaged" are mere car wallpapers which are in no way "damaged". 

## Still, I took a shot at manually labelling them by examining the train and test datasets: given how little sense this dataset makes, expecting any machine learning model to outperform human judgement is not realistic. This took me around 2 hours.

## However, this did not even get me anywhere close to the top of the leaderboard. The magic is after a few cells with a proper reasoning gained from an insight.

In [None]:
# Hand-assigned Condition labels for the test set (0 = undamaged, 1 = damaged),
# produced by visually inspecting the images. The list is assigned positionally
# below, so its length must equal the number of rows in `test` and its order
# must follow the row order of test.csv.
# NOTE(review): a later cell reassigns test['Condition'] from Max_coverage,
# which appears to supersede these manual labels -- confirm before relying on them.
Test_Condition = [1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
                  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
                  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Attach the manual labels to the test frame as a new column.
test['Condition'] = Test_Condition

In [None]:
# Preview up to 35 test images currently labelled Condition == 0 on a 7x5 grid.
path = '../input/fast-furious-and-insured/Fast_Furious_Insured/testImages/'

# .head(35) caps the preview at the 35 cells the grid can hold, so no manual
# counter or break is needed inside the loop.
undamaged_test = test.loc[test['Condition'] == 0, 'Image_path'].head(35)

for i, (row_label, im) in enumerate(undamaged_test.items()):
    plt.subplot(7, 5, i + 1)
    img = plt.imread(path + str(im))
    # Title each picture with its row label in `test` so the row can be looked
    # up afterwards; the previous title was the repr of a whole Index object.
    plt.title(str(row_label))
    plt.imshow(img)
    plt.xticks([])
    plt.yticks([])
plt.show()

In [None]:
# Class balance of Condition in the train and test frames.
train_condition_counts = train.Condition.value_counts()
test_condition_counts = test.Condition.value_counts()
print(train_condition_counts, '\n', test_condition_counts)

## Dataset is highly skewed.
## Has less than 10 percent of not damaged vehicles.

In [None]:
# Distribution of Amount among rows labelled Condition == 0.
train.loc[train['Condition'] == 0, 'Amount'].value_counts()

### As expected, Not damaged cars have Amount = 0

In [None]:
# Absolute pairwise correlations between the numeric training columns.
# Non-numeric columns (image paths, company names, date strings) are filtered
# out explicitly: recent pandas versions no longer drop them silently inside
# DataFrame.corr() and raise instead.
numeric_train = train.select_dtypes(include=[np.number])
corr = numeric_train.corr().abs()
plt.figure(figsize = (7,5))
sns.heatmap(corr, annot=True)

## There is high correlation between (Min_coverage-Cost_of_Vehicle), (Max_Coverage-Cost_of_vehicle)

In [None]:
# Parse Expiry_date into datetime64 values.
# pd.to_datetime is used instead of .astype('datetime64') because casting to
# the unit-less 'datetime64' dtype is rejected by pandas 2.x.
train['Expiry_date'] = pd.to_datetime(train['Expiry_date'])
test['Expiry_date'] = pd.to_datetime(test['Expiry_date'])

In [None]:
# Time left until policy expiry, measured from a fixed reference date and
# expressed in 30-day months and 365-day years.
reference = datetime.datetime(2021, 5, 4, 0, 0, 0)  #Taking 4th May 2021 as reference
for frame in (train, test):
    days_to_expiry = (frame['Expiry_date'] - reference).dt.days
    frame['months_remaining'] = days_to_expiry / 30
    frame['years_remaining'] = days_to_expiry / 365

In [None]:
# Calendar month (1-12) of the expiry date, for both frames.
for frame in (train, test):
    expiry_index = pd.DatetimeIndex(frame['Expiry_date'])
    frame['month'] = expiry_index.month

In [None]:
# The raw date column is no longer needed once the derived features exist.
for frame in (train, test):
    frame.drop(columns='Expiry_date', inplace=True)

In [None]:
# Max_coverage vs. Cost_of_vehicle, coloured by insurer: train on the left,
# test on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 4))

panels = zip(axes, (train, test), ('Train data', 'Test data'))
for ax, frame, panel_title in panels:
    sns.scatterplot(x='Max_coverage', y='Cost_of_vehicle', data=frame,
                    hue='Insurance_company', ax=ax)
    ax.set_xlabel('Max coverage')
    ax.set_ylabel('Cost of vehicle')
    ax.set_title(panel_title)
plt.show()

## A notable insight here is that Max coverage seems to have 2 categories (for all insurance companies, as seen by the hue on the plot), and based on those categories, Max coverage is decided in its entirety. 
## This is used to generate a new feature which will describe the category of the insurance within the insurance company.

In [None]:
# Binary flag for the higher of the two Max_coverage bands (1 when above 19999).
# NOTE(review): other cells in this notebook split on > 20000 rather than
# > 19999; the two only differ for values in (19999, 20000] -- confirm which
# cut-off is intended.
for frame in (train, test):
    frame['insurance_category'] = frame['Max_coverage'].gt(19999).astype(int)

In [None]:
# Condition distribution among training rows with Max_coverage above 20000.
train.loc[train['Max_coverage'] > 20000, 'Condition'].value_counts()

## *The above cell shows that, of the two clearly distinct Max_coverage "packages", **all** the expensive ones have Condition = 0. These make up all the non-damaged cars in the dataset.* 
## This insight was used to have a *perfect* r2 score(for Condition) for test data. Hence the test data conditions are as shown below. This was the pivotal aspect of this competition for my approach. 

In [None]:
# Rule-based labels: Max_coverage above 20,000 -> Condition 0, otherwise 1.
# Rows matching neither comparison (e.g. a missing Max_coverage) keep their
# current Condition value.
above_threshold = test['Max_coverage'].gt(20000)
at_or_below_threshold = test['Max_coverage'].le(20000)
test.loc[above_threshold, 'Condition'] = 0
test.loc[at_or_below_threshold, 'Condition'] = 1

## Below we see that there is no such case in Min coverage.

In [None]:
# Min_coverage vs. Cost_of_vehicle, coloured by insurer: train on the left,
# test on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 4))

for ax, frame, panel_title in zip(axes, (train, test), ('Train data', 'Test data')):
    sns.scatterplot(x='Min_coverage', y='Cost_of_vehicle', data=frame,
                    hue='Insurance_company', ax=ax)
    # The x axis shows Min_coverage; it was previously labelled "Max coverage".
    ax.set_xlabel('Min coverage')
    ax.set_ylabel('Cost of vehicle')
    ax.set_title(panel_title)
plt.show()

## We see that there is an outlier, with cost of vehicle as ~80,000. Let's take a look at this vehicle.

In [None]:
# Display the image of the first test vehicle whose cost exceeds 70,000.
path = '../input/fast-furious-and-insured/Fast_Furious_Insured/testImages/'
expensive_rows = test.loc[test['Cost_of_vehicle'] > 70000, 'Image_path']
im = expensive_rows.iloc[0]

plt.figure(figsize=(10,5))
img = plt.imread(path+str(im))
# Title with the row label of the displayed vehicle rather than the repr of
# an Index object.
plt.title('Test row ' + str(expensive_rows.index[0]))
plt.imshow(img)
# plt.show must be called; the bare attribute reference did nothing except
# echo the function object as the cell output.
plt.show()

## We can see that the car is more of an exotic supercar. Hence high cost of vehicle and max coverage makes sense. 

<hr>

In [None]:
# Rows whose Amount exceeds 50,000, shown next to the vehicle cost.
suspicious_amount = train['Amount'] > 50000
train.loc[suspicious_amount, ['Amount', 'Cost_of_vehicle']]

## It is highly improbable that the amount is greater than the vehicle cost itself. This entry is therefore treated as an error, and its Amount is replaced by the mean Amount. 

In [None]:
# Replace implausibly large Amount values (> 50,000) with the mean Amount.
# The mean is taken over the remaining rows only, so the erroneous values
# being replaced do not inflate their own replacement (missing Amounts are
# skipped by .mean()).
amount_is_outlier = train['Amount'] > 50000
train.loc[amount_is_outlier, 'Amount'] = train.loc[~amount_is_outlier, 'Amount'].mean()

## Used DTale python library for EDA throughout.

In [None]:
# import dtale
# dtale.show(train)

In [None]:
#dtale.show(test)

In [None]:
# Encode each insurance company by the mean Condition observed for it in the
# training data, then discard the raw company column from both frames.
means = train.groupby('Insurance_company')['Condition'].mean()

for frame in (train, test):
    frame['Insurance_mean'] = frame['Insurance_company'].map(means)
    frame.drop(columns='Insurance_company', inplace=True)

# Filling Nans

In [None]:
# Number of missing values per training column.
train.isna().sum()

In [None]:
# Training rows whose Amount is missing.
amount_missing = train['Amount'].isna()
train.loc[amount_missing]

In [None]:
# Fill missing Amount values with the column mean.
# The result is assigned back explicitly: calling fillna(inplace=True) on
# train['Amount'] operates on an intermediate Series and is not guaranteed to
# update `train` (pandas warns about this and ignores it under Copy-on-Write).
train['Amount'] = train['Amount'].fillna(train['Amount'].mean())

In [None]:
# Remove training rows that have more than two missing values.
missing_per_row = train.isna().sum(axis=1)
index_drop = train.index[missing_per_row > 2]
train.drop(index=index_drop, inplace=True)

In [None]:
# Total count of remaining missing cells in train and in test.
train_missing_total = train.isna().sum().sum()
test_missing_total = test.isna().sum().sum()
print(train_missing_total, test_missing_total)

# Model

# CatBoost Regressor 

### Output of the top 3 Catboost models are averaged to get the final result, which gives an r2 value of 57.90049

In [None]:
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import sklearn.metrics as metrics 
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import r2_score, f1_score

In [None]:
# Regression data: only rows with Condition == 1, predicting Amount.
# Candidate features currently left out: 'years_remaining', 'Min_coverage',
# 'insurance_category'.
model_features = ['Insurance_mean', 'Cost_of_vehicle', 'Max_coverage', 'months_remaining']
damaged_rows = train['Condition'] == 1

y = train.loc[damaged_rows, 'Amount']
X = train.loc[damaged_rows, model_features]

# Bucket the target into bins so the hold-out split can be stratified on it.
# NOTE(review): the upper edge 1213 is not explained anywhere in the notebook --
# confirm how it relates to the range of Amount.
bins = np.linspace(0, 1213, 10)
y_binned = np.digitize(y, bins)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y_binned
)

## Unfortunately I did not save the parameters of the best model. Luckily I could download the submission file(q.txt) from competition's submission history. 

In [None]:
# Rebuild the regression data (rows with Condition == 1, target Amount) with a
# plain, non-stratified hold-out split.
# Candidate features currently left out: 'years_remaining', 'Min_coverage',
# 'insurance_category'.
model_features = ['Insurance_mean', 'Cost_of_vehicle', 'Max_coverage', 'months_remaining']
damaged_rows = train['Condition'] == 1

y = train.loc[damaged_rows, 'Amount']
X = train.loc[damaged_rows, model_features]

bins = np.linspace(0, 1213, 10)
y_binned = np.digitize(y, bins)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

xtest = test[model_features]

# First ensemble member: Amount predictions read back from a saved submission
# file. The assignment aligns on the row index.
# NOTE(review): presumably q.txt lists rows in the same order as test.csv -- verify.
q = pd.read_csv('../input/best-fast-and-furious/q.txt')
pred = q['Amount']
test['Amount0'] = pred

## The next 2 models are similar to the first one, with different parameters. 

In [None]:
# Hyper-parameter sets for the two CatBoost models trained below.
# Each value is wrapped in a one-element list; consumers read element [0].
parameters = [
    # Set 0 -- scored 57.503
    dict(
        colsample_bylevel=[0.6466713862921843],
        learning_rate=[0.1736278231552765],
        max_depth=[7.0],
        reg_lambda=[0.48672197601214473],
        n_estimators=[400],
    ),
    # Set 1 -- scored 57.458
    dict(
        colsample_bylevel=[0.5880321740388832],
        learning_rate=[0.17389665784108996],
        max_depth=[5.0],
        reg_lambda=[0.35318535381932364],
        n_estimators=[300],
    ),
]

In [None]:
# Second ensemble member: CatBoost trained with hyper-parameter set 0.
# Candidate features currently left out: 'years_remaining', 'Min_coverage',
# 'insurance_category'.
feature_columns = ['Insurance_mean', 'Cost_of_vehicle', 'Max_coverage', 'months_remaining']

# Only rows with Condition == 1 are used to learn Amount. Features and target
# are selected directly instead of dropping a column in place from a slice.
damaged_rows = train['Condition'] == 1
X = train.loc[damaged_rows, feature_columns]
y = train.loc[damaged_rows, 'Amount']

# The binned target that used to be recomputed here was never used by this
# (non-stratified) split, so it has been removed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

xtest = test[feature_columns]

i = 0
params = parameters[i]
clf = CatBoostRegressor(loss_function='RMSE',
                        eval_metric="R2",
                        random_seed=14,
                        colsample_bylevel = params['colsample_bylevel'][0],
                        learning_rate = params['learning_rate'][0],
                        # Stored as a float (e.g. 7.0); a tree depth must be a
                        # whole number, so pass it as an int.
                        max_depth = int(params['max_depth'][0]),
                        reg_lambda = params['reg_lambda'][0],
                        n_estimators = params['n_estimators'][0])

# Track the metric on both the training and the hold-out split while fitting.
evaluation = [( X_train, y_train), ( X_test, y_test)]

clf.fit(X_train, y_train,
        #early_stopping_rounds=10,
        eval_set=evaluation,
        verbose=False)

pred = clf.predict(xtest)
test['Amount1'] = pred
# Rows labelled Condition == 0 get an Amount of 0 regardless of the model.
test.loc[test['Condition']==0, 'Amount1'] = 0

In [None]:
# Third ensemble member: CatBoost trained with hyper-parameter set 1.
# Candidate features currently left out: 'years_remaining', 'Min_coverage',
# 'insurance_category'.
feature_columns = ['Insurance_mean', 'Cost_of_vehicle', 'Max_coverage', 'months_remaining']

# Only rows with Condition == 1 are used to learn Amount. Features and target
# are selected directly instead of dropping a column in place from a slice.
damaged_rows = train['Condition'] == 1
X = train.loc[damaged_rows, feature_columns]
y = train.loc[damaged_rows, 'Amount']

# The binned target that used to be recomputed here was never used by this
# (non-stratified) split, so it has been removed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=0)

xtest = test[feature_columns]


i = 1
params = parameters[i]
clf = CatBoostRegressor(loss_function='RMSE',
                        eval_metric="R2",
                        random_seed=14,
                        colsample_bylevel = params['colsample_bylevel'][0],
                        learning_rate = params['learning_rate'][0],
                        # Stored as a float (e.g. 5.0); a tree depth must be a
                        # whole number, so pass it as an int.
                        max_depth = int(params['max_depth'][0]),
                        reg_lambda = params['reg_lambda'][0],
                        n_estimators = params['n_estimators'][0])

# Track the metric on both the training and the hold-out split while fitting.
evaluation = [( X_train, y_train), ( X_test, y_test)]

clf.fit(X_train, y_train,
        #early_stopping_rounds=10,
        eval_set=evaluation,
        verbose=False)

pred = clf.predict(xtest)
test['Amount2'] = pred
# Rows labelled Condition == 0 get an Amount of 0 regardless of the model.
test.loc[test['Condition']==0, 'Amount2'] = 0

In [None]:
# Final Amount: unweighted average of the three per-model predictions.
summed_predictions = test['Amount0'].add(test['Amount1']).add(test['Amount2'])
test['Amount'] = summed_predictions.div(3)

In [None]:
# Write the submission file: Image_path as the index, then Condition and Amount.
submission_columns = ['Image_path', 'Condition', 'Amount']
sub = test.loc[:, submission_columns].set_index('Image_path')
sub.to_csv('Pred.csv')

In [None]:
import seaborn as sns
# Bar chart of the feature importances of the most recently fitted CatBoost
# model (`clf`), largest first; at most 50 features are kept.
importance_pairs = sorted(zip(clf.feature_importances_, X_train.columns), reverse=True)
feature_imp = pd.DataFrame(importance_pairs[:50], columns=['Value','Feature'])
ranked_importance = feature_imp.sort_values(by="Value", ascending=False)

plt.figure(figsize=(10,3))
sns.barplot(x="Value", y="Feature", data=ranked_importance)
plt.title('Catboost Features')
plt.tight_layout()
plt.show()