In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the competition input directory and print the full path of every file in it.
for root, _, names in os.walk('../input/apporchid'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

In [None]:
# Load both splits indexed by RefId and tag every row with the split it came
# from, so the rows can be told apart once the two frames are combined.
train = (
    pd.read_csv('../input/apporchid/training.csv')
    .set_index('RefId')
    .assign(kind='train')
)
test = (
    pd.read_csv('../input/apporchid/test.csv')
    .set_index('RefId')
    .assign(kind='test')
)


In [None]:
# Stack the train rows on top of the test rows so the preprocessing below is applied to both at once.
dataset = pd.concat([train, test])

In [None]:
# Summary of the combined frame: columns, non-null counts and dtypes.
dataset.info()

# PurchDate dtype is object

In [None]:
# Class balance of the target; rows whose IsBadBuy is NaN are not counted
# (value_counts drops NaN by default).
dataset['IsBadBuy'].value_counts()

### This is an imbalanced dataset; we can try techniques such as SMOTE or a Random Forest (RF) to work with such data.

In [None]:
# Number of missing values per column.
dataset.isna().sum()

In [None]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# keeps this working on pandas >= 2.0, where DataFrame.corr() no longer drops
# non-numeric columns silently and raises on string data instead.
corr = dataset.select_dtypes(include='number').corr()

# Plot the matrix computed above rather than calling corr() a second time.
plt.matshow(corr)
plt.show()

In [None]:
# Colour-graded table of the correlation matrix, shown with two decimals.
# Styler.set_precision was removed in pandas 2.0; a format string passed to
# Styler.format gives the same rendering on old and new pandas versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# Seaborn heatmap of the numeric-column correlation matrix.
f, ax = plt.subplots(figsize=(10, 8))

# Restrict to numeric columns: on pandas >= 2.0 corr() raises on string columns.
corr = dataset.select_dtypes(include='number').corr()

# The mask is all False, so no cell is hidden. The built-in `bool` replaces
# `np.bool`, an alias that was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

### We can see that the MMRA columns are highly correlated with each other; we will deal with them later.
### VehYear is correlated with VehicleAge.
### VehicleAge is correlated with the MMRA columns.
### Also, there is not much correlation between the dependent variable and the independent variables.

In [None]:
# Group the column names by dtype to find the object columns that need label encoding.
column_names = dataset.columns.to_series()
column_names.groupby(dataset.dtypes).groups

In [None]:
# Columns inspected one by one with box plots.
boxplot_columns = [
    'VehYear',
    'VehicleAge',
    'VehOdo',
    'BYRNO',
    'VNZIP1',
    'IsOnlineSale',
    'WarrantyCost',
    'MMRAcquisitionAuctionAveragePrice',
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
    'VehBCost',
]
df_boxplot = dataset[boxplot_columns]

In [None]:
# Smaller column subset used for the pair plot; only one of the MMR price columns is kept.
pairplot_columns = [
    'VehYear',
    'VehicleAge',
    'VehOdo',
    'BYRNO',
    'VNZIP1',
    'IsOnlineSale',
    'WarrantyCost',
    'MMRAcquisitionAuctionAveragePrice',
    'VehBCost',
]
df_pairplot = dataset[pairplot_columns]

In [None]:
# Univariate analysis of the numeric data: one box plot per column, each on its own figure.

for name in df_boxplot.columns:
    plt.figure()
    df_boxplot.boxplot(column=[name])

### Clearly there are outliers in the dataset, they have to be treated before data modelling.

### Checking for the distribution of the dataset

In [None]:
# Bivariate analysis: scatter-plot matrix over the selected columns.
sns.pairplot(data=df_pairplot)

In [None]:
# Drop PRIMEUNIT and AUCGUART because of their missing values
# (presumably the sparsest columns -- check the null counts printed earlier).
sparse_columns = ['PRIMEUNIT', 'AUCGUART']
dataset = dataset.drop(columns=sparse_columns)

In [None]:
# Convert the PurchDate column to datetime so its parts can be extracted.
dataset['PurchDate'] = pd.to_datetime(dataset['PurchDate'])

In [None]:
# Break the purchase date into numeric year / month / day feature columns.
for part in ('year', 'month', 'day'):
    dataset[part] = getattr(dataset['PurchDate'].dt, part)

In [None]:
# The raw date column is no longer needed once year / month / day are extracted.
dataset = dataset.drop(columns='PurchDate')

In [None]:
# Categorical (object dtype) columns to convert into integer codes.
obj_dtype = ['Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'VNST']

# Fit a separate encoder per column and keep it. Refitting one shared encoder
# discards every mapping but the last, so the codes could not be decoded later
# (label_encoders[col].inverse_transform).
# NOTE(review): missing values in these columns are passed to the encoder
# as-is -- confirm how the installed scikit-learn version treats them.
label_encoders = {}
for col in obj_dtype:
    le = preprocessing.LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])
    label_encoders[col] = le

In [None]:
# Preview the frame after label encoding.
dataset.head()

## Handling missing values

In [None]:
# Missing values per column before imputation.
dataset.isna().sum()

### Replacing NaN with the mean of each column

In [None]:
# Fill remaining NaNs with the mean of each numeric column. numeric_only=True
# is required on pandas >= 2.0, where DataFrame.mean() raises on string
# columns such as 'kind' instead of skipping them.
# NOTE(review): the means are taken over train and test rows together, and
# any NaN in IsBadBuy (presumably every test row) is filled too -- harmless
# only because that column is dropped from the test frame later.
dataset = dataset.fillna(dataset.mean(numeric_only=True))

In [None]:
# Missing values per column after imputation.
dataset.isna().sum()

In [None]:
# Preview the frame after imputation.
dataset.head()

In [None]:
# Row counts per split tag ('train' / 'test').
dataset['kind'].value_counts()

In [None]:
# Split the preprocessed frame back into its two parts using the 'kind' tag.
split_tag = dataset['kind']
train_df = dataset.loc[split_tag == 'train']
test_df = dataset.loc[split_tag == 'test']


In [None]:

# Remove the 'kind' tag from both frames; the test frame also loses the IsBadBuy target column.
train_df = train_df.drop(columns='kind')
test_df = test_df.drop(columns=['kind', 'IsBadBuy'])

In [None]:
# Preview the test features.
test_df.head()

In [None]:
# Preview the training rows (target plus features).
train_df.head()

In [None]:
# Separate features and target. The target is removed by name rather than by
# position (iloc[:, 1:]), so it cannot leak into X if IsBadBuy ever stops
# being the first column.
X = train_df.drop(columns=['IsBadBuy'])
y = train_df['IsBadBuy']

In [None]:
# Check the dtypes and non-null counts of the feature matrix.
X.info()

In [None]:
# Creating the baseline model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# A fixed seed makes the score reproducible across runs, and stratifying on y
# keeps the minority-class share equal in both parts of the split.
# NOTE(review): test_size=0.70 trains on only 30% of the rows -- confirm this
# is intended rather than a swapped 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.70, random_state=0, stratify=y
)

clf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Predictions on the hold-out part, scored in the next cell.
y_pred = clf.predict(X_test)

In [None]:
# Calculating the F1 score for the baseline model
from sklearn.metrics import f1_score

# Weighted F1 (average='weighted') of the hold-out predictions.
baseline_f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score OF THE MODEL: ", baseline_f1)

### F1 Score of BASELINE MODEL:  0.851619507920779

In [None]:
# Remove the correlated columns identified in the correlation analysis:
#   1. the MMRA price columns (MMRAcquisitionAuctionAveragePrice is kept),
#   2. VehicleAge, which is correlated with VehYear and with the MMRA columns.
correlated_columns = [
    'MMRAcquisitionAuctionCleanPrice',
    'VehicleAge',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
]
dataset = dataset.drop(columns=correlated_columns)

# Rebuild the train / test frames from the reduced column set.
split_tag = dataset['kind']
train_df = dataset.loc[split_tag == 'train']
test_df = dataset.loc[split_tag == 'test']

In [None]:
# Remove the 'kind' tag from both frames; the test frame also loses the IsBadBuy target column.
train_df = train_df.drop(columns='kind')
test_df = test_df.drop(columns=['kind', 'IsBadBuy'])

In [None]:
# Re-train the same random forest on the reduced feature set.
# The target is removed by name so it cannot leak into X if the column order changes.
X = train_df.drop(columns=['IsBadBuy'])
y = train_df['IsBadBuy']

# A fixed seed and stratified split: without them the score differs from run
# to run, so a change between experiments cannot be attributed to the dropped
# columns.
# NOTE(review): test_size=0.70 trains on only 30% of the rows -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.70, random_state=0, stratify=y
)

clf = RandomForestClassifier(max_depth=5, n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("F1 Score OF THE MODEL: ", f1_score(y_test, y_pred, average='weighted'))

### F1 Score OF THE MODEL:  0.8663583122603433

### The F1 score of the model has increased from 0.852 to 0.866

### We can try following other techniques for increasing the score
1. Hyperparameter tuning of the Random Forest
2. Treatment of Imbalanced dataset
3. Check if the features follow parametric distribution and change them if needed. 
4. Try boosting techniques such as XGBoost, CatBoost.
5. Try stacking the models at the end to check whether the scores improve.

In [None]:
## Treating the imbalanced dataset with class weights

# The target is removed by name so it cannot leak into X if the column order changes.
X = train_df.drop(columns=['IsBadBuy'])
y = train_df['IsBadBuy']

# A fixed seed and stratified split keep the score reproducible and comparable
# between runs of this cell.
# NOTE(review): test_size=0.70 trains on only 30% of the rows -- confirm intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.70, random_state=0, stratify=y
)

# class_weight='balanced' re-weights the classes inversely to their frequency.
# NOTE(review): max_depth is 10 here but 5 in the earlier runs, so a score
# difference cannot be attributed to the class weighting alone.
clf = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=0, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("F1 Score OF THE MODEL: ", f1_score(y_test, y_pred, average='weighted'))

### The F1 score has decreased when I tried to treat the imbalanced dataset, this may