In [None]:
!pip install plotly --quiet

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# List the files shipped with the competition dataset.
os.listdir("/kaggle/input/DontGetKicked")

In [None]:
# Single source of truth for the dataset location, so switching environments
# only requires editing one line.
DATA_DIR = "/kaggle/input/DontGetKicked"

# Labelled training data, unlabelled test data and the sample submission file.
train = pd.read_csv(os.path.join(DATA_DIR, "training.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
example_entry = pd.read_csv(os.path.join(DATA_DIR, "example_entry.csv"))

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# (rows, columns) of the training data.
train.shape

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# (rows, columns) of the test data.
test.shape

# Data Description

>**RefId**			->	    Unique (sequential) number assigned to vehicles

>**IsBadBuy**		->		Identifies if the kicked vehicle was an avoidable purchase 

>**PurchDate**		->		The Date the vehicle was Purchased at Auction

>**Auction**			->		Auction provider at which the  vehicle was purchased

>**VehYear**			->		The manufacturer's year of the vehicle

>**VehicleAge**		->		The Years elapsed since the manufacturer's year

>**Make**			->		Vehicle Manufacturer 

>**Model**			->		Vehicle Model

>**Trim**			->		Vehicle Trim Level

>**SubModel**		->		Vehicle Submodel

>**Color**			->		Vehicle Color

>**Transmission**		->		Vehicle's transmission type (Automatic, Manual)

>**WheelTypeID**			->	The type id of the vehicle wheel

>**WheelType**			->	The vehicle wheel type description (Alloy, Covers)

>**VehOdo**				->	The vehicle's odometer reading

>**Nationality**			->	The Manufacturer's country

>**Size**				->	The size category of the vehicle (Compact, SUV, etc.)

>**TopThreeAmericanName**	->		Identifies if the manufacturer is one of the top three American manufacturers

>**MMRAcquisitionAuctionAveragePrice**	->  Acquisition price for this vehicle in average condition at time of purchase	

>**MMRAcquisitionAuctionCleanPrice**		->  Acquisition price for this vehicle in the above Average condition at time of purchase

>**MMRAcquisitionRetailAveragePrice**	->  Acquisition price for this vehicle in the retail market in average condition at time of purchase

>**MMRAcquisitonRetailCleanPrice**		->  Acquisition price for this vehicle in the retail market in above average condition at time of purchase

>**MMRCurrentAuctionAveragePrice**	->	Acquisition price for this vehicle in average condition as of current day	

>**MMRCurrentAuctionCleanPrice**		->  Acquisition price for this vehicle in above average condition as of current day

>**MMRCurrentRetailAveragePrice**	->  Acquisition price for this vehicle in the retail market in average condition as of current day

>**MMRCurrentRetailCleanPrice**	->	Acquisition price for this vehicle in the retail market in above average condition as of current day

>**PRIMEUNIT**		->		Identifies if the vehicle would have a higher demand than a standard purchase

>**AcquisitionType**		->		Identifies how the vehicle was acquired (Auction buy, trade in, etc)

>**AUCGUART**			->	The level of guarantee provided by the auction for the vehicle (Green light - Guaranteed/arbitratable, Yellow light - caution/issue, Red light - sold as is)

>**KickDate**			->	Date the vehicle was kicked back to the auction

>**BYRNO**				->	Unique number assigned to the buyer that purchased the vehicle

>**VNZIP1**                 ->                  Zipcode where the car was purchased

>**VNST**                    ->                State where the car was purchased

>**VehBCost**		->		Acquisition cost paid for the vehicle at time of purchase

>**IsOnlineSale**		->		Identifies if the vehicle was originally purchased online

>**WarrantyCost**          ->                  Warranty price (term = 36 months and mileage = 36K)

In [None]:
# Column names, dtypes and non-null counts of the training data.
train.info()

>**PurchDate** (Date of Purchase of car from auction) won't be any help in predicting if the car was a good or bad buy.

>Similarly we don't have any use of **RefId** Column.

>Also, **BYRNO** i.e. Unique number assigned to each buyer in auction is not useful as the buyer won't affect the quality of car.

In [None]:
# Remove the identifier / bookkeeping columns discussed above.
# Rebinding `train` (instead of inplace=True) together with errors="ignore"
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the columns were already removed by a previous run.
train = train.drop(columns=["RefId", "PurchDate", "BYRNO"], errors="ignore")

# Visualizing Correlations

Correlation is a very useful technique to get rid of unnecessary columns from our dataset that don't affect our target.

In [None]:
# Correlate numeric columns only: with string columns present,
# DataFrame.corr() raises in pandas >= 2.0 instead of silently skipping them.
# Computing the matrix once also avoids doing the work twice.
corr = train.select_dtypes(include="number").corr()

# Boolean mask for the upper triangle (diagonal included) so each pair is
# drawn once. Building it from ones rather than from the correlation values
# means an exactly-zero correlation is still hidden in the upper triangle.
matrix = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(18, 18))
sns.heatmap(corr, annot=True, linewidths=.5, fmt= '.1f', mask=matrix)
plt.show()

**From above figure we can see that `VNZIP1` and `WheelTypeID` have no correlation with our target `IsBadBuy`. So we can safely drop them.**

In [None]:
# Drop the two columns judged uncorrelated with IsBadBuy in the heatmap.
# Rebinding plus errors="ignore" makes the cell safe to re-run (no KeyError
# when the columns are already gone).
train = train.drop(columns=["VNZIP1", "WheelTypeID"], errors="ignore")

# Missing Values (%ages)

Let's check for the % of missing values in each column.

In [None]:
# Percentage of missing values per column.
n_rows = len(train)
missing_counts = train.isna().sum()
missing_counts * 100 / n_rows

**From above it is clear that `PRIMEUNIT` and `AUCGUART` columns have more than `95%` of values missing. So it is best to drop them as it is.**

**Also since we cannot guess the `Trim` and `WheelType` of cars just randomly, we are also dropping the same.**

**`SubModel` doesn't provide much valuable information and `Color` almost certainly doesn't affect the car quality. Also, we don't need `TopThreeAmericanName` as we already have the `Make` and `Nationality` columns.**

In [None]:
# Columns removed here: mostly-missing ones, ones that cannot be imputed
# sensibly, and ones considered redundant or uninformative (see text above).
columns_to_drop = [
    "PRIMEUNIT",
    "AUCGUART",
    "Trim",
    "WheelType",
    "SubModel",
    "Color",
    "TopThreeAmericanName",
]
# Rebinding plus errors="ignore" makes the cell safe to re-run (no KeyError
# when the columns are already gone).
train = train.drop(columns=columns_to_drop, errors="ignore")

**Lets check the percent of missing values once again.**

In [None]:
# Percentage of missing values per remaining column.
n_rows = len(train)
missing_counts = train.isna().sum()
missing_counts * 100 / n_rows

In [None]:
# (rows, columns) before removing rows with missing values.
train.shape

**Since we have a fairly small fraction of missing values, we can safely drop the rows with missing data.**

In [None]:
# Keep only the rows that have a value in every remaining column.
train = train.dropna(axis=0, how="any")

In [None]:
# (rows, columns) after removing rows with missing values.
train.shape

In [None]:
# Count of missing values per column after the row drop.
train.isna().sum()

In [None]:
# Split the frame by dtype: object-typed columns vs. everything else.
numerical_columns = train.select_dtypes(exclude=["object"])
categorical_columns = train.select_dtypes(include=["object"])

In [None]:
# Report how many columns ended up in each dtype group.
n_categorical = len(categorical_columns.columns)
n_numerical = len(numerical_columns.columns)
print(f"Number of Categorical Columns: {n_categorical}")
print(f"Number of Numerical Columns: {n_numerical}")

**We have now 15 numerical columns and 7 categorical columns in our data.**

# VehYear vs IsBadBuy

In [None]:
# Listings per manufacturing year, split by the IsBadBuy flag.
# A title lets the figure stand alone; the trailing semicolon suppresses the
# Axes/Text repr that would otherwise be echoed as cell output.
ax = sns.countplot(data=train, x="VehYear", hue="IsBadBuy")
ax.set_title("Listings per VehYear, split by IsBadBuy");

Nothing seems irrational. As the number of listings increased, so did the number of bad buys.

# VehicleAge vs IsBadBuy

In [None]:
# Listings per vehicle age, split by the IsBadBuy flag.
# A title lets the figure stand alone; the trailing semicolon suppresses the
# Axes/Text repr that would otherwise be echoed as cell output.
ax = sns.countplot(data=train, x="VehicleAge", hue="IsBadBuy")
ax.set_title("Listings per VehicleAge, split by IsBadBuy");

# Auction vs IsBadBuy

In [None]:
# Interactive count of listings per auction provider, coloured by IsBadBuy.
px.histogram(data_frame=train, x="Auction", color="IsBadBuy")

# Make vs IsBadBuy

In [None]:
# Interactive count of listings per manufacturer, coloured by IsBadBuy.
px.histogram(data_frame=train, x="Make", color="IsBadBuy")

**Dodge, Ford, Chevrolet and Chrysler** account for the most listings as well as the most bad buys. That makes sense: a make with more listings will naturally have more bad buys in absolute terms.

In [None]:
# Interactive count of listings per manufacturer nationality, coloured by IsBadBuy.
px.histogram(data_frame=train, x="Nationality", color="IsBadBuy")

**Probability of a car being a bad buy given its Nationality.**

In [None]:
def _bad_buy_pct(frame, nationality):
    """Return the percentage of `nationality` rows in `frame` with IsBadBuy == 1.

    Both comparisons are evaluated before they are combined. The previous
    expression `A & train["IsBadBuy"] == 1` was parsed as
    `(A & train["IsBadBuy"]) == 1` because `&` binds tighter than `==`; it
    produced the right numbers only because the flag is coded as 0/1.
    """
    is_nationality = frame["Nationality"] == nationality
    is_bad_buy = frame["IsBadBuy"] == 1
    n_bad = int((is_nationality & is_bad_buy).sum())
    n_total = int(is_nationality.sum())
    return n_bad * 100 / n_total


# One percentage per nationality group; the names are used by the print cell below.
other_asian = _bad_buy_pct(train, "OTHER ASIAN")
american = _bad_buy_pct(train, "AMERICAN")
top_line_asian = _bad_buy_pct(train, "TOP LINE ASIAN")
other = _bad_buy_pct(train, "OTHER")

In [None]:
# Print each nationality's bad-buy share, rounded to a whole percent.
pct_by_label = (
    ("OTHER ASIAN", other_asian),
    ("AMERICAN", american),
    ("TOP LINE ASIAN", top_line_asian),
    ("OTHER", other),
)
for label, pct in pct_by_label:
    print(f"{label}: {round(pct)}%")

# Categorical Data

In [None]:
# Display the object-typed columns selected earlier.
# NOTE(review): this frame was captured before the Transmission/Nationality clean-up cells further down run.
categorical_columns

### Auction

In [None]:
# Number of unique categories for Auction
categorical_columns["Auction"].nunique()

In [None]:
# Distinct Auction values
categorical_columns["Auction"].unique()

### Make

In [None]:
# Number of unique categories for Make
categorical_columns["Make"].nunique()

In [None]:
# Distinct Make values
categorical_columns["Make"].unique()

### Model

In [None]:
# Number of unique categories for Model
categorical_columns["Model"].nunique()

### Transmission

In [None]:
# Number of unique categories for Transmission
categorical_columns["Transmission"].nunique()

In [None]:
# Distinct Transmission values
categorical_columns["Transmission"].unique()

**It seems that Transmission column has naming error. Let's replace 'Manual' with 'MANUAL'.**

In [None]:
# Normalise the inconsistent 'Manual' spelling to 'MANUAL'.
# Assign the result back to the column: `train.Transmission.replace(..., inplace=True)`
# mutates an intermediate Series, which under pandas Copy-on-Write leaves
# `train` unchanged (and the warning about it is silenced in this notebook).
train["Transmission"] = train["Transmission"].replace(to_replace="Manual", value="MANUAL")

In [None]:
# Distinct Transmission values after the clean-up
train["Transmission"].unique()

**Now it is rectified.**

### Nationality

In [None]:
# Number of unique categories for Nationality
categorical_columns["Nationality"].nunique()

In [None]:
# Distinct Nationality values
categorical_columns["Nationality"].unique()

**Let's merge OTHER ASIAN and TOP LINE ASIAN into a single ASIAN category. This will make more sense.**

In [None]:
# Collapse the two Asian groups into a single ASIAN category.
# Assign the result back to the column: `train.Nationality.replace(..., inplace=True)`
# mutates an intermediate Series, which under pandas Copy-on-Write leaves
# `train` unchanged (and the warning about it is silenced in this notebook).
train["Nationality"] = train["Nationality"].replace(
    to_replace=["OTHER ASIAN", "TOP LINE ASIAN"],
    value="ASIAN",
)

In [None]:
# Distinct Nationality values after the merge
train["Nationality"].unique()

**Now it seems more rational.**

### Size

In [None]:
# Number of unique categories for Size
categorical_columns["Size"].nunique()

In [None]:
# Distinct Size values
categorical_columns["Size"].unique()

### VNST

In [None]:
# Number of unique categories for VNST
categorical_columns["VNST"].nunique()

In [None]:
# Distinct VNST values
categorical_columns["VNST"].unique()

## Features and Target

In [None]:
# Target is the IsBadBuy flag; features are every other remaining column.
target_name = "IsBadBuy"
y = train[target_name]
X = train.drop(columns=[target_name])

In [None]:
# Display the feature frame.
X

In [None]:
# Display the target series.
y

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

## make_pipeline

Construct a Pipeline from the given estimators. Check [make_pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) for details.

## make_column_transformer

It allows for applying different transformations to different columns in a pipeline. For example, we need different preprocessing steps for numerical and categorical columns. This allows us to apply different steps to specified columns in a pipeline.

Check [make_column_transformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_transformer.html) for more details. Also check [Column Transformers Examples](https://machinelearningmastery.com/columntransformer-for-numerical-and-categorical-data/) for example implementation.

## StandardScaler

Standardize features by removing the mean and scaling to unit variance

The standard score of a sample x is calculated as:

z = (x - u) / s

where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation of the training samples or one if with_std=False.

Centering and scaling happen independently on each feature by computing the relevant statistics on the samples in the training set. Mean and standard deviation are then stored to be used on later data using transform.

Standardization of a dataset is a common requirement for many machine learning estimators: they might behave badly if the individual features do not more or less look like standard normally distributed data (e.g. Gaussian with 0 mean and unit variance).

Check [StandardScaler](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) for more details.

## OneHot Encoder

Encode categorical features as a one-hot numeric array.

The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the `sparse_output` parameter, named `sparse` before scikit-learn 1.2).

By default, the encoder derives the categories based on the unique values in each feature. Alternatively, you can also specify the categories manually.

This encoding is needed for feeding categorical data to many scikit-learn estimators, notably linear models and SVMs with the standard kernels.

Check [OneHot Encoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html) for more details.

In [None]:
# Names of the categorical feature columns (object or category dtype).
# select_dtypes is the documented way to filter by dtype. The previous
# `X.dtypes.isin(['object', 'category'])` compared dtype objects against plain
# strings, which is not guaranteed to match and could leave this list empty,
# so the categorical columns would be silently dropped by the column transformer.
cat_cols_list = X.select_dtypes(include=['object', 'category']).columns.to_list()
cat_cols_list

In [None]:
# Names of the numeric feature columns: everything that is not categorical.
# 'category' is excluded as well as 'object' so this list is the exact
# complement of the categorical list and no column can land in both.
num_cols_list = list(X.select_dtypes(exclude=['object', 'category']).columns)
num_cols_list

**Split the data in training and validation sets.**

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the
# IsBadBuy ratio the same in both splits; random_state pins the shuffle.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

### Preprocessing

We are going to use column transformer to One-hot encode the categorical variables and scale the numerical variables.

In [None]:
# Numeric columns are standardised; categorical columns are one-hot encoded,
# with categories unseen at fit time ignored instead of raising.
numeric_step = (StandardScaler(), num_cols_list)
categorical_step = (
    OneHotEncoder(categories='auto', handle_unknown='ignore'),
    cat_cols_list,
)
preprocess = make_column_transformer(numeric_step, categorical_step)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Preprocessing followed by a random forest.
# random_state is fixed (same seed as the train/validation split) so the
# reported scores are reproducible across Restart & Run All.
pipeline1 = make_pipeline(preprocess, RandomForestClassifier(random_state=42))
pipeline1.fit(x_train, y_train)

In [None]:
# Score the random-forest pipeline on the data it was fitted on and on the held-out split.
# NOTE(review): `score` presumably reports plain accuracy here — confirm it is an
# appropriate metric if IsBadBuy is imbalanced.
print(f"Random Forest Train Set: {pipeline1.score(x_train, y_train)}")
print(f"Random Forest Validation Set: {pipeline1.score(x_val, y_val)}")

In [None]:
# Same preprocessing, followed by a logistic regression using the liblinear solver.
logreg = LogisticRegression(solver='liblinear')
pipeline2 = make_pipeline(preprocess, logreg)
pipeline2.fit(x_train, y_train)

In [None]:
# Score the logistic-regression pipeline on the training and validation splits.
# The second label previously read "Losgistic"; the typo in the printed text is fixed.
print(f"Logistic Regression Train Set: {pipeline2.score(x_train, y_train)}")
print(f"Logistic Regression Val Set: {pipeline2.score(x_val, y_val)}")