# This kernel uses two methods to solve the problem. Processing the data with fastai gives better results, but I recommend going through the full code to understand how the two methods differ and why that leads to such a drastic variation in score

## Importing all the necessary stuff

In [None]:
import numpy as np 
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import os

## This is the basic code for accessing the data

In [None]:
# List every file available under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

## Getting the Training and the Testing data

In [None]:
# Load the combined training/validation set and the test set.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning that comes with them) and matches how the same file is loaded
# again later in this notebook.
train = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/trainandvalid/TrainAndValid.csv',
    low_memory=False,
)
test = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/Test.csv',
    low_memory=False,
)

## Finding the type of data in those files

In [None]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

In [None]:
# Same summary for the test frame, to compare its columns with the training frame.
test.info()

In [None]:
# Show the first rows of the training frame.
train.head()

In [None]:
# Show the full list of column names in the training frame.
train.columns

In [None]:
# Descriptive statistics for every column; include='all' also covers non-numeric columns.
train.describe(include='all')

## Since this file has a lot of columns, lets check the number of NaN(s).....

In [None]:
# Number of missing values in each column of the training frame.
train.isna().sum()

## Looking at the type of values in each column. Other columns can be checked by replacing the column names

In [None]:
# Inspect the raw values of a single column (swap the column name to look at others).
train.fiProductClassDesc

## Normalizing the data

## This time we are using the log of the Sale Price

In [None]:
# Replace the sale price in `train` with its natural logarithm.
log_price = np.log(train['SalePrice'])
train['SalePrice'] = log_price

## Looking at the description above and considering the number of NaNs in each column, we would be better to consider only the really important columns

In [None]:
# Columns used as model inputs in the scikit-learn experiments.
features_to_consider = [
    'YearMade',
    'datasource',
    'state',
    'fiBaseModel',
    'fiProductClassDesc',
    'fiModelDesc',
]

### This function takes the training and validation data, fits the model on the training data, and returns the root mean squared error (RMSE) on the validation data. This tells us how far, on average, the predictions are from the actual values.

In [None]:
def model_score(model, X_trn, y_trn, X_val, y_val):
    """Fit ``model`` on the training split and return its validation RMSE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_trn: Training features.
        y_trn: Training targets.
        X_val: Validation features.
        y_val: Validation targets.

    Returns:
        The square root of the mean squared error between ``y_val`` and the
        model's predictions for ``X_val``.

    Note:
        ``model`` is fitted as a side effect of this call.
    """
    model.fit(X_trn, y_trn)
    pred = model.predict(X_val)
    # scikit-learn metrics take their arguments as (y_true, y_pred).
    return np.sqrt(mse(y_val, pred))

## Data preparation and preprocessing

In [None]:
# Take an explicit copy of the selected columns. The label-encoding step
# below assigns new values into X; without the copy pandas still tracks X as
# derived from `train` and raises SettingWithCopyWarning on those assignments.
X = train[features_to_consider].copy()
# Target column.
y = train.SalePrice

## We will be using the Label Encoder because each of these columns has a large number of unique values. If we were to use One-Hot Encoding, the number of columns would increase drastically

In [None]:
# Label-encode each text column of X into integer codes.
# A separate encoder is fitted per column and stored in `label_encoders`, so
# the mapping learned for one column is not overwritten by the next fit and
# can be reused later (e.g. inverse_transform, or encoding another frame).
label_encoders = {}
for column in ['state', 'fiBaseModel', 'fiProductClassDesc', 'fiModelDesc']:
    LabelEnc = LabelEncoder()
    X[column] = LabelEnc.fit_transform(X[column])
    label_encoders[column] = LabelEnc

In [None]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## The first model we use is the Linear Regression

In [None]:
# Baseline: linear regression, fitted on the training split and scored on the held-out split.
model = LinearRegression()
model_score(
    model,
    X_trn=X_train,
    y_trn=y_train,
    X_val=X_test,
    y_val=y_test,
)

## This is the Random Forest Regressor

In [None]:
# Random forest with 110 trees; n_jobs=-1 uses all available cores.
model = RandomForestRegressor(
    n_estimators=110,
    max_depth=30,
    min_samples_split=20,
    n_jobs=-1,
)
model_score(
    model,
    X_trn=X_train,
    y_trn=y_train,
    X_val=X_test,
    y_val=y_test,
)

## Let's use Multi Layered Perceptrons too

In [None]:
# Multi-layer perceptron with a single hidden layer of 100 units.
# hidden_layer_sizes is a tuple with one entry per hidden layer; note the
# trailing comma — `(100)` is just the integer 100, not a one-element tuple.
model = MLPRegressor(
    hidden_layer_sizes=(100,),
    activation="relu",
    solver="adam",
    alpha=0.0001,
    verbose=True,
)
model_score(model, X_train, y_train, X_test, y_test)

## Looking at the RMSE values above, we can safely assume that our data is widely spread out

# While looking at different methods for solving this type of problem, I came across a Deep Learning library called fastai.

## Let's try it out

## This is the basic installation to be able to use fastai.structured

In [None]:
!pip install git+https://github.com/fastai/fastai@2e1ccb58121dc648751e2109fc0fbf6925aa8887

## Importing(again!!!)

In [None]:
import os

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from pandas_summary import DataFrameSummary
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from time import time


## These are the fastai imports
from fastai.imports import *
from fastai.structured import *

## Loading the data

In [None]:
# Reload the training/validation file for the fastai workflow, this time
# parsing `saledate` as a datetime column and inferring dtypes from the whole file.
data = pd.read_csv(
    '/kaggle/input/bluebook-for-bulldozers/trainandvalid/TrainAndValid.csv',
    parse_dates=["saledate"],
    low_memory=False,
)
data.head()

### ‘train_cats’ method is used for turning ‘string’ type columns into ‘category’ type columns

In [None]:
# train_cats is not defined in this notebook — presumably it comes from the
# fastai star imports. Its return value is not used, so it is expected to
# modify `data` in place — TODO confirm against the pinned fastai version.
train_cats(data)
# Display the first rows after the conversion.
data.head()

### We will use the add_datepart helper function to add columns derived from the date in the saledate column

In [None]:
# Expand the 'saledate' column into date-derived columns. The return value is
# not used, so the helper is expected to modify `data` in place — TODO confirm.
add_datepart(data, 'saledate')

### Let's take a look at the data now

In [None]:
# Display the first rows to inspect the columns produced by add_datepart.
data.head()

### We can see that the saledate column is now gone, and instead we get different columns with different details of the date like day, day of the week, day of the year, etc., all of which are good determining factors for the Sale Price of the product

### Now for some of the other useful columns in this dataset, we can see that UsageBand can influence the Sale Price
### We can map it into numbers manually, but we will use the astype() function here

In [None]:
# Encode UsageBand as integers that follow its natural order
# (Low < Medium < High). A plain astype('category') orders the categories
# alphabetically, which would yield High=0, Low=1, Medium=2.
# NOTE(review): assumes the only labels are 'Low', 'Medium' and 'High' — any
# other label would be treated as missing; confirm against the data.
usage_band_dtype = pd.api.types.CategoricalDtype(
    ['Low', 'Medium', 'High'], ordered=True
)
data.UsageBand = data.UsageBand.astype(usage_band_dtype)
# Replace the labels with their integer codes; missing values get the code -1.
data.UsageBand = data.UsageBand.cat.codes

### Usage band has been converted into int instead of string 

In [None]:
# Display the first rows to check the integer-coded UsageBand column.
data.head()

### Lets normalize the data
### We will be using log for that

In [None]:
# Replace the sale price in `data` with its natural logarithm, then preview it.
log_sale_price = np.log(data['SalePrice'])
data['SalePrice'] = log_sale_price
data['SalePrice'].head()

### This looks much simpler now

### Let's take a look at the empty values in our table; for that we will be using the proc_df function
1. For continuous variables, it checks whether a column has missing values or not
2. If the column has missing values, it creates another column called columnname_na, which has 1 for missing and 0 for not missing
3. Simultaneously, the missing values are replaced with the median of the column
4. For categorical variables, pandas replaces missing values with -1. So proc_df adds 1 to all the values for categorical variables. Thus, we have 0 for missing while all other values are incremented by 1

In [None]:
# Split `data` into model inputs and target, using 'SalePrice' as the target column.
# proc_df is not defined in this notebook (presumably a fastai helper); it
# returns three values here: the processed feature frame, the target, and
# `nas` — presumably bookkeeping about filled missing values; TODO confirm.
finalData, Y, nas = proc_df(data, 'SalePrice')

In [None]:
# Display the first rows of the processed feature frame.
finalData.head()

### Looks like we got rid of all the empty values in the table

In [None]:
# Print the target values, then show how many there are.
print(Y)
len(Y)

### We will use Random Forest Regressor

In [None]:
# Random forest with default hyperparameters; n_jobs=-1 uses all available cores.
model = RandomForestRegressor(n_jobs=-1)

### Training the model and getting the score

In [None]:
# Fit on the full processed data set.
model.fit(finalData, Y)
# NOTE: this scores the model on the same rows it was fitted on, so the value
# is a training-set score and not an estimate of performance on unseen data.
model.score(finalData, Y)

### This looks comparatively better, but let's try splitting the data 

### Splitting the data into training and testing

In [None]:
# Hold out 33% of the processed rows for testing; the fixed random_state keeps the split repeatable.
holdout_split = train_test_split(finalData, Y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = holdout_split

In [None]:
# Refit on the training split only, then score on the held-out split.
model.fit(X_train, y_train)
model.score(X_test, y_test)

### Finally we have a value that we can agree with; this is a good score compared to the previous efforts

In [None]:
# Report the held-out score as a percentage.
holdout_score = model.score(X_test, y_test)
print(holdout_score * 100)