In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import sys

In [None]:
df=pd.read_csv("../input/bluebook-for-bulldozers/TrainAndValid.csv",
               low_memory=False,
               parse_dates=["saledate"])

In [None]:
df.shape 

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
missing_values=df.isna().sum()/len(df)*100
missing_values=missing_values.sort_values(ascending=False)

In [None]:
missing_values

In [None]:
fig, ax=plt.subplots(figsize=(10,10))

# Only the first 1000 rows are drawn to keep the figure light.
# The scatter carries a label so that ax.legend() has an entry to show; with no
# labelled artist matplotlib warns and draws an empty legend.
ax.scatter(df["saledate"][:1000],
           df["SalePrice"][:1000],
           c="grey",
           label="first 1000 sales")
ax.set_xlabel("Sale date")
ax.set_ylabel("Sale price")
ax.set_title("evolution of prices with time")
ax.legend();

In [None]:
df.SalePrice[df["SalePrice"]!="Test_data"].plot.hist()

In [None]:
x=pd.to_datetime(df["saledate"])

In [None]:
x.head()

In [None]:
type(x)

In [None]:
df["year_sale"] = pd.DatetimeIndex(df["saledate"]).year

In [None]:
df.year_sale.plot.hist()

In [None]:
fig, ax=plt.subplots(figsize=(8,8))
ax.scatter(df["saledate"][:1000], df["SalePrice"][:1000])

In [None]:
x = pd.DatetimeIndex(df["saledate"]).year
z=df.saledate.dt.year

In [None]:
plt.hist(z);

In [None]:
df_tmp=df.copy()

df_tmp["saleyear"]=df_tmp.saledate.dt.year
df_tmp["salemonth"]=df_tmp.saledate.dt.month
df_tmp["saleday"]=df_tmp.saledate.dt.day
df_tmp["saledayofyear"]=df_tmp.saledate.dt.dayofyear
df_tmp["saledayofweek"]=df_tmp.saledate.dt.dayofweek

In [None]:
df_tmp.head().T

In [None]:
df_tmp.drop(["saledate"], axis=1, inplace=True)
df_tmp.head().T

In [None]:
df_tmp.state.value_counts().plot(kind="bar", figsize=(12,12))

In [None]:
sale_state=pd.DataFrame(df_tmp.state.value_counts())

In [None]:
sale_state["cumulatif_quantity"]=(sale_state["state"].cumsum()/sale_state["state"].sum())*100

In [None]:
sale_state
# more than 40% of sales occured in only 4 states : Florida, Texas, California and Washington

In [None]:
df_tmp.YearMade.value_counts().sort_values(ascending=False)
# many values are inserted incorrectly: we can't find engins made in the year 1000 ==> 39391 observations.

In [None]:
df_tmp.YearMade.plot.hist()

In [None]:
df_tmp[df.YearMade>=1950].YearMade.plot.hist()
# we want to see the distrbution of the made year but with real values, 
#so we eliminate the data that could be considered as an error
# a lot of engin sold were made in between 1990 and 2005/2008.

In [None]:
%matplotlib inline
fig,ax=plt.subplots(figsize=(6,6))
ax.scatter(df_tmp[df.YearMade>=1950].YearMade[:1000],df_tmp[df.YearMade>=1950].SalePrice[:1000],
       color='salmon')
#it confirms the findings below: the recent is the year the engin is made the more expensive it is.

In [None]:
df_tmp.UsageBand.value_counts()

In [None]:
pd.crosstab(df_tmp.state,df_tmp.UsageBand)

In [None]:
df_tmp.columns

In [None]:
df_tmp.ProductSize.value_counts().plot(kind="bar", figsize=(8,8))

In [None]:
df_tmp.ProductGroup.value_counts().plot(kind="bar", figsize=(8,8))

In [None]:
df_tmp.ProductGroupDesc.value_counts()

In [None]:
pd.crosstab(df_tmp.ProductGroup, df_tmp.ProductGroupDesc)
##these two variables mean the same thing so the correlation between them is equal to 1==>thus, one of them must be eliminated

In [None]:
df_tmp.drop(["ProductGroupDesc"], axis=1, inplace=True)

In [None]:
pd.crosstab(df_tmp.ProductGroup, df_tmp.ProductSize).plot(kind="bar")

In [None]:
df_tmp.Pad_Type.value_counts()

In [None]:
categorical_variables=['UsageBand','fiModelDesc', 'fiBaseModel', 'fiSecondaryDesc', 'fiModelSeries',
                       'fiModelDescriptor', 'ProductSize', 'fiProductClassDesc', 'state',
                       'ProductGroup', 'Drive_System', 'Enclosure',
                       'Forks', 'Pad_Type', 'Ride_Control', 'Stick', 'Transmission',
                       'Turbocharged', 'Blade_Extension', 'Blade_Width', 'Enclosure_Type',
                       'Engine_Horsepower', 'Hydraulics', 'Pushblock', 'Ripper', 'Scarifier',
                       'Tip_Control', 'Tire_Size', 'Coupler', 'Coupler_System',
                       'Grouser_Tracks', 'Hydraulics_Flow', 'Track_Type',
                       'Undercarriage_Pad_Width', 'Stick_Length', 'Thumb', 'Pattern_Changer',
                       'Grouser_Type', 'Backhoe_Mounting', 'Blade_Type', 'Travel_Controls',
                       'Differential_Type', 'Steering_Controls']

In [None]:
for col in categorical_variables:
    print(f"variable {col} components are {len(df_tmp[col].value_counts())} :")
    #print(df_tmp[col].value_counts())
    print(" ")

In [None]:
df_tmp.drop(["SalesID", "saledayofyear", "MachineID"], axis=1, inplace=True)

In [None]:
df_tmp.columns

In [None]:
sns.set_theme(style="white")

# Correlations are only defined for numeric columns. DataFrame.corr() stopped
# silently dropping non-numeric columns in pandas 2.0 and raises on the text
# columns still present in df_tmp at this point, so the numeric (and boolean)
# columns are selected explicitly.
corr=df_tmp.select_dtypes(include=["number", "bool"]).corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
df_tmp.saleyear.unique()

In [None]:
df_tmp.datasource.unique()

In [None]:
pd.crosstab(df_tmp.saleyear,df_tmp.datasource)
#even though exit a certain correlation between these two variables close to 50%, 
#we will not delete any of them because we see in the cross tab that starting from 2006, 
#there is some variability in the values which can give our model some information ==> 
#the final decision will be given when we mount our model

In [None]:
missing_values=(df_tmp.isna().sum()/len(df_tmp))*100
missing_values[missing_values!=0].sort_values(ascending=False)

In [None]:
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content):
        print(label)

In [None]:
for label, content in df_tmp.items():
    if pd.api.types.is_string_dtype(content):
        df_tmp[label]=content.astype("category")

In [None]:
df_tmp.isnull().sum()/len(df_tmp)*100

In [None]:
# df_tmp.to_csv("C:/..../bulldozers/data/blue_book_train.csv",index=False)

In [None]:
# df_tmp=pd.read_csv("C:/.../bulldozers/data/blue_book_train.csv",low_memory=False)

In [None]:
df_tmp.isnull().sum()

In [None]:
# Impute missing values column by column and record which columns were treated
# as numeric and which as text.
numerical_variables=[]
categorical_variables=[]
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content):
        numerical_variables.append(label)
        # Numeric gaps are replaced by the column median, computed over every row of df_tmp.
        if (pd.isnull(content).sum()!=0):
            df_tmp[label]=content.fillna(content.median())
    # NOTE(review): an earlier cell already converts text columns to "category", and
    # is_string_dtype appears to return False for categorical columns on recent pandas.
    # If so, this branch never runs and missing labels are left for the next cell,
    # where .codes encodes them as -1 — confirm on the pandas version in use.
    if pd.api.types.is_string_dtype(content):
        df_tmp[label]=df_tmp[label].astype("category")
        categorical_variables.append(label)
        # Missing labels are replaced by the most frequent value of the column.
        # `content` is the column as it was when the loop reached it (before the
        # astype above), so this assignment replaces the freshly converted column.
        if (pd.isnull(content).sum()!=0):
            df_tmp[label]=content.fillna(content.value_counts().idxmax())

In [None]:
for label, content in df_tmp.items():
    if not pd.api.types.is_numeric_dtype(content):
        df_tmp[label]=pd.Categorical(content).codes

In [None]:
df_tmp.head()

In [None]:
df_tmp.columns
df_tmp.drop(["year_sale"], axis=1, inplace=True)

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

model=RandomForestRegressor(n_jobs=-1,
                            random_state=42)

model.fit(df_tmp.drop(["SalePrice"], axis=1), df_tmp["SalePrice"])

In [None]:
model.score(df_tmp.drop(["SalePrice"], axis=1), df_tmp["SalePrice"])

In [None]:
train_set=df_tmp[df_tmp.saleyear<2012]
validation_set=df_tmp[df_tmp.saleyear>=2012]
train_set.shape, validation_set.shape

In [None]:
(len(train_set)+len(validation_set))==len(df_tmp)

In [None]:
#split the data:
X_train, y_train=train_set.drop(["SalePrice"], axis=1), train_set.SalePrice
X_valid, y_valid=validation_set.drop(["SalePrice"], axis=1), validation_set.SalePrice

In [None]:
X_train.shape

In [None]:
%%time
model.fit(X_train, y_train)

In [None]:
model.score(X_train, y_train)

In [None]:
model.score(X_valid, y_valid)

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, r2_score 

def show_scores(model):
    """Evaluate a fitted regressor on the notebook's training and validation splits.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    model
        Fitted estimator (or fitted search object) exposing ``predict``.

    Returns
    -------
    dict
        Mean absolute error, R2 and root mean squared log error for each split,
        under the keys "Training MAE", "Valid MAE", "Training R2", "valid R2",
        "Training rmsle" and "Valid rmsle".
    """
    # Predict once per split and reuse the result for all three metrics,
    # instead of calling model.predict again for every metric.
    train_preds = model.predict(X_train)
    valid_preds = model.predict(X_valid)

    scores={"Training MAE": mean_absolute_error(y_train, train_preds),
           "Valid MAE": mean_absolute_error(y_valid, valid_preds),
           "Training R2": r2_score(y_train, train_preds),
           "valid R2": r2_score(y_valid, valid_preds),
           "Training rmsle": np.sqrt(mean_squared_log_error(y_train, train_preds)),
           "Valid rmsle": np.sqrt(mean_squared_log_error(y_valid, valid_preds))}
    return scores

In [None]:
show_scores(model)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
rf_grid={"n_estimators": [90,100,110], 
         "max_depth":[None,1,2], 
          "min_samples_split":[2,4,None],
          "min_samples_leaf": [1,2,3],
          "max_features":["auto", "sqrt"],
           "max_samples":[10000]
        }
rs_model=RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                 random_state=42),
                           param_distributions=rf_grid,
                           n_iter=50,
                           cv=5,
                           verbose=True)

In [None]:
%%time
rs_model.fit(X_train, y_train)

In [None]:
show_scores(rs_model)
# the model gives lower rmsle values when a larger set is used, here to do tyhings quickly, we used only 10000 observations

In [None]:
rs_model.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators": [90,100,110], 
              "max_depth":[None,1,2], 
              "min_samples_split":[2,4,None],
              "min_samples_leaf": [1,2,3],
              "max_features":["auto", "sqrt"],
              "max_samples":[10000]
             }

# Create a based model
rf = RandomForestRegressor(n_jobs = -1, random_state=42)
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 5,verbose = 2)

In [None]:
%%time
grid_search.fit(X_train, y_train)

In [None]:
show_scores(grid_search)

In [None]:
show_scores(rs_model)

In [None]:
show_scores(model)
# consedering our computational power (it takes too much time), 
# we can say that the best model is the one with default parameters, 
# but if we had more computational power we can do better than the default.

In [None]:
print(model)
print(rs_model)
print(grid_search)

In [None]:
%%time
# test set estimation using the model above "rs_model"
# we first need to work on test set in order to preprocess the data then use our model to predict the sale price

df_test=pd.read_csv("../input/bluebook-for-bulldozers/Test.csv",
               low_memory=False,
               parse_dates=["saledate"])

df_test["saleyear"]=df_test.saledate.dt.year
df_test["salemonth"]=df_test.saledate.dt.month
df_test["saleday"]=df_test.saledate.dt.day
df_test["saledayofyear"]=df_test.saledate.dt.dayofyear
df_test["saledayofweek"]=df_test.saledate.dt.dayofweek
sales_id=df_test.SalesID.values
df_test.drop(["saledate","ProductGroupDesc","SalesID", "saledayofyear", "MachineID"], axis=1, inplace=True)

In [None]:
# Turn the text columns of the test frame into pandas categoricals.
# is_object_dtype is used instead of is_string_dtype: on pandas >= 2.0
# is_string_dtype inspects the values of object columns and returns False when
# they contain NaN, which would silently skip text columns with missing values.
for label, content in df_test.items():
    if pd.api.types.is_object_dtype(content):
        df_test[label]=content.astype("category")

In [None]:
# Impute missing values of the test frame column by column, mirroring the
# treatment applied to the training frame.
numerical_variables=[]
categorical_variables=[]
for label, content in df_test.items():
    if pd.api.types.is_numeric_dtype(content):
        numerical_variables.append(label)
        # Numeric gaps are replaced by the median of the TEST column itself.
        # NOTE(review): the training frame was filled with its own medians, so the
        # fill values can differ between the two frames — consider reusing the training ones.
        if (pd.isnull(content).sum()!=0):
            df_test[label]=content.fillna(content.median())
    # NOTE(review): the previous cell already converts text columns to "category", and
    # is_string_dtype appears to return False for categorical columns on recent pandas.
    # If so, this branch never runs — confirm on the pandas version in use.
    if pd.api.types.is_string_dtype(content):
        df_test[label]=df_test[label].astype("category")
        categorical_variables.append(label)
        # Missing labels are replaced by the most frequent value of the column.
        # `content` is the column as it was when the loop reached it (before the
        # astype above), so this assignment replaces the freshly converted column.
        if (pd.isnull(content).sum()!=0):
            df_test[label]=content.fillna(content.value_counts().idxmax())

In [None]:
# Encode the text columns of the test frame with the SAME label -> integer mapping
# that was used for training.
# Taking pd.Categorical(content).codes on the test frame alone numbers the labels
# by their rank among the values present in Test.csv, so a given label can receive
# a different integer than the one the model was fitted with. The raw training
# frame `df` still holds the original labels, and df_tmp was encoded from a copy
# of it, so its categories give the training vocabulary.
# Labels never seen in training, like missing values, are encoded as -1.
for label, content in df_test.items():
    if not pd.api.types.is_numeric_dtype(content):
        train_categories = df[label].astype("category").cat.categories
        # Go through plain objects so the test column's own categories cannot interfere.
        df_test[label]=pd.Categorical(content.astype(object), categories=train_categories).codes

In [None]:
# we reestimate our model using the best params
# best_model=RandomForestRegressor(n_estimators= 30,min_samples_split= 94,min_samples_leaf= 9,max_features='sqrt',max_depth= None,n_jobs=-1,random_state=42)
# best_model=RandomForestRegressor(n_jobs=-1, random_state=42)
# best_model.fit(X_train, y_train)
# show_scores(best_model)

y_test_predictions=model.predict(df_test)

In [None]:
data={"SalesID": sales_id,
     "Price_Predictions":y_test_predictions}
data_predictions=pd.DataFrame(data)

In [None]:
data_predictions