In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# First load of the combined training/validation CSV, using pandas' default
# parsing options.
df = pd.read_csv(
    filepath_or_buffer="../input/bluebook-for-bulldozers/TrainAndValid.csv"
)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the freshly
# loaded frame.
df.info()

In [None]:
# Scatter sale price against sale date for the first 1000 rows only.
fig, ax = plt.subplots()
ax.scatter(df.saledate.iloc[:1000], df.SalePrice.iloc[:1000])

In [None]:
# Histogram of the target column.
df["SalePrice"].plot(kind="hist")

In [None]:
# Reload the CSV, this time asking pandas to parse `saledate` as datetimes.
# low_memory=False makes pandas read each column in one pass instead of
# inferring dtypes chunk by chunk.
df = pd.read_csv(
    "../input/bluebook-for-bulldozers/TrainAndValid.csv",
    parse_dates=["saledate"],
    low_memory=False,
)

In [None]:
# Re-inspect the dtypes after the reload with `parse_dates=["saledate"]`.
df.info()

In [None]:
# Same scatter as before (first 1000 rows), now against the parsed sale dates.
fig, ax = plt.subplots()
ax.scatter(df.saledate.iloc[:1000], df.SalePrice.iloc[:1000])

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Transposed preview: one line per column, so no column is cut off.
df.head().transpose()

In [None]:
# First 20 sale dates in the file's current row order.
df["saledate"].head(n=20)

In [None]:
# Put the rows in chronological order. The sorted frame is reassigned rather
# than sorted with `inplace=True`, so the step reads as an explicit
# transformation and can be chained.
df = df.sort_values(by="saledate", ascending=True)
df.saledate.head(20)

In [None]:
# Work on an independent (deep) copy so the sorted original frame stays intact.
df_tmp = df.copy(deep=True)

In [None]:
# Break the sale timestamp into separate calendar-part columns.
sale_dt = df_tmp["saledate"].dt
date_parts = {"saleYear": "year",
              "saleMonth": "month",
              "saleDay": "day",
              "saleDayofweek": "dayofweek",
              "saleDayofyear": "dayofyear"}
for new_column, dt_attribute in date_parts.items():
    df_tmp[new_column] = getattr(sale_dt, dt_attribute)

# The raw timestamp column is dropped once its parts have been extracted.
df_tmp.drop(columns="saledate", inplace=True)

In [None]:
# Transposed preview after adding the date-part columns.
df_tmp.head().transpose()

In [None]:
# Frequency of each distinct value in the Hydraulics column.
df_tmp["Hydraulics"].value_counts()

In [None]:
# Print dtypes and non-null counts of the working copy.
df_tmp.info()

In [None]:
# Number of missing values in each column.
df_tmp.isnull().sum()

In [None]:
# Transposed preview of the first five rows.
df_tmp.head().transpose()

In [None]:
# Check whether UsageBand is a text column. `is_string_dtype` on its own is
# not reliable: since pandas 2.0 it returns False for an object-dtype column
# that contains any non-string element such as NaN. Checking for object dtype
# first gives the same answer across pandas versions.
usage_band = df_tmp["UsageBand"]
pd.api.types.is_object_dtype(usage_band) or pd.api.types.is_string_dtype(usage_band)

In [None]:
# List every text column. Object dtype is checked as well as string dtype
# because pandas >= 2.0 reports `is_string_dtype` as False for object columns
# that contain NaN, which would hide those columns from this listing.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        print(label)

In [None]:
# Small aside showing how `.items()` yields (key, value) pairs, the same
# pattern used when looping over DataFrame columns.
random_dict = {
    "key1": "hello",
    "key2": "world!",
}

for k, v in random_dict.items():
    print(f"This is a key: {k}", f"This is a value: {v}", sep="\n")

In [None]:
# Convert every text column to an ordered categorical.
# Object dtype is checked as well as string dtype: on pandas >= 2.0,
# `is_string_dtype` is False for object columns containing NaN, so relying on
# it alone would silently skip every text column that has missing values.
for label, content in df_tmp.items():
    if pd.api.types.is_object_dtype(content) or pd.api.types.is_string_dtype(content):
        df_tmp[label] = content.astype("category").cat.as_ordered()

In [None]:
# Re-check the dtypes after the text-to-category conversion.
df_tmp.info()

In [None]:
# Category labels of the `state` column.
df_tmp["state"].cat.categories

In [None]:
# Integer code assigned to each row's `state` category.
df_tmp["state"].cat.codes

In [None]:
# Fraction of missing values per column (mean of the boolean null mask).
df_tmp.isna().mean()

In [None]:
# Write the intermediate frame to disk without the index column.
df_tmp.to_csv(path_or_buf="train_tmp.csv", index=False)

In [None]:
# Read the intermediate file back in and preview it transposed.
df_tmp = pd.read_csv(filepath_or_buffer="train_tmp.csv", low_memory=False)
df_tmp.head().transpose()

In [None]:
# Inspect the dtypes of the frame as re-read from train_tmp.csv.
df_tmp.info()

In [None]:
# Missing-value count for each column of the reloaded frame.
df_tmp.isnull().sum()

In [None]:
# Names of all numeric columns.
numeric_labels = [label for label, content in df_tmp.items()
                  if pd.api.types.is_numeric_dtype(content)]
for label in numeric_labels:
    print(label)

In [None]:
# Names of numeric columns that contain at least one missing value.
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content) and content.isnull().any():
        print(label)

In [None]:
# Fill missing numeric values with the column median and keep a boolean
# "<column>_is_missing" flag recording which rows were imputed.
#
# The median is taken from the training rows only (saleYear != 2012, the same
# rule the later train/test split uses), so no statistic from the held-out
# year leaks into the training features. Keep the year here in sync with the
# split if it changes.
train_rows = df_tmp["saleYear"] != 2012

# Collect the affected columns first so the frame is not modified while it is
# being iterated over.
numeric_with_gaps = [label for label, content in df_tmp.items()
                     if pd.api.types.is_numeric_dtype(content)
                     and content.isnull().any()]

for label in numeric_with_gaps:
    content = df_tmp[label]
    df_tmp[label+"_is_missing"] = content.isnull()
    df_tmp[label] = content.fillna(content[train_rows].median())

In [None]:
# Re-check after imputation: print any numeric column that still has nulls.
for label, content in df_tmp.items():
    if pd.api.types.is_numeric_dtype(content) and content.isnull().any():
        print(label)

In [None]:
# Names of the columns that are not numeric.
non_numeric_labels = [label for label, content in df_tmp.items()
                      if not pd.api.types.is_numeric_dtype(content)]
for label in non_numeric_labels:
    print(label)

In [None]:
# Encode each non-numeric column as integer category codes and add a boolean
# "<column>_is_missing" indicator.
non_numeric_labels = [label for label, content in df_tmp.items()
                      if not pd.api.types.is_numeric_dtype(content)]
for label in non_numeric_labels:
    column = df_tmp[label]
    df_tmp[label+"_is_missing"] = column.isnull()
    # pandas codes missing values as -1; the +1 shifts them to 0.
    df_tmp[label] = pd.Categorical(column).codes + 1

In [None]:
# Re-check the dtypes after the non-numeric columns were integer-encoded.
df_tmp.info()

In [None]:
# Missing-value count per column after imputation and encoding.
df_tmp.isnull().sum()

In [None]:
# Transposed preview of the fully numeric frame.
df_tmp.head().transpose()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate the baseline model.
# n_jobs=-1 uses all available cores; random_state is fixed so the fitted
# forest and its scores are the same on every run of the notebook.
model = RandomForestRegressor(n_jobs=-1, random_state=42)

In [None]:
%%time
# Fit on every row: all columns except the target as features.
model.fit(X=df_tmp.drop(columns="SalePrice"), y=df_tmp["SalePrice"])

In [None]:
# Score on the same rows the model was fitted on (not a held-out estimate).
model.score(X=df_tmp.drop(columns="SalePrice"), y=df_tmp["SalePrice"])

In [None]:
# Preview the first five rows of the preprocessed frame.
df_tmp.head()

In [None]:
# Number of sales recorded for each year.
df_tmp["saleYear"].value_counts()

In [None]:
# Hold out every 2012 sale as the test set; all other years form the
# training set.
sold_in_2012 = df_tmp["saleYear"] == 2012
df_train = df_tmp.loc[~sold_in_2012]
df_test = df_tmp.loc[sold_in_2012]
(len(df_train), len(df_test))

In [None]:
# Separate the features from the SalePrice target for both splits.
X_train = df_train.drop(columns="SalePrice")
y_train = df_train["SalePrice"]
X_test = df_test.drop(columns="SalePrice")
y_test = df_test["SalePrice"]

(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error

def rmsle(y_test, y_preds):
    """Return the root mean squared log error of `y_preds` against `y_test`."""
    msle = mean_squared_log_error(y_test, y_preds)
    return np.sqrt(msle)

# Create function to evaluate our model
def show_scores(model):
    """Evaluate a fitted regressor on the notebook's training and test splits.

    Reads the module-level `X_train`, `y_train`, `X_test` and `y_test`.

    Parameters
    ----------
    model : fitted estimator exposing `predict`.

    Returns
    -------
    dict
        MAE, RMSLE and R^2, each reported for the training and test split.
    """
    # Imported here so the function does not depend on an import in another cell.
    from sklearn.metrics import r2_score

    # Predict once per split and reuse the predictions for every metric.
    # `model.score(...)` would run `predict` again on the full split just to
    # compute the same R^2.
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)
    scores = {"Training MAE": mean_absolute_error(y_train, train_preds),
              "Test MAE": mean_absolute_error(y_test, test_preds),
              "Training RMSLE": rmsle(y_train, train_preds),
              "Test RMSLE": rmsle(y_test, test_preds),
              "Training R^2": r2_score(y_train, train_preds),
              "Test R^2": r2_score(y_test, test_preds)}
    return scores

In [None]:
# Number of rows in the training features.
X_train.shape[0]

In [None]:
# Faster model: each tree is trained on at most 10,000 sampled rows.
# random_state is fixed so the scores are reproducible between runs.
model = RandomForestRegressor(n_jobs=-1,
                              max_samples=10000,
                              random_state=42)

In [None]:
%%time
# Limiting how many samples each tree sees (max_samples) shortens training time.
model.fit(X=X_train, y=y_train)

In [None]:
# Train/test metrics for the sample-capped model.
show_scores(model=model)

In [None]:
from sklearn.model_selection import TimeSeriesSplit
# Toy illustration: print the train and validation values produced by each
# TimeSeriesSplit fold over the integers 1..20.
tscv = TimeSeriesSplit(n_splits=5)
X_temp = np.arange(1, 21)
for train_index, validation_index in tscv.split(X_temp):
    X_temp_train = X_temp[train_index]
    X_temp_valid = X_temp[validation_index]
    print(X_temp_train, X_temp_valid)

In [None]:
%%time
from sklearn.model_selection import RandomizedSearchCV
# Candidate RandomForestRegressor hyperparameters to sample from.
# max_features: the integer 1 means "one feature per split", while the float
# 1.0 means "all features". 1.0 replaces the former "auto" alias, which
# scikit-learn deprecated in 1.1 and removed in 1.3 (for regressors "auto"
# also meant all features).
rf_grid = {"n_estimators": np.arange(10, 100, 10),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2),
           "max_features": [0.5, 1, "sqrt", 1.0],
           "max_samples": [10000]}

# Time-ordered folds: each validation fold comes after its training rows.
# NOTE(review): this relies on X_train still being in chronological order.
tscv = TimeSeriesSplit(n_splits=5)

# random_state is fixed on both the estimator and the search so the sampled
# candidates and their scores are reproducible.
rs_model = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                              param_distributions=rf_grid,
                              n_iter=10,
                              cv=tscv,
                              verbose=True,
                              n_jobs=-1,
                              random_state=42)

rs_model.fit(X_train, y_train)

In [None]:
# Hyperparameter combination that scored best in the randomized search.
rs_model.best_params_

In [None]:
# Train/test metrics for the best estimator found by the search.
show_scores(model=rs_model)

In [None]:
%%time
# Final model with the chosen hyperparameters.
# NOTE(review): these values presumably come from an earlier, longer search
# run; confirm their origin.
# max_samples=None lets every tree draw from the full training set, and
# random_state is fixed so the reported scores and feature importances are
# reproducible.
ideal_model = RandomForestRegressor(n_estimators=100,
                                    min_samples_leaf=7,
                                    min_samples_split=4,
                                    max_features=0.5,
                                    n_jobs=-1,
                                    max_depth=None,
                                    max_samples=None,
                                    random_state=42)
ideal_model.fit(X_train, y_train)

In [None]:
# Train/test metrics for the final model.
show_scores(model=ideal_model)

In [None]:
# Per-feature importance scores of the fitted forest, in X_train column order.
ideal_model.feature_importances_

In [None]:
import seaborn as sns

# Helper function for plotting feature importance
def plot_features(columns, importances, n=20):
    """Draw a horizontal bar chart of the `n` largest feature importances.

    `columns` and `importances` are parallel sequences: feature names and
    their importance values.
    """
    ranked = pd.DataFrame({"features": columns,
                           "feature_importance": importances})
    ranked = ranked.sort_values("feature_importance", ascending=False)
    ranked = ranked.reset_index(drop=True)

    top_n = ranked[:n]
    sns.barplot(data=top_n,
                x="feature_importance",
                y="features",
                orient="h")

In [None]:
# Plot the top features of the final model (default n=20).
plot_features(columns=X_train.columns,
              importances=ideal_model.feature_importances_)

In [None]:
# Sanity check: total of all importances.
# NOTE(review): expected to be ~1.0 if the importances are normalised;
# confirm against the cell output.
sum(ideal_model.feature_importances_)

In [None]:
# Missing ProductSize entries in the original (un-encoded) frame.
df["ProductSize"].isnull().sum()

In [None]:
# Distribution of ProductSize values in the original frame.
df["ProductSize"].value_counts()

In [None]:
# Distribution of Turbocharged values in the original frame.
df["Turbocharged"].value_counts()

In [None]:
# Distribution of Thumb values in the original frame.
df["Thumb"].value_counts()