In [1]:
import pandas as pd

# Set this to the path of the CSV file to load
data_file_path = ""

# Read the file into a DataFrame
file_data = pd.read_csv(data_file_path)

# Summarize the columns of the file (numeric columns by default) with 8 statistics:
##  Count: shows how many rows have non-missing values
##  Mean: average of data
##  Std: standard deviation, how numerically spread out the values are
##  Min: lowest value
##  25%, 50%, 75% (percentiles)
##  Max: highest value
# Only the last expression of a cell is displayed automatically, so the
# intermediate results in this cell are printed explicitly.
print(file_data.describe())

# Obtain a list of all columns in the dataset
print(file_data.columns)

# dropna(axis=0) drops every row that contains at least one missing value
file_data = file_data.dropna(axis=0)

# Select the prediction target, where "Column" is the name of the target column.
# Bracket access is used because it also works for column names that contain
# spaces or that clash with DataFrame attribute names.
y = file_data["Column"]

# Choose features (columns that are later used to make predictions)
data_features = ['Column_1', "...", "Column_n"]
X = file_data[data_features]

# Review data used to make predictions
print(X.describe())
X.head()

In [None]:
# Building your model

# Use scikit-learn library to create models
# Steps to building and using a model:
## Define: what type of model is it? Other parameters of the model type are specified too
## Fit: Capture patterns from provided data
## Predict
## Evaluate: determine accuracy of the model's predictions

# Example of a decision tree model with scikit-learn, using X as the features and y as the target variable
from sklearn.tree import DecisionTreeRegressor

# Define model. Specify a number for random_state to ensure same results each run
data_model = DecisionTreeRegressor(random_state=1)

# Fit model on the features X and the target y
data_model.fit(X, y)

# Using the fitted model above, we can now make predictions.
# The predictions here are for the first rows of the same data the model was fitted on.
print("Making predictions for the following ---")
print(X.head())
print("The predictions are:")
print(data_model.predict(X.head()))

In [None]:
# Model Validation

# Once you have a model, calculate the mean absolute error

from sklearn.metrics import mean_absolute_error

# Predict on the same data the model was fitted on (an "in-sample" score)
predicted_value = data_model.predict(X)
# Print the score explicitly: a bare expression in the middle of a cell is
# evaluated but never displayed.
print(mean_absolute_error(y, predicted_value))

# Problem with 'in-sample' scores:
## The score above is computed on the same data the model was trained on,
## so it can look much better than the model's accuracy on new data.
## To fix this, we can exclude some data from the model-building
## process, and use it to test the model's accuracy on data it
## hasn't seen before. This data is called validation data.

# Use the train_test_split to break up the data into two pieces
from sklearn.model_selection import train_test_split

# Numerical value to random_state argument guarantees we get the same split
# every time we run the script

# Split features and target into training and validation pieces.
# The name must be val_X (capital X): it is the name used below for prediction.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define model
data_model = DecisionTreeRegressor(random_state=1)

# Fit model on the training piece only
data_model.fit(train_X, train_y)

# Get predicted values on validation data and report the mean absolute error
val_predictions = data_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))

In [None]:
# Underfitting and Overfitting

# Overfitting: where a model matches the training data almost perfectly, but does poorly
#              in validation and other new data.
# Underfitting: when a model fails to capture distinctions and patterns in the data, so it 
#               performs poorly even in training data.

from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a decision tree with the given leaf limit and return its validation MAE.

    Parameters:
        max_leaf_nodes: value forwarded to DecisionTreeRegressor(max_leaf_nodes=...).
        train_X, train_y: features and target the model is fitted on.
        val_X, val_y: features and target the fitted model is scored on.

    Returns:
        The mean absolute error between val_y and the model's predictions for val_X.
    """
    # max_leaf_nodes must be passed by keyword: the estimator's parameters are
    # keyword-only, and positionally it would not bind to max_leaf_nodes.
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

# Compare MAE with differing values of max_leaf_nodes
for max_leaf_nodes in [5, 50, 500, 5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    # Note: %d formats the MAE as an integer, so any fractional part is dropped
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

In [None]:
# Random Forests

# The random forest uses many trees, and it makes a prediction by averaging the predictions of 
# each component tree. It generally has much better predictive accuracy than a single decision tree 
# and it works well with default parameters.

# Build a random forest with scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define the forest with a fixed seed and fit it on the training piece
# (fit returns the estimator itself, so the two steps can be chained)
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Score the forest on the validation piece
melb_preds = forest_model.predict(val_X)
forest_mae = mean_absolute_error(val_y, melb_preds)
print(forest_mae)