In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the labelled training data and the separate test set
X_full = pd.read_csv('data.csv')
X_test_full = pd.read_excel('testdata_10%.xlsx')

# Discard rows whose target (A16) is missing, then pull the target column out
# of the predictors: pop() returns the column and removes it from X_full
X_full.dropna(subset=['A16'], inplace=True)
y = X_full.pop('A16')

# To keep things simple, keep every non-object column as a predictor
# (the same dtype filter is applied to the training and test frames)
X, X_test = (frame.select_dtypes(exclude=['object']) for frame in (X_full, X_test_full))

# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

In [2]:
# Preview the first five rows of the training predictors
X_train.head(n=5)

Unnamed: 0,A5,A7,A8,A10,A11,A12,A13
142,2.0,0,True,4.165,True,2,True
529,12.33,458,True,3.5,True,6,False
549,0.83,3290,True,1.335,True,8,True
436,4.25,50,False,3.5,False,0,False
503,2.5,0,False,7.5,True,0,True


In [None]:
# Step 1: Preliminary investigation

In [3]:
# Dimensions of the training split as (num_rows, num_columns)
print(X_train.shape)

# Per-column tally of missing entries in the training split;
# only the columns that actually contain gaps are printed
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column.loc[lambda counts: counts > 0])

(441, 7)
Series([], dtype: int64)


In [None]:
# This function reports the mean absolute error (MAE) from a random forest model.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Predictors and target used to fit the model.
    X_valid, y_valid
        Predictors and target used to evaluate the fitted model.
    n_estimators : int, default 100
        Number of trees, forwarded to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed forwarded to ``RandomForestRegressor``; a fixed default keeps
        comparisons between preprocessing approaches on equal footing.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [None]:
# Step 2: Drop columns with missing values

In [None]:
# Names of the training columns that contain at least one missing value
missing_value_cols = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both the training and the validation predictors,
# so the two frames keep the same set of columns
reduced_X_train = X_train.drop(columns=missing_value_cols)
reduced_X_valid = X_valid.drop(columns=missing_value_cols)

In [None]:
# Report the validation MAE obtained after dropping the columns with missing values
print("MAE (Drop columns with missing values):")
print(
    score_dataset(
        X_train=reduced_X_train,
        X_valid=reduced_X_valid,
        y_train=y_train,
        y_valid=y_valid,
    )
)

In [None]:
# Step 3: Imputation