### score_model.py  
When this script is run from the command line with `python score_model.py`, it loads the .pkl random forest file and applies the model to the locally saved scoring dataset CSV. The .py file must contain data check steps and clear comments for each step. Running the file produces a CSV file with the predicted score, as well as a PNG or text file that contains the model accuracy report (e.g. sklearn's classification report or any other method of model evaluation).

In [1]:
# The following libraries will be used to answer this question.
import pandas as pd
import pickle
import csv
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the pickled random forest model object from disk.
def read_pickle_model(file):
    """Load and return the object stored in a pickle file.

    Args:
        file: Path of the pickle file to read (opened in binary mode).

    Returns:
        The object produced by ``pickle.load``.

    Raises:
        OSError: If the file cannot be opened (for example, it is missing).
        Exception: Any error raised while unpickling is re-raised unchanged
            after the diagnostic message has been printed.
    """
    # NOTE(security): unpickling can execute arbitrary code embedded in the
    # file, so only load model files that come from a trusted source.
    try:
        # The context manager closes the handle even if unpickling fails.
        with open(file, 'rb') as handle:
            return pickle.load(handle)
    except Exception:
        # Report which file failed, then propagate the real error instead of
        # falling through to a return of a variable that was never assigned.
        print("Issue opening", file)
        raise

In [3]:
# Clean_data function from previous section will be used to clean the data.
# The following function cleans the dataset.
def clean_data(df):
    """Return a cleaned copy of the passenger data with engineered features.

    Steps performed:
      1. Check that every column this function reads or drops is present.
      2. One-hot encode 'Sex', 'Embarked' and 'Pclass'.
      3. Derive 'family_size' (SibSp + Parch + 1) and the 'is_alone' flag.
      4. Fill missing 'Age' values with the mean age.
      5. Drop the original categorical columns, the identifier/text columns,
         'family_size', and one dummy column per categorical variable
         ('female', 'C' and 1).

    Args:
        df: pandas DataFrame containing at least the columns 'PassengerId',
            'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Cabin',
            'Embarked' and 'Pclass'.

    Returns:
        A new DataFrame; the input frame is not modified.

    Raises:
        KeyError: If a required column is missing, or if one of the dummy
            columns scheduled for removal ('female', 'C', 1) was not
            produced because that category does not occur in the data.
    """
    dummy_vars = ['Sex', 'Embarked', 'Pclass']
    drop_vars = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'family_size', 'female', 'C', 1]

    # Data check: fail early with one clear message listing every missing
    # input column instead of a bare KeyError on the first failed lookup.
    required = dummy_vars + ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'Age']
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError("Input data is missing required columns: {}".format(missing))

    # Sex, Embarked and Pclass are categorical, so each is expanded into
    # dummy columns. join() returns a new frame, which keeps the caller's
    # DataFrame untouched by the assignments below.
    for col in dummy_vars:
        df = df.join(pd.get_dummies(df[col]))

    # Family size counts siblings/spouses, parents/children and the
    # passenger; is_alone is 1 when the passenger travels without family.
    df['family_size'] = df['SibSp'] + df['Parch'] + 1
    df['is_alone'] = (df['family_size'] == 1).astype('int64')

    # Age has missing values; fill them with the column mean. The result is
    # assigned back because a chained in-place fillna on df['Age'] does not
    # update df when pandas copy-on-write is active.
    df['Age'] = df['Age'].fillna(df['Age'].mean())

    # Remove the source categorical columns together with the columns that
    # are not needed (one dummy per categorical variable is among them).
    return df.drop(dummy_vars + drop_vars, axis=1)

In [4]:
# The following function applies the model to the locally saved scoring dataset.
def model_report(test, X_test, y_test):
    """Score the saved test split with the global ``model`` and save reports.

    Reads ``<X_test>.csv`` and ``<y_test>.csv``, evaluates the module-level
    ``model`` object on them, writes the accuracy to ``score.csv`` and the
    classification report to ``report.txt`` in the working directory, and
    prints both.

    Args:
        test: Base name (without ``.csv``) of the full scoring dataset. It is
            kept for interface compatibility; the evaluation below only
            needs the feature and label files, so this file is not read.
        X_test: Base name (without ``.csv``) of the feature file.
        y_test: Base name (without ``.csv``) of the label file.

    Raises:
        ValueError: If the feature and label tables end up with a different
            number of rows.
    """
    # Load the feature and label tables.
    features = pd.read_csv(X_test + '.csv')
    labels = pd.read_csv(y_test + '.csv')

    # Drop the first column of each table.
    # NOTE(review): presumably this is the index column written when the
    # split was saved -- confirm against the code that produced the files.
    features = features.drop(features.columns[[0]], axis=1)
    labels = labels.drop(labels.columns[[0]], axis=1)

    # Drop the first feature row.
    # NOTE(review): looks like this aligns the features with a label file
    # that is one row shorter once read back -- confirm against the data.
    features = features.drop(features.index[0])

    # Data check: features and labels must describe the same observations.
    if len(features) != len(labels):
        raise ValueError(
            "Feature rows ({}) and label rows ({}) do not match".format(
                len(features), len(labels)))

    # Model accuracy on the scoring data.
    accuracy = model.score(features, labels)

    # Save the accuracy as a single-value CSV row. newline='' is what the csv
    # module requires to avoid blank rows on platforms that translate '\n'.
    with open('score.csv', 'w', newline='') as f:
        csv.writer(f).writerow([float(accuracy)])

    # Generate the classification report from the model's predictions.
    class_report = classification_report(labels, model.predict(features))

    # Save the report as text; the with-block closes the file on exit.
    with open('report.txt', 'w') as f:
        f.write(class_report)

    # Also print both results.
    print("\n* Model accuracy score:")
    print(accuracy)

    print("\n* Classification report:\n")
    print(class_report)
    

In [5]:
# Entry point: load the pickled model and write the score and report files.
# The guard keeps this file I/O from running when the module is imported.
# ``model`` is still assigned at module level here, which model_report()
# relies on because it reads ``model`` as a global.
if __name__ == '__main__':
    model = read_pickle_model('model.pkl')
    model_report('test', 'X_test', 'y_test')


* Model accuracy score:
0.7677902621722846

* Classification report:

              precision    recall  f1-score   support

           0       0.83      0.81      0.82       172
           1       0.67      0.69      0.68        95

   micro avg       0.77      0.77      0.77       267
   macro avg       0.75      0.75      0.75       267
weighted avg       0.77      0.77      0.77       267

