In [1]:
from csv import reader
from ast import literal_eval
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import numpy as np
from sklearn import utils
from sklearn.model_selection import KFold 

In [2]:
def parse_matrix_into_vector(data):
    """Flatten a matrix given as a Python-literal string into a flat list.

    Args:
        data: String holding a literal list of rows, e.g.
            ``'[[0, 1], [2, 3]]'``. It is parsed with ``ast.literal_eval``,
            so only Python literals are accepted.

    Returns:
        list: Every cell of the matrix in row-major order (all of row 0,
        then all of row 1, ...). An empty matrix yields ``[]``.

    Raises:
        ValueError, SyntaxError: Propagated from ``literal_eval`` when
            ``data`` is not a valid Python literal.
        TypeError: If a parsed row is not iterable.
    """
    matrix = literal_eval(data)
    # Walk every row by its own length instead of the width of row 0, so a
    # row longer than the first one is not silently truncated and a shorter
    # one does not raise IndexError.
    return [cell for row in matrix for cell in row]


assert parse_matrix_into_vector('[[0, 1, 2], [3, 4, 5], [6, 7, 8]]') == [0, 1, 2, 3, 4, 5, 6, 7, 8]
assert parse_matrix_into_vector('[[1], [2, 3]]') == [1, 2, 3]

In [3]:
def parse_str_to_list(data):
    """Collapse a stringified outcome sequence into a single integer code.

    Args:
        data: String holding a Python-literal sequence, e.g. ``"[1]"``.

    Returns:
        int: ``99`` when the parsed sequence has exactly two entries;
        otherwise the first entry converted with ``int``.
    """
    parsed = literal_eval(data)
    if len(parsed) == 2:
        # NOTE(review): 99 looks like a sentinel for "two values present"
        # (possibly a shared/drawn result) -- confirm the intended meaning.
        return 99
    return int(parsed[0])


assert parse_str_to_list("[1]") == 1
assert parse_str_to_list("[1, 2]") == 99

In [4]:
def add_dims_as_cols(adict, keyaffix, data):
    """Write each element of ``data`` into ``adict`` under an indexed key.

    Element ``i`` is stored under the key ``f"{keyaffix}_{i}"``.

    Args:
        adict: Mapping to update; it is modified in place.
        keyaffix: Prefix placed before ``_<index>`` in every generated key.
        data: Sequence of values to spread across the generated keys.

    Returns:
        The same ``adict`` object that was passed in, after the update.
    """
    for position, value in enumerate(data):
        adict[f"{keyaffix}_{position}"] = value
    return adict


assert add_dims_as_cols({}, "prev", [0, 0, 0, 0, 1, 0, 0, 0, 0]) == {
    "prev_0": 0,
    "prev_1": 0,
    "prev_2": 0,
    "prev_3": 0,
    "prev_4": 1,
    "prev_5": 0,
    "prev_6": 0,
    "prev_7": 0,
    "prev_8": 0,
}

In [5]:
# Data munging

In [6]:
def read_parse_csv(fname):
    """Read a CSV file and turn every non-empty row into a feature dict.

    Columns are read by position:

    * column 0 -- literal matrix string, stored parsed as ``"prev_state"``
      and flattened as ``"prev_vect"``;
    * column 1 -- literal matrix string, stored parsed as ``"curr_state"``
      and flattened as ``"curr_vect"``;
    * column 2 -- integer, stored as ``"move_by"``;
    * column 3 -- literal list string, reduced to one integer by
      ``parse_str_to_list`` and stored as ``"outcome"``.

    Each flattened vector is additionally expanded into one key per element
    (``prev_0``, ``prev_1``, ... and ``curr_0``, ``curr_1``, ...).

    Args:
        fname: Path of the CSV file to read. Every row is treated as data;
            no header row is skipped.

    Returns:
        list[dict]: One dict per non-empty CSV row, in file order.
    """
    all_csv = []
    # newline="" is what the csv module asks for, so that the reader itself
    # handles newlines embedded inside quoted fields.
    with open(fname, "r", newline="") as read_obj:
        for row in reader(read_obj):
            if not row:
                # csv.reader yields [] for a blank line (e.g. a trailing
                # empty line); skip it instead of failing on row[0].
                continue
            r = {
                "prev_state": literal_eval(row[0]),
                "prev_vect": parse_matrix_into_vector(row[0]),
                "curr_state": literal_eval(row[1]),
                "curr_vect": parse_matrix_into_vector(row[1]),
                "move_by": int(row[2]),
                "outcome": parse_str_to_list(row[3]),
            }
            # Spread the flattened vectors into one scalar entry per cell.
            r = add_dims_as_cols(r, "prev", r["prev_vect"])
            r = add_dims_as_cols(r, "curr", r["curr_vect"])
            all_csv.append(r)
    return all_csv
        


In [7]:
def generate_df(listofdicts):
    """Build a DataFrame from row dicts and drop the non-scalar columns.

    Args:
        listofdicts: List of dicts that each contain at least the keys
            ``"prev_state"``, ``"curr_state"``, ``"prev_vect"`` and
            ``"curr_vect"`` in addition to the columns to keep.

    Returns:
        pandas.DataFrame: One row per dict, without the four columns named
        above; the remaining columns keep their original order.

    Raises:
        KeyError: If any of the four columns is missing from the data.
    """
    df = pd.DataFrame(listofdicts)
    # Use the ``columns=`` keyword: passing the axis positionally, as in
    # ``df.drop(name, 1)``, is deprecated since pandas 1.3 and is a
    # TypeError in pandas 2.x.
    return df.drop(columns=["prev_state", "curr_state", "prev_vect", "curr_vect"])

In [None]:
# Model building

In [8]:
def build_model(df, X, y, n_estimators, random_state=None):
    """Fit a random forest on a 60/40 train/test split and print its metrics.

    Args:
        df: Not used by this function; kept so existing callers keep working.
        X: Feature matrix passed to ``train_test_split``.
        y: Target values aligned with ``X``.
        n_estimators: Number of trees in the forest.
        random_state: Optional seed forwarded to both ``train_test_split``
            and ``RandomForestClassifier`` so a run can be reproduced. The
            default ``None`` keeps the previous unseeded behaviour.

    Returns:
        RandomForestClassifier: The classifier fitted on the training split.
    """
    # Hold out 40% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.4, random_state=random_state
    )

    # Create the random forest classifier and train it on the training split.
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, y_train)

    # Predict on the held-out split.
    y_pred = clf.predict(X_test)

    # Macro averaging weights every class equally, whatever its frequency.
    print(f"F1: {round(f1_score(y_test, y_pred, average='macro'), 2)}")
    print(f"precision: {round(precision_score(y_test, y_pred, average='macro'), 2)}")
    print(f"recall: {round(recall_score(y_test, y_pred, average='macro'), 2)}")
    print(f"accuracy (train): {round(clf.score(X_train,y_train), 2)}")
    print(f"accuracy (test): {round(clf.score(X_test,y_test), 2)}")
    return clf

In [9]:
# K-fold

In [11]:
# Implementing cross validation
 
def run_kfold_cross_validation(df, X, y, num_k):
    """Evaluate a 100-tree random forest with unshuffled K-fold CV.

    Args:
        df: Not used by this function; kept so existing callers keep working.
        X: Feature DataFrame; rows are selected positionally with ``.iloc``.
        y: Target values aligned row-for-row with ``X`` (a pandas Series or
            anything indexable by an integer array, e.g. a NumPy array).
        num_k: Number of folds.

    Returns:
        dict: Accuracy and macro-averaged F1, precision and recall, each
        averaged over the folds and rounded to 2 decimals, under the keys
        ``avg_acc_score``, ``avg_f1_score``, ``avg_prec_score`` and
        ``avg_recall_score``.
    """
    k = num_k
    kf = KFold(n_splits=k, random_state=None)
    model = RandomForestClassifier(n_estimators=100)
    acc_scores, f1_scores, precision_scores, recall_scores = [], [], [], []

    # KFold.split yields positional indices. X is sliced with .iloc, so y
    # must be sliced positionally as well: plain ``y[idx]`` on a Series with
    # an integer index is label-based and misaligns (or raises KeyError) as
    # soon as the index is not 0..n-1.
    y_by_position = y.iloc if hasattr(y, "iloc") else y

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = y_by_position[train_index], y_by_position[test_index]

        # fit() retrains from scratch, so reusing one estimator is safe.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc_scores.append(accuracy_score(y_test, y_pred))
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
        precision_scores.append(precision_score(y_test, y_pred, average='macro'))
        recall_scores.append(recall_score(y_test, y_pred, average='macro'))

    return {
        "avg_acc_score": round(sum(acc_scores) / k, 2),
        "avg_f1_score": round(sum(f1_scores) / k, 2),
        "avg_prec_score": round(sum(precision_scores) / k, 2),
        "avg_recall_score": round(sum(recall_scores) / k, 2),
    }

In [14]:
def run_rf_3x3():
    """Run the full pipeline on ``rf_3x3_data.csv`` and print the results.

    Reads and parses the CSV, builds the modelling DataFrame, fits and
    reports a single 100-tree random forest, then prints the averaged
    10-fold cross-validation scores.
    """
    records = read_parse_csv("rf_3x3_data.csv")
    frame = generate_df(records)

    print("Build model")
    # Features are column 0 plus columns 2..19; column 1 is left out.
    # NOTE(review): column 1 is presumably "outcome" given the column order
    # produced by generate_df -- confirm.
    features = frame.iloc[:, np.r_[0, 2:20]]
    target = frame["outcome"]
    build_model(frame, X=features, y=target, n_estimators=100)

    print("\nK-Fold Cross-Validation")
    cv_summary = run_kfold_cross_validation(frame, X=features, y=target, num_k=10)
    print(cv_summary)


run_rf_3x3()

  df = df.drop("prev_state", 1)
  df = df.drop("curr_state", 1)
  df = df.drop("prev_vect", 1)
  df = df.drop("curr_vect", 1)


Build model
F1: 0.93
precision: 0.98
recall: 0.9
accuracy (train): 1.0
accuracy (test): 0.98

K-Fold Cross-Validation


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'avg_acc_score': 0.98, 'avg_f1_score': 0.92, 'avg_prec_score': 0.95, 'avg_recall_score': 0.9}
