Before heading into data analysis and machine learning, we first need to collect, clean, and process the data. The data source is the NBA season statistics published on basketball-reference.com.

1.1 Fetching raw HTML

In [None]:
// Adding dependencies and importing reqwest:
:dep reqwest = {version = "0.11.16", features = ["blocking"]}
extern crate reqwest;
use reqwest::blocking::Client;

In [None]:
// Define the target URL (the 2021-22 NBA season summary page) and send the request.
let url = "https://www.basketball-reference.com/leagues/NBA_2022.html";
// reqwest's "blocking" feature provides a synchronous API, so this call waits for the
// response. `expect` panics with the given message if the request fails.
let response = reqwest::blocking::get(url).expect("Could not load url.");
// Read the whole response body as text; `unwrap` panics if the body cannot be read.
let raw_html_string = response.text().unwrap();

1.2 Parse the data through CSS selectors

In [None]:
// use scraper crate to parse and query HTML with CSS selectors
:dep scraper = {version = "0.16.0"}
extern crate scraper;
use scraper::Selector;
use scraper::Html;

In [None]:
// Parse the downloaded HTML into a queryable tree.
// NOTE(review): the input is a complete page, yet it is parsed as a fragment; scraper also
// offers `Html::parse_document` for whole documents — confirm which one is intended.
let html_fragment = Html::parse_fragment(&raw_html_string);

In [None]:
// Define the CSS selectors used to walk the table. Each `unwrap` panics if the selector
// string is not valid CSS.
// Element with id "advanced-team" (the table to scrape).
let table_selector_string = "#advanced-team";
let table_selector = Selector::parse(table_selector_string).unwrap();
// Header cells: every <th> directly inside a <tr> directly inside <thead>.
let head_elements_selector = Selector::parse("thead>tr>th").unwrap();
// Body rows: every <tr> directly inside <tbody>.
let row_elements_selector = Selector::parse("tbody>tr").unwrap();
// Cells of a row: both <td> and <th> elements.
let row_element_data_selector = Selector::parse("td,th").unwrap();

In [None]:
// Query the parsed HTML with the table selector; this yields every element matching
// "#advanced-team".
let all_tables = html_fragment.select(&table_selector);

// `head` collects the header labels and `rows` the cell texts of every body row.
// Both are filled across ALL matched tables and are reused by later cells.
let mut head: Vec<String> = Vec::new();
let mut rows: Vec<Vec<String>> = Vec::new();

for table in all_tables {
    let head_elements = table.select(&head_elements_selector);
    let mut section_index = 0; // keep track of which section we're processing
    for head_element in head_elements {
        // Join the cell's text nodes with spaces, trim, and flatten newlines.
        let mut element = head_element.text().collect::<Vec<_>>().join(" ");
        element = element.trim().replace("\n", " ");
        if element.is_empty() {
            // An empty header cell is treated as a section divider: advance the section
            // index and push nothing, so `head` gets no entry for this column.
            // NOTE(review): skipping the entry shifts every later header one position to
            // the left relative to the row cells — confirm this is intended.
            section_index += 1;
        } else {
            // Append a trailing comma to the header text and push it to `head`.
            let header = format!("{},", element);
            // NOTE(review): `repeat(n)` concatenates the text n times into ONE String
            // (e.g. "Rk,Rk,Rk,Rk,"); it does not push n separate entries — verify intent.
            match section_index {
                0 => head.push(header.repeat(4)), // Offense Four Factors
                1 => head.push(header.repeat(5)), // Defense Four Factors
                _ => head.push(header), // other sections
            }
        }
    }
    // Collect the text of every <td>/<th> cell of every body row, cleaned the same way
    // as the header text. Empty cells are kept here (as empty strings).
    let row_elements = table.select(&row_elements_selector);
    for row_element in row_elements {
        let mut row: Vec<String> = Vec::new();
        for td_element in row_element.select(&row_element_data_selector) {
            let mut element = td_element.text().collect::<Vec<_>>().join(" ");
            element = element.trim().replace("\n", " ");
            row.push(element);
        }
        rows.push(row);
    }
}
// Trailing expression: evaluates to the collected header labels.
head

Zip the header and row values together.

In [None]:
:dep serde_json = {version = "1.0"}
extern crate serde_json;

In [None]:
:dep serde = {version = "1.0.160", features = ["derive"]}
extern crate serde;

In [None]:
:dep indexmap = {version = "1.9.1", features = ["serde"]}
extern crate indexmap;

In [None]:
use indexmap::IndexMap;
// Build one insertion-ordered header -> value map per scraped row.
// Headers and cells are paired by position; pairing stops at the shorter of the two.
let mut table_data: Vec<IndexMap<String, String>> = Vec::new();
for row in rows {
    let mut item_hash: IndexMap<String, String> = IndexMap::new();
    for (column, value) in head.iter().zip(row.iter()) {
        // Cells without text are left out of the map entirely.
        if value.is_empty() {
            continue;
        }
        item_hash.insert(column.clone(), value.clone());
    }
    table_data.push(item_hash);
}
// Trailing expression: evaluates to the list of row maps.
table_data

1.3 Serializing the data

In [None]:
:dep polars = {version = "0.28.0"}
extern crate polars;

In [None]:
:dep csv = {version = "1.2.1"}
extern crate csv;

In [None]:
:dep json-event-parser = {version = "0.1.1"}
extern crate json_event_parser;

In [None]:
:dep selectors = "0.24.0"
extern crate selectors;

In [None]:
use std::collections::HashMap;
use std::collections::BTreeMap;
use scraper::Element;
use scraper::{Html, Selector};

/*
The reason why the header elements are not being parsed in order is because the table cells in the HTML are not necessarily arranged in a left-to-right order.

When the select method is used to retrieve the head_elements of the table, the order in which the cells are returned is determined by the order in which they appear in the HTML source, which may not correspond to the left-to-right order of the cells in the table.

To address this issue, you can modify the code to explicitly specify the order in which the header elements should be added to the head vector, based on their position in the table. 
*/
fn scrape_table_data(html_fragment: &Html, table_selector_string: &str, head_elements_selector: &Selector, row_elements_selector: &Selector, row_element_data_selector: &Selector) -> Vec<BTreeMap<String, String>> {
    let table_selector = Selector::parse(table_selector_string).unwrap();
    let all_tables = html_fragment.select(&table_selector);
    
    let mut head: Vec<String> = Vec::new();
    let mut rows: Vec<Vec<String>> = Vec::new();

    for (_table_index, table) in all_tables.enumerate() {
        let head_elements = table.select(&head_elements_selector);
        for head_element in head_elements {
            let element = head_element.value(); // Get the underlying Element
            // Skip the header row
            if element.attr("class").unwrap_or("") == "over_header" {
                continue;
            }
            let mut element = head_element.text().collect::<Vec<_>>().join(" ");
            element = element.trim().replace("\n", " ");
            head.push(element);
        }
        let row_elements = table.select(&row_elements_selector); // skip the first row
        for row_element in row_elements {
            let mut row: Vec<String> = Vec::new();
            for td_element in row_element.select(&row_element_data_selector) {
                let mut element = td_element.text().collect::<Vec<_>>().join(" ");
                element = element.trim().replace("\n", " ");
                row.push(element);
            }
            rows.push(row);
        }
    }

    let mut table_data: Vec<BTreeMap<String, String>> = Vec::new();
    for row in rows {
        let zipped_array = head.iter().zip(row.iter()).map(|(a,b)| (a, b)).collect::<Vec<_>>();
        let mut item_hash: BTreeMap<String, String> = BTreeMap::new();
        for pair in zipped_array {
            if !pair.1.to_string().is_empty() {
                item_hash.insert(pair.0.to_string(), pair.1.to_string());
            }
        }
        table_data.push(item_hash);
    }

    table_data
}


In [None]:
//serialize the table_data into a csv file
use polars::prelude::*;
use std::fs::File;
use serde::Serialize;
use indexmap::IndexMap;
use csv::{Writer, WriterBuilder};
use std::error::Error;
use std::collections::HashMap;
use std::collections::BTreeMap;
use std::io::Write;
//use select::document::Document;
//use select::predicate::{Predicate, Attr, Name};
use json_event_parser::JsonReader

let mut tables_data: Vec<(String, Vec<BTreeMap<String, String>>)> = Vec::new();

for year in 2010..=2022 {
    let url = format!("https://www.basketball-reference.com/leagues/NBA_{}.html", year);
    let response = reqwest::blocking::get(&url).expect("Could not load url.");
    let raw_html_string = response.text().unwrap();
    let html_fragment = Html::parse_fragment(&raw_html_string);
    let table_selector_string = "#advanced-team";
    let head_elements_selector = Selector::parse("thead>tr>th").unwrap();
    let row_elements_selector = Selector::parse("tbody>tr").unwrap();
    let row_element_data_selector = Selector::parse("td,th").unwrap();
    let table_data = scrape_table_data(&html_fragment, &table_selector_string, &head_elements_selector, &row_elements_selector, &row_element_data_selector);
    tables_data.push((format!("{}", year), table_data));
}

// Open a file to write the CSV output to
let file = File::create("output.csv")?;

// Create a CSV writer using the default configuration
let mut csv_writer = WriterBuilder::new().from_writer(file);

// Write the header row
let headers = tables_data[0].1[0].keys().cloned().collect::<Vec<_>>();
csv_writer.write_record(&headers)?;

// Write each row of data
for (_table_name, rows) in tables_data.iter() {
    for row in rows {
        let values = headers.iter().map(|h| row.get(h).unwrap()).collect::<Vec<_>>();
        csv_writer.write_record(&values)?;
    }
}

// Flush the CSV writer to ensure all data is written to the file
csv_writer.flush()?;


/*
let mut tables_data: IndexMap<String, Vec<HashMap<String, String>>> = IndexMap::new();
for year in 2010..=2011 {
    let url = format!("https://www.basketball-reference.com/leagues/NBA_{}.html", year);
    let response = reqwest::blocking::get(&url).expect("Could not load url.");
    let raw_html_string = response.text().unwrap();
    let html_fragment = Html::parse_fragment(&raw_html_string);
    let table_selector_string = "#per_game-team";
    let head_elements_selector = Selector::parse("thead>tr>th").unwrap();
    let row_elements_selector = Selector::parse("tbody>tr").unwrap();
    let row_element_data_selector = Selector::parse("td,th").unwrap();
    let table_data = scrape_table_data(&html_fragment, &table_selector_string, &head_elements_selector, &row_elements_selector, &row_element_data_selector);
    tables_data.insert(format!("{}", year), table_data);
}

// Open a file to write the CSV output to
let file = File::create("output.csv")?;

// Create a CSV writer using the default configuration
let mut csv_writer = WriterBuilder::new().from_writer(file);

// Write the header row
let headers = tables_data.values().next().unwrap()[0].keys();
csv_writer.write_record(headers)?;

// Write each row of data
for (table_name, rows) in tables_data.iter() {
    for row in rows {
        let values = row.values();
        csv_writer.write_record(values)?;
    }
}

//define a struct container for the tables_data
#[derive(Serialize)]
struct FinalTable {
    tables: IndexMap<String, Vec<HashMap<String, String>>>,
}
let final_table = FinalTable{tables: tables_data};
let json = serde_json::to_string_pretty(&final_table).unwrap();

let path = "scraped_raw_data.json";
let mut output = File::create(path).unwrap();
let result: Result<(), std::io::Error> = output.write_all(json.as_bytes());

match result {
    Ok(()) => println!("hooray"),
    Err(e) => println!("uhhhh"),
}
println!("{json:#}");
*/



In [None]:
// Helper that loads the CSV file at `file_path` into a polars DataFrame.
// The first line of the file is treated as the column names; any error from opening
// or parsing the file is returned to the caller.
fn read_csv(file_path: &str) -> PolarsResult<DataFrame> {
    let reader = CsvReader::from_path(file_path)?;
    reader.has_header(true).finish()
}
// Load the scraped data; `df` is a `PolarsResult`, so a failure shows up as an `Err` value.
let df = read_csv("output.csv");
df

Using a logistic regression model to predict the Elo ratings

In [None]:
:dep smartcore = {version = "0.2.0", features = ["nalgebra-bindings", "datasets"]}
extern crate smartcore;

In [None]:
:dep nalgebra = {version = "0.23.0", features = ["serde-serialize"]}
extern crate nalgebra;

In [None]:
:dep argmin = "*"
extern crate argmin;

First, load the dataset and split it into training and test sets.

In [None]:
use std::fs::File;
use std::io::BufReader;
use nalgebra::DMatrix;

// Load the cleaned dataset into a dense f64 matrix.
let file = File::open("cleaned_dataset.csv").unwrap();
// NOTE(review): `parse_csv` is not defined in the cells shown here — presumably a helper
// defined elsewhere; verify it is in scope before running this cell.
let credit: DMatrix<f64> = parse_csv(BufReader::new(file)).unwrap();

// Take the 30 columns starting at column 0.
// NOTE(review): `columns` returns a view borrowing `credit`; if the notebook cannot keep
// a borrowed value between cells, copy it with `.into_owned()` — confirm.
let x = credit.columns(0, 30);

In [None]:
// Helper function to calculate the Elo ratings
use std::collections::HashMap;
use csv::Reader;

// ELO constants
/// Scale factor applied to (actual score - expected score) for each rating update
/// in `calculate_elo`.
const K: f64 = 32.0;
/// Rating every team starts with when it is first read from the CSV file.
const BASE_ELO: f64 = 1500.0;

// Struct to hold team data
/// Per-team state used while computing ratings in `calculate_elo`.
struct Team {
    /// Current rating; initialised to `BASE_ELO` and adjusted during the calculation.
    elo: f64,
    /// Win count, parsed from the second CSV column (index 1).
    wins: u32,
    /// Loss count, parsed from the third CSV column (index 2).
    /// NOTE(review): stored but not read by `calculate_elo` as written.
    losses: u32,
}

/// Reads team records from the CSV file at `file_path` and computes a rating per team.
///
/// The file is read with the csv crate's default reader settings. Each record must hold
/// the team name in column 0, the win count in column 1 and the loss count in column 2.
/// If a team name occurs more than once, the last record wins.
///
/// Every team starts at `BASE_ELO`. Each team is then compared once against every other
/// team, visited in alphabetical order so the result is reproducible:
///
/// * expected score: `1 / (1 + 10^((opponent_elo - team_elo) / 400))`, using the
///   opponent's starting rating and the team's running rating;
/// * actual score: the team's share of the two teams' combined wins, or `0.5` when
///   neither team has a win (avoids a 0/0 that would turn the rating into NaN);
/// * the team's rating moves by `K * (actual - expected)`.
///
/// Returns a map from team name to final rating.
///
/// # Panics
///
/// Panics if the file cannot be opened, a record cannot be read, one of the three
/// columns is missing, or a win/loss value is not a valid `u32`.
fn calculate_elo(file_path: &str) -> HashMap<String, f64> {
    // Open CSV file and create CSV reader
    let file = std::fs::File::open(file_path).expect("could not open the team CSV file");
    let mut reader = Reader::from_reader(file);

    // Initialize all teams with base ELO rating
    let mut teams: HashMap<String, Team> = HashMap::new();
    for result in reader.records() {
        let record = result.expect("could not read a CSV record");
        let team_name = record
            .get(0)
            .expect("record is missing the team name column")
            .to_string();
        let team = Team {
            elo: BASE_ELO,
            wins: record
                .get(1)
                .expect("record is missing the wins column")
                .parse()
                .expect("wins is not a valid u32"),
            losses: record
                .get(2)
                .expect("record is missing the losses column")
                .parse()
                .expect("losses is not a valid u32"),
        };
        teams.insert(team_name, team);
    }

    // Snapshot the opponents' data first: the map cannot be read while its entries are
    // being mutated. Sorting by name removes the dependence on HashMap iteration order.
    let mut opponents: Vec<(String, f64, u32)> = teams
        .iter()
        .map(|(name, team)| (name.clone(), team.elo, team.wins))
        .collect();
    opponents.sort_by(|a, b| a.0.cmp(&b.0));

    // Calculate ELO ratings
    for (team_name, team) in teams.iter_mut() {
        for (opponent_name, opponent_elo, opponent_wins) in &opponents {
            if team_name == opponent_name {
                continue;
            }
            let expected_score =
                1.0 / (1.0 + 10.0f64.powf((*opponent_elo - team.elo) / 400.0));
            // Add in f64 so the sum cannot overflow u32.
            let total_wins = f64::from(team.wins) + f64::from(*opponent_wins);
            let actual_score = if total_wins > 0.0 {
                f64::from(team.wins) / total_wins
            } else {
                0.5
            };
            team.elo += K * (actual_score - expected_score);
        }
    }

    // Keep only the name -> rating pairs.
    teams
        .into_iter()
        .map(|(team_name, team)| (team_name, team.elo))
        .collect()
}