## Clean the raw data

Load Packages

In [413]:
import pandas as pd
import numpy as np
from datetime import datetime

Read File

In [414]:
path = 'raw.csv'
Russell_data = pd.read_csv(path)

Get rid of row which contains average and delete the "Versus" column

In [415]:
Russell_data = Russell_data[Russell_data['Date'] != "Average"]

Russell_data = Russell_data.drop("Versus", axis=1)

I assueme the 3P% and FT% to be zero if they didn't make any 3PA or FTA

In [416]:
Russell_data['3P%'].fillna(0, inplace=True)
Russell_data['FT%'].fillna(0, inplace=True)

Drop invalid value

In [417]:
Russell_data = Russell_data.dropna()   # Actually, there is no na value.

Split the score to three parts (Win/Lose; Team score; Opponent score)

In [418]:
Russell_data['Win/Lose'] = Russell_data['Score'].str[0]

score_split = Russell_data['Score'].str[1:].str.split('-', expand=True)

Russell_data['Team score'] = score_split[0]

Russell_data['Opponent score'] = score_split[1]

# Drop the original column of score
Russell_data = Russell_data.drop("Score", axis=1)


Split Date

In [419]:
Russell_data['DayOfWeek'] = Russell_data['Date'].str[:3]  # First three letters
Russell_data['date'] = Russell_data['Date'].str[3:]  # Rest of the string
Russell_data = Russell_data.drop("Date", axis=1)

Check data type and make score to be int and removing trailing spaces

In [420]:
Russell_data['Team score'] = Russell_data['Team score'].astype('Int64')
Russell_data['Opponent score'] = Russell_data['Opponent score'].astype('Int64')

for column in Russell_data.select_dtypes(include=['object']):  # Select only columns with object dtype
    Russell_data[column] = Russell_data[column].str.strip()
    

Combine "year" with the "date"

In [421]:
Russell_data['Year'] = Russell_data['Year'].astype(int).astype(str)



Russell_data['date'] = Russell_data['date'] + '/' + Russell_data['Year']
#Russell_data['date'] = Russell_data['date'].str.rstrip('.0')

Russell_data['date'] = pd.to_datetime(Russell_data['date'], format='%m/%d/%Y', errors='coerce')


In [422]:
invalid_dates = Russell_data[Russell_data['date'].isna()]
print(invalid_dates[['date', 'Year']])

Empty DataFrame
Columns: [date, Year]
Index: []


Sort data by date

In [423]:
Russell_data.sort_values(by="date", inplace=True)

Extra verified

In [428]:
years = [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 
         2018, 2019, 2020, 2021, 2022, 2023, 2024]

for year in years:
    print(year, len(Russell_data[Russell_data["Year"] == year]))

2009 0
2010 0
2011 0
2012 0
2013 0
2014 0
2015 0
2016 0
2017 0
2018 0
2019 0
2020 0
2021 0
2022 0
2023 0
2024 0


In [430]:
unique_years = Russell_data['Year'].unique()
year_counts = Russell_data['Year'].value_counts(sort=False)

print("Unique years in the dataset:", unique_years)
print("Counts for each year:")
print(year_counts)


Unique years in the dataset: ['2009' '2010' '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018'
 '2019' '2020' '2021' '2022' '2023' '2024']
Counts for each year:
2009    82
2010    88
2011    99
2012    86
2013    84
2014    65
2015    67
2016    98
2017    86
2018    86
2019    78
2020    65
2021    72
2022    78
2023    78
2024    42
Name: Year, dtype: int64


Classify the playoff game and regular season game

In [425]:
# Set the default value for all rows in the new column
Russell_data["Game Type"] = "Regular"

# Initialize a counter for consecutive occurrences
consecutive_count = 1

# Store the index of the first game in the current sequence of games against the same opponent
start_index = None

# Use iterrows to safely iterate over DataFrame rows
for i, row in Russell_data.iterrows():
    if start_index is None:
        start_index = i  # Initialize start_index with the first row's index

    if row['Opponent'] == previous_opponent:
        consecutive_count += 1
        if consecutive_count >= 4:
            # When we have at least 4 consecutive games, mark them as 'Playoff'
            Russell_data.loc[start_index:i, 'Game Type'] = 'Playoff'
    else:
        # Reset the counter and start_index when a new opponent is encountered
        consecutive_count = 1
        start_index = i
        previous_opponent = row['Opponent']

In [426]:
# Assuming 'Russell_data' is your dataframe and is already loaded with data similar to the provided image.

# Group the data by 'Year' and filter for 'Playoff' games, then count the number of such games for each year.
playoff_counts_by_year = Russell_data[Russell_data["Game Type"] == "Playoff"].groupby('Year').size()

# Print the results
for year, count in playoff_counts_by_year.items():
    print(f"{year}: {count} playoff games")



2010: 6 playoff games
2011: 17 playoff games
2012: 20 playoff games
2014: 19 playoff games
2016: 18 playoff games
2017: 5 playoff games
2018: 6 playoff games
2019: 5 playoff games
2020: 5 playoff games
2021: 6 playoff games
2023: 6 playoff games


Create new CSV file


In [427]:
Russell_data.to_csv('Reorgnized_RB.csv', index=False)