# Drive and Instat Visualization

In [3]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
json_files = ['../../data/drive/SC700002NCAHOPUN240930SK.json',
              '../../data/drive/SC700003NCAHOPUN241002SK.json',
              '../../data/drive/SC700005NCAHOPUN241008SK.json',
              '../../data/drive/SC700006NCAHOCUN241008SK.json',
              '../../data/drive/SC700008NCAHOPUN241015SK.json',
              '../../data/drive/SC700009NCAHOPUN241022SK.json',
              '../../data/drive/SC700010NCAUSGHO241026MC.json',
              '../../data/drive/SC700011NCAHOPUN241204HA.json']


def load_drive_frames(paths):
    """Load every existing JSON file in *paths* into a DataFrame.

    Args:
        paths: Iterable of file paths to JSON documents.

    Returns:
        A dict mapping each path that exists to the DataFrame built from its
        parsed JSON content, in the order the paths were given. Paths that do
        not exist are reported with a "File not found" message and skipped.
    """
    frames = {}
    for path in paths:
        if not os.path.exists(path):
            print(f"File not found: {path}")
            continue
        # Explicit encoding so parsing does not depend on the platform default.
        with open(path, 'r', encoding='utf-8') as file:
            frames[path] = pd.DataFrame(json.load(file))
    return frames


# Dictionary of DataFrames keyed by source file path
dataframes = load_drive_frames(json_files)

# Print a preview of each DataFrame
for file_name, drive in dataframes.items():
    print(f"DataFrame for {file_name}:")
    # head() only returns the preview; inside a loop it must be printed
    # explicitly to be shown.
    print(drive.head())

# The following cells operate on `df`; as before, it refers to the last file
# that was loaded successfully (and stays undefined if none were found).
if dataframes:
    df = list(dataframes.values())[-1]

In [None]:
# Examine Data Structure
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None".
df.info()
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}, Number of columns: {n_cols}")

# Identify Missing Values
print("\nMissing Values per Column:\n", df.isnull().sum())

# Check for Inconsistencies and Outliers (Descriptive Statistics and Visualization)
performance_metrics = ['goals', 'assists', 'shots', 'ice time', 'speed', 'acceleration']
# Keep only the metrics that are present in the DataFrame and already numeric
numerical_performance_metrics = [
    col for col in performance_metrics
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col])
]

if numerical_performance_metrics:
    print("\nDescriptive Statistics for Performance Metrics:\n", df[numerical_performance_metrics].describe())
    # Imported here so the textual summary above still runs when matplotlib
    # is unavailable and there is nothing to plot.
    import matplotlib.pyplot as plt
    # Distribution of each metric
    df[numerical_performance_metrics].hist(figsize=(10, 6), bins=30)
    plt.tight_layout()
    plt.show()
    # Box plots to make outliers visible
    df.boxplot(column=numerical_performance_metrics, figsize=(10, 6))
    plt.tight_layout()
    plt.show()
else:
    print("\nNo numerical performance metrics found or all the performance metrics are not numerical.")


# Data Type Analysis
print("\nData Types:\n", df.dtypes)

In [None]:
# Impute missing 'sustained_speed' with the median
if 'sustained_speed' in df.columns:
    # The median is taken over the values that parse as numbers, so stray
    # non-numeric entries cannot break the computation. The result is assigned
    # back to the column instead of using fillna(inplace=True) on a column
    # selection, which is chained assignment and may not update `df`.
    sustained_median = pd.to_numeric(df['sustained_speed'], errors='coerce').median()
    df['sustained_speed'] = df['sustained_speed'].fillna(sustained_median)

# Address inconsistencies: Remove rows with non-numeric values in 'speed' and 'acceleration'
for col in ['speed', 'acceleration']:
    if col in df.columns:
        numeric = pd.to_numeric(df[col], errors='coerce')
        valid = numeric.notna()
        # Copy the filtered rows so the column assignment below writes to an
        # independent frame rather than a slice of the previous one.
        df = df.loc[valid].copy()
        # Positional assignment: the mask preserves row order, and this avoids
        # index alignment problems if the index contains duplicate labels.
        df[col] = numeric[valid].to_numpy()

# Remove duplicates
df = df.drop_duplicates()

# Handle outliers using IQR for 'speed' and 'acceleration'
for col in ['speed', 'acceleration']:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        # Keep rows inside [lower_bound, upper_bound], bounds included
        df = df[df[col].between(lower_bound, upper_bound)]

display(df.head())
# info() prints its own summary and returns None, so it is not passed to display()
df.info()

In [None]:
# Check data types
print(df.dtypes)

# Convert columns to numerical types, handling errors
columns_to_convert = [
    'speed', 'acceleration', 'ax', 'ay', 'totalDistance', 'displacement',
    'skatingAngle', 'speedUp', 'endurance', 'endurance_on', 'speedDown_end',
    'speedUp_start', 'gap', 'curvature', 'radius_curvature', 'a_tot',
    'a_centripetal', 'g_force', 'g_force_avg', 'g_force_peak',
    'sustained_speed', 'toi', 'playerShiftNum',
]
for col in columns_to_convert:
    if col in df.columns:
        # errors='coerce' turns values that cannot be parsed into NaN
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Show info to confirm data types; info() prints its own summary and returns
# None, so it is called directly rather than wrapped in print().
df.info()