Extract

In [None]:
import requests
import pandas as pd

def fetch_and_inspect_api_data():
    url = 'https://api.sportsdata.io/v3/nhl/scores/json/Players?key=3341e08d061e4fcdbbbef942d86962c1'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()  # Convert the response to JSON
        
        # Convert the data to a pandas DataFrame for inspection
        df = pd.DataFrame(data)
        
        # Inspect the first few rows of the DataFrame
        print(df.head())
        
        return df
    else:
        print("Failed to retrieve data from the API")
        return None

# Now, call the function to fetch the data and inspect it
players_active_df = fetch_and_inspect_api_data()


In [6]:
players_active_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1857 entries, 0 to 1856
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   PlayerID                             1857 non-null   int64  
 1   FirstName                            1857 non-null   object 
 2   LastName                             1857 non-null   object 
 3   Status                               1857 non-null   object 
 4   TeamID                               1857 non-null   int64  
 5   Team                                 1857 non-null   object 
 6   Position                             1857 non-null   object 
 7   Jersey                               800 non-null    float64
 8   Catches                              203 non-null    object 
 9   Shoots                               1651 non-null   object 
 10  Height                               1857 non-null   int64  
 11  Weight                        

Transform

In [15]:
def transform_players_active_df_v2(df):
    # Step 1: Select only the relevant columns. Adjust this list as per your requirement.
    relevant_columns = ['PlayerID', 'FirstName', 'LastName', 'Status', 'TeamID', 'Team', 'Position', 'Height', 'Weight', 'BirthDate', 'BirthCity', 'BirthState']
    df = df[relevant_columns]

    # Step 2: Drop duplicates based on 'PlayerID'
    df.drop_duplicates(subset='PlayerID', inplace=True)

    # Step 3: Combine 'BirthCity' and 'BirthState' into 'BirthPlace'
    # Use a lambda function to handle rows where either city or state might be missing
    df['BirthPlace'] = df.apply(lambda row: ','.join(filter(None, [row['BirthCity'], row['BirthState']])), axis=1)
    
    # Format BirthDate to mm/dd/yyyy
    df['BirthDate'] = pd.to_datetime(df['BirthDate']).dt.strftime('%m/%d/%Y')

    # Now, you can drop the original 'BirthCity' and 'BirthState' columns as they're no longer needed
    df.drop(['BirthCity', 'BirthState'], axis=1, inplace=True)

    # Inspect the transformed DataFrame (Step 4)
    print(df.head())
    print(df.info())

    return df

# Apply the transformation function
players_active_df_transformed = transform_players_active_df_v2(players_active_df)

# This will print the first few rows of the transformed DataFrame and its info, allowing you to inspect it and decide how to proceed.


   PlayerID FirstName    LastName           Status  TeamID Team Position  \
0  30000007     Carey       Price  Injured Reserve       4  MON        G   
1  30000012      Lars       Eller           Active      14  PIT        C   
2  30000015   Brendan   Gallagher           Active       4  MON       RW   
3  30000019       Max  Pacioretty           Active      15  WAS       LW   
4  30000031     Peter     Holland           Minors      19  COL        C   

   Height  Weight   BirthDate      BirthPlace  
0      75     217  08/16/1987  Anahim Lake,BC  
1      74     205  05/08/1989         Rodovre  
2      69     183  05/06/1992     Edmonton,AB  
3      74     217  11/20/1988   New Canaan,CT  
4      74     193  01/14/1991      Toronto,ON  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1857 entries, 0 to 1856
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PlayerID    1857 non-null   int64 
 1   FirstName   1857 non-nu

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset='PlayerID', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BirthPlace'] = df.apply(lambda row: ','.join(filter(None, [row['BirthCity'], row['BirthState']])), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BirthDate'] = pd.to_datetime(df['BirthDate']).dt.strftime('%m/%d/%Y')
A value is trying to be s

In [16]:
players_active_df_transformed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1857 entries, 0 to 1856
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PlayerID    1857 non-null   int64 
 1   FirstName   1857 non-null   object
 2   LastName    1857 non-null   object
 3   Status      1857 non-null   object
 4   TeamID      1857 non-null   int64 
 5   Team        1857 non-null   object
 6   Position    1857 non-null   object
 7   Height      1857 non-null   int64 
 8   Weight      1857 non-null   int64 
 9   BirthDate   1856 non-null   object
 10  BirthPlace  1857 non-null   object
dtypes: int64(4), object(7)
memory usage: 159.7+ KB


In [None]:
import psycopg2

def load_data_to_db(transformed_data):
    conn = psycopg2.connect("dbname=NHL" user=)

In [3]:
# Establish database connection

import psycopg2

def create_connection():
    try:
        conn = psycopg2.connect(
            dbname='NHL',
            user='postgres',
            password='',
            host='localhost'

        )
        print("Successfully connected to the database.")
        return conn
    except psycopg2.Error as e:
        print(f'Error: Could not make connection to PostGreSQL database')
        print(e)

conn = create_connection()

Error: Could not make connection to PostGreSQL database
fe_sendauth: no password supplied

