In [1]:
from pathlib import Path
import sys

# Make the parent of the current working directory importable.
# NOTE(review): presumably this is where project-local modules such as api_key live — confirm.
sys.path.append(str(Path().absolute().parent))
# sys.path

## Ingestion

Includes:
- Requesting `playlistItems`
- Polling
- Making and executing the request
- Storing the raw information

### Requesting Playlist Items

Concerned with making the request. The next section includes functions for logging requests that the `request_playlist` function can use.

Polling should be last. It uses this and the next section as building blocks. Also transitions perfectly into transformation, as the JSON files need to be compiled.

In [5]:
import api_key
import json

# need to actually send the request to youtube. params filled out, except for pageToken. logging handled in next section
def request_playlist(youtube, maxResults=50, pageToken=""):
    '''
    Fetch one page of playlistItems from the IGN game reviews playlist.

    Args:
        youtube: client object; only youtube.playlistItems().list(...).execute() is used.
        maxResults: number of items to ask for per page (in just about every case 50).
        pageToken: token of the page to fetch; the default "" is what a first request uses.

    Side effects:
        - appends one row of request metadata to the response log via append_response_csv()
        - writes the response to ./raw/<requestNumber>.json

    Returns:
        The response with an extra "requestNumber" key added to it.
    '''
    # Only maxResults and pageToken vary between calls; part and playlistId are static.
    params = dict(
        part='contentDetails,id,snippet,status',
        playlistId="PLraFbwCoisJBTl0oXn8UoUam5HXWUZ7ES",
        maxResults=maxResults,
        pageToken=pageToken,
    )

    request = youtube.playlistItems().list(**params)
    request_number = generate_request_number()

    # Timestamps bracket the network call only.
    time_sent = get_current_time()
    response = request.execute()
    time_received = get_current_time()

    # Metadata row for the response log; nested params are flattened by the logger.
    append_response_csv({
        "requestNumber": request_number,
        "timeSent": time_sent,
        "timeReceived": time_received,
        "params": dict(params),
    })

    # Raw payload goes to a directory of JSON files; JSON -> CSV conversion is left
    # to the transformation stage.
    response["requestNumber"] = request_number
    with open(f"./raw/{response['requestNumber']}.json", "w") as fp:
        json.dump(response, fp)

    return response

### Log responses from `request_playlist`

The following information is recorded for each request sent and response received:
- `requestNumber`
- `timeSent`
- `timeReceived`
- `params.part`
- `params.playlistId`
- `params.maxResults`
- `params.pageToken`

Logging also helps with handing out request numbers.

In [6]:
# need a place to log info

import pandas as pd

# init/reset response.csv. careful with this function.
def reset_response_csv(target="./db/response.csv"):
    '''
    Overwrite the target csv with a header row only (no records).

    Each record later appended to this file describes one request made and
    received by request_playlist().

    Args:
        target: path of the csv file to (re)initialise. Existing content is replaced.

    Returns:
        The empty, columns-only DataFrame that was written.
    '''
    # Parameter names mirror the keys of the params dict in request_playlist().
    param_names = ("part", "playlistId", "maxResults", "pageToken")

    column_names = ['requestNumber', 'timeSent', 'timeReceived']
    column_names.extend(f'params.{name}' for name in param_names)

    headers_df = pd.DataFrame(columns=column_names)
    headers_df.to_csv(target, index=False)
    return headers_df

# edit response.csv
def append_response_csv(data_dict, target="./db/response.csv"):
    '''
    Append one record to the target csv.

    data_dict is flattened with pd.json_normalize (nested keys become dotted
    column names) and written in append mode without a header row, so the
    target is expected to already start with matching headers.

    Returns:
        The flattened DataFrame that was appended.
    '''
    row_df = pd.json_normalize(data_dict)
    row_df.to_csv(target, mode='a', header=False, index=False)
    return row_df

# add time requested to the received response
from datetime import datetime, timezone

# just use utc time. "central"
def get_current_time(timezone=timezone.utc):
    '''
    Return the current time as a timezone-aware datetime.

    Args:
        timezone: tzinfo to express the time in; UTC unless overridden.
    '''
    return datetime.now(timezone)

def serialize_time(dt):
    '''Return the ISO 8601 string produced by dt.isoformat().'''
    iso_string = dt.isoformat()
    return iso_string

def deserialize_datetime_string(dt_string):
    '''
    Parse an ISO 8601 string (as produced by serialize_time) back into a datetime.

    Args:
        dt_string: string in the format emitted by datetime.isoformat().

    Returns:
        The corresponding datetime; timezone-aware if the string carries an offset.

    Raises:
        ValueError: if dt_string is not a valid ISO format string.
    '''
    # fromisoformat is a classmethod of datetime, not a method of str.
    return datetime.fromisoformat(dt_string)

# add request number
def generate_request_number(source="./db/response.csv"):
    '''
    Hand out the next request number based on the response log.

    Args:
        source: path of the csv log; it must have a 'requestNumber' column.

    Returns:
        int: 0 when the log has no usable request numbers yet, otherwise the
        largest logged number plus one. Always a plain Python int, so the value
        can be passed to json.dump (numpy integers are not JSON serializable).
    '''
    request_numbers = pd.read_csv(source)['requestNumber']
    latest = request_numbers.max()
    if pd.isna(latest):
        # An empty column has no maximum (NaN), so numbering starts at 0.
        return 0
    return int(latest) + 1

# md = {}
# md["requestNumber"] = 1
# md["timeSent"] = 0
# md["timeReceived"] = 2
# md["params"] = {**params}

# append_response_csv(md)

### Polling

Part ingestion, as the playlist pages need to be requested from YouTube and stored in the `raw` folder.
Part transformation, as a list of old video ids needs to be compiled to see whether there are new videos in the playlist.

In [116]:
# just need a function that see if there's a new video in the playlist that's not in the playlist.csv file

# converting dictionary to df just makes querying so much nicer, as seen in request_video_id.isin(old_video_id)
def playlist_response_to_df(playlist_dictionary):
    '''
    Flatten a playlistItems response into a DataFrame with one row per item.

    Assumes playlist_dictionary has the form of a response from
    youtube.playlistItems().list() and has a requestNumber attached to it.
    Nested item fields become dotted column names; the response-level fields
    listed below are repeated on every row under a 'meta.' prefix. Missing
    response-level fields are tolerated (errors='ignore').

    Having a DataFrame allows structured querying rather than document search.
    '''
    response_level_fields = ['kind', 'etag', 'nextPageToken', 'pageInfo', 'requestNumber']
    return pd.json_normalize(
        playlist_dictionary,
        record_path='items',
        meta=response_level_fields,
        meta_prefix='meta.',
        errors='ignore',
    )

def poll_playlist():
    '''
    Check whether the playlist holds at least one video that is not yet in ./db/playlist.csv.

    The first page of the playlist is requested via request_playlist(). While every
    video id on the current page is new and the response carries a nextPageToken,
    the next page is requested as well.

    Returns:
        bool: True if any of the requested pages contained a video id that is
        missing from playlist.csv, False otherwise.

    Should work when there are no rows (besides header) in the playlist.csv.
    TODO: figure out how to consume
    '''
    # we're going to be comparing video ids
    video_id_key = 'snippet.resourceId.videoId'
    old_video_ids = pd.read_csv('./db/playlist.csv')[video_id_key]

    youtube = api_key.get_youtube()
    response = request_playlist(youtube)
    requested_video_ids = playlist_response_to_df(response)[video_id_key]
    print(len(requested_video_ids), end='\r')

    are_old = requested_video_ids.isin(old_video_ids)
    # Remember new videos across pages: a later page that is entirely old must not
    # hide the new videos seen on an earlier page.
    has_new_videos = bool((~are_old).any())
    nextPageToken = response.get('nextPageToken')

    # if all the requested ones are new, then check the next page
    while (~are_old).all() and nextPageToken:
        response = request_playlist(youtube, pageToken=nextPageToken)
        requested_video_ids = playlist_response_to_df(response)[video_id_key]
        are_old = requested_video_ids.isin(old_video_ids)
        has_new_videos = has_new_videos or bool((~are_old).any())
        nextPageToken = response.get('nextPageToken')
        print(len(requested_video_ids), end='\r')

    return has_new_videos

In [117]:
# uncomment below to see polling
# if ./db/playlist.csv is empty (except for header), should currently display around ~2100-2200 videos
# the same number as the number of videos in the playlist
# poll_playlist()

## Transformation

Includes:
- Consuming
- Consolidating all the raw JSON files into one CSV
- Unnesting dicts

### About combining JSON files

Process:
1. JSON to DataFrame
2. Concat (or stack) DataFrame with previous files
3. Write DataFrame to CSV

Step 0 is to initialize the csv file with, say, the first response.

In [118]:
# helper functions for consume_raw
import json

def gather_jsons_to_df(directory='./raw'):
    '''
    Load every *.json file in directory and stack the results into one DataFrame.

    Each file is converted with json_to_df(); the frames are concatenated row-wise.
    Row indices are not reset. With no json files present the result is an empty
    DataFrame.

    Quick check:
    # big_df = gather_jsons_to_df()
    # big_df.shape
    '''
    # Seed with an empty frame so pd.concat always has something to concatenate.
    frames = [pd.DataFrame()]
    frames.extend(json_to_df(json_path) for json_path in Path(directory).glob("*.json"))
    return pd.concat(frames, axis=0)

def json_to_df(path):
    '''
    Read one stored response (a JSON file) and flatten it with playlist_response_to_df().

    Args:
        path: location of the JSON file (anything open() accepts).

    Quick checks:
    # df = json_to_df('./raw/0.json')
    # print(df.shape)
    # print(df['snippet.resourceId.videoId'][:2])
    # df.head(2)

    # paths = Path('./raw').glob("*.json")
    # json_to_df(list(paths)[0])
    '''
    with open(path, 'r') as fp:
        response_dict = json.load(fp)

    return playlist_response_to_df(response_dict)

In [82]:
# Scratch check: load the video ids already stored in playlist.csv and display them.
videoId_key = 'snippet.resourceId.videoId'
old_video_ids = pd.read_csv("./db/playlist.csv")[videoId_key]
old_video_ids

Series([], Name: snippet.resourceId.videoId, dtype: object)

In [113]:
def consume_raw(directory='./raw'):
    '''
    Work in progress: select the rows from the raw JSON files whose video id is
    not yet in ./db/playlist.csv.

    Currently the selection is computed and then discarded; nothing is written
    and the function returns None.

    Still to do:
    - a reset function for the playlist db
    - append the new rows to the playlist db if necessary
    - move consumed files to a consumed folder
    '''
    videoId_key = 'snippet.resourceId.videoId'

    # Gather everything first so the comparison is one batch operation in memory.
    big_df = gather_jsons_to_df(directory)
    old_df = pd.read_csv("./db/playlist.csv")

    # True for received rows whose video id is absent from playlist.csv.
    is_new = ~big_df[videoId_key].isin(old_df[videoId_key])

    # for now, these are the rows that would be concatenated onto playlist.csv
    to_concat = big_df[is_new]

    return

In [None]:
# the function to run at 3 am every Friday
def update_db_playlist():
    '''
    Entry point meant to run on a schedule (3 am every Friday): poll the playlist
    and select the rows for new videos.

    NOTE(review): this unpacks three values (has_new_vids, mask, playlist_items)
    from poll_playlist(), while the poll_playlist defined earlier in this notebook
    returns a single boolean — confirm which contract is intended before use.

    TODO: the selected rows are not persisted yet; the function returns None.
    '''
    has_new_vids, mask, playlist_items = poll_playlist()
    if has_new_vids:
        response_df = playlist_response_to_df(playlist_items)
        new_vids = response_df[mask]

        # TODO: concat new_vids to the stored playlist (left unfinished)

In [35]:
# Scratch: pick the rows of the latest response that are new.
# NOTE(review): has_new_vids, mask, playlist_items and convert_playlist_item_to_df are not
# defined at notebook level in any visible cell — this relies on leftover kernel state; confirm.
new_vids = pd.DataFrame()
if has_new_vids:
    response_df = convert_playlist_item_to_df(playlist_items)
    new_vids = response_df[mask]
    
new_vids

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,snippet.resourceId.videoId,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoId,contentDetails.videoPublishedAt,status.privacyStatus,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo
0,youtube#playlistItem,Q1D-3Jq56ohEvfzNItyJ1LXvv4g,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-12-04T18:29:57Z,UCKy1dAqELo0zrOtPkf0eTMw,PlayStation Access Controller Review,The PlayStation Access Controller software set...,https://i.ytimg.com/vi/J2J79O5ItSk/default.jpg,120.0,90.0,...,J2J79O5ItSk,IGN,UCKy1dAqELo0zrOtPkf0eTMw,J2J79O5ItSk,2023-12-04T18:29:58Z,public,youtube#playlistItemListResponse,FilcTWfmuyvSlRAFYtTYj48FfVA,EAAaL1BUOkNESWlFRUl4TVRKQk56QXhNa0UxUTBFM1JEZ2...,"{'totalResults': 2180, 'resultsPerPage': 50}"
1,youtube#playlistItem,qGIjm1MI_seDg0xcAel1qcumuj8,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-12-01T20:41:50Z,UCKy1dAqELo0zrOtPkf0eTMw,The Walking Dead: Destinies Review,The Walking Dead: Destinies reviewed on PlaySt...,https://i.ytimg.com/vi/3r0PnCUMvJ0/default.jpg,120.0,90.0,...,3r0PnCUMvJ0,IGN,UCKy1dAqELo0zrOtPkf0eTMw,3r0PnCUMvJ0,2023-12-01T20:43:33Z,public,youtube#playlistItemListResponse,FilcTWfmuyvSlRAFYtTYj48FfVA,EAAaL1BUOkNESWlFRUl4TVRKQk56QXhNa0UxUTBFM1JEZ2...,"{'totalResults': 2180, 'resultsPerPage': 50}"


In [4]:
# Request the first page of the playlist (default pageToken) and display the raw response.
# Note: request_playlist also logs the request and writes the response into ./raw.
youtube = api_key.get_youtube()
response = request_playlist(youtube)
response

{'kind': 'youtube#playlistItemListResponse',
 'etag': 'AGjm-pXbCNc4AdtXW3mOQodpZSs',
 'nextPageToken': 'EAAaL1BUOkNESWlFRUl4TVRKQk56QXhNa0UxUTBFM1JEZ29BVWp3MVl2eHRQYUNBMUFC',
 'items': [{'kind': 'youtube#playlistItem',
   'etag': 'Q1D-3Jq56ohEvfzNItyJ1LXvv4g',
   'id': 'UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy5GRUEzMkJEODIxOUQ0N0RC',
   'snippet': {'publishedAt': '2023-12-04T18:29:57Z',
    'channelId': 'UCKy1dAqELo0zrOtPkf0eTMw',
    'title': 'PlayStation Access Controller Review',
    'description': 'The PlayStation Access Controller software settings are revolutionary for accessible hardware, but its physical design prevents access to most buttons for physically disabled players with limited reach, atrophied hands, or limited strength.\n\n#IGN #Gaming #PS5',
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/J2J79O5ItSk/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/J2J79O5ItSk/mqdefault.jpg',
      'width': 320

In [5]:
# Show the top-level keys of the response.
response.keys()

dict_keys(['kind', 'etag', 'nextPageToken', 'items', 'pageInfo'])

In [14]:
# Peek at the first two rows currently stored in playlist.csv.
old_df = pd.read_csv('./db/playlist.csv')
old_df.head(2)

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,snippet.resourceId.videoId,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoId,contentDetails.videoPublishedAt,status.privacyStatus,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo
0,youtube#playlistItem,q191DwQJ3wvvueDrSGWosqIBW90,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-30T00:58:29Z,UCKy1dAqELo0zrOtPkf0eTMw,SteamWorld Build Review,SteamWorld Build reviewed by Jon Bolding on PC...,https://i.ytimg.com/vi/D4HMQ72YFSw/default.jpg,120.0,90.0,...,D4HMQ72YFSw,IGN,UCKy1dAqELo0zrOtPkf0eTMw,D4HMQ72YFSw,2023-11-30T01:00:43Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"
1,youtube#playlistItem,cAjwMN-Zc21O_2gyZs5PBKh_4_M,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-29T22:17:57Z,UCKy1dAqELo0zrOtPkf0eTMw,Lethal Company Early Access Review,Lethal Company reviewed in Early Access by Gab...,https://i.ytimg.com/vi/jK9rWsJcZAM/default.jpg,120.0,90.0,...,jK9rWsJcZAM,IGN,UCKy1dAqELo0zrOtPkf0eTMw,jK9rWsJcZAM,2023-11-29T22:17:57Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"


In [15]:
import pandas as pd
import json

def convert_json_to_df(pathname):
    '''
    Load a stored youtube.playlistItems().list() response and flatten it.

    Args:
        pathname: path to the JSON file holding the response.

    Returns:
        DataFrame with one row per entry of 'items'; the response-level fields
        kind, etag, nextPageToken and pageInfo are repeated on each row under a
        'meta.' prefix (missing ones are tolerated).
    '''
    with open(pathname, "r") as fp:
        payload = json.load(fp)

    response_level_fields = ['kind', 'etag', 'nextPageToken', 'pageInfo']
    return pd.json_normalize(
        payload,
        record_path='items',
        meta=response_level_fields,
        meta_prefix="meta.",
        errors='ignore',
    )

# read a json file
# Collect the raw JSON paths as strings and pick the first one as a test file.
# The sort is on strings, i.e. lexicographic ("10.json" sorts before "2.json").
json_pathnames = [str(f) for f in Path("./raw").glob("*.json")]
json_pathnames.sort()

test_json = json_pathnames[0]
test_json

'raw/0.json'

In [16]:
# Preview the first flattened row of the test file.
convert_json_to_df(test_json).head(1)

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,snippet.resourceId.videoId,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoId,contentDetails.videoPublishedAt,status.privacyStatus,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo
0,youtube#playlistItem,q191DwQJ3wvvueDrSGWosqIBW90,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-30T00:58:29Z,UCKy1dAqELo0zrOtPkf0eTMw,SteamWorld Build Review,SteamWorld Build reviewed by Jon Bolding on PC...,https://i.ytimg.com/vi/D4HMQ72YFSw/default.jpg,120.0,90.0,...,D4HMQ72YFSw,IGN,UCKy1dAqELo0zrOtPkf0eTMw,D4HMQ72YFSw,2023-11-30T01:00:43Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"


In [17]:
# write the first one in so headers are in place
# Initialise (overwrite) playlist.csv with the first response so the headers are in place.
df = convert_json_to_df(test_json)
df.to_csv("./db/playlist.csv", index=False)
# TODO:
# function for clearing rows, except headers

In [23]:
playlist_csv_dir = "./db/playlist.csv"

# Start from what is already stored, then add every raw JSON file.
# The frames are collected first and concatenated once, instead of re-copying the
# growing frame on every loop iteration.
# NOTE(review): if playlist.csv already holds rows from one of these files, those
# rows end up in big_df twice — confirm whether de-duplication is wanted here.
frames = [pd.read_csv(playlist_csv_dir)]
frames.extend(convert_json_to_df(j) for j in json_pathnames)
big_df = pd.concat(frames, axis=0)

big_df.shape

(2178, 36)

In [24]:
# Overwrite playlist.csv with the consolidated frame (without the index column).
big_df.to_csv(playlist_csv_dir, index=False)

In [25]:
# Round-trip check: read playlist.csv back and show the first rows to confirm the write worked.
pd.read_csv(playlist_csv_dir).head()

Unnamed: 0,kind,etag,id,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,...,snippet.resourceId.videoId,snippet.videoOwnerChannelTitle,snippet.videoOwnerChannelId,contentDetails.videoId,contentDetails.videoPublishedAt,status.privacyStatus,meta.kind,meta.etag,meta.nextPageToken,meta.pageInfo
0,youtube#playlistItem,q191DwQJ3wvvueDrSGWosqIBW90,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-30T00:58:29Z,UCKy1dAqELo0zrOtPkf0eTMw,SteamWorld Build Review,SteamWorld Build reviewed by Jon Bolding on PC...,https://i.ytimg.com/vi/D4HMQ72YFSw/default.jpg,120.0,90.0,...,D4HMQ72YFSw,IGN,UCKy1dAqELo0zrOtPkf0eTMw,D4HMQ72YFSw,2023-11-30T01:00:43Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"
1,youtube#playlistItem,cAjwMN-Zc21O_2gyZs5PBKh_4_M,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-29T22:17:57Z,UCKy1dAqELo0zrOtPkf0eTMw,Lethal Company Early Access Review,Lethal Company reviewed in Early Access by Gab...,https://i.ytimg.com/vi/jK9rWsJcZAM/default.jpg,120.0,90.0,...,jK9rWsJcZAM,IGN,UCKy1dAqELo0zrOtPkf0eTMw,jK9rWsJcZAM,2023-11-29T22:17:57Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"
2,youtube#playlistItem,Kiig37DtnmX5IjVMvwF7Zj73mV4,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-22T05:37:45Z,UCKy1dAqELo0zrOtPkf0eTMw,Bluey: The Videogame Video Review,Bluey: The Videogame reviewed by Luke Reilly o...,https://i.ytimg.com/vi/_ELijbbsFxs/default.jpg,120.0,90.0,...,_ELijbbsFxs,IGN,UCKy1dAqELo0zrOtPkf0eTMw,_ELijbbsFxs,2023-11-22T05:39:40Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"
3,youtube#playlistItem,UFiIjmkfyC8UWJrf5Pu123MdcTg,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-22T01:06:30Z,UCKy1dAqELo0zrOtPkf0eTMw,PlayStation Pulse Explore Wireless Earbuds Review,Sony’s Pulse Explore Earbuds are laser-focused...,https://i.ytimg.com/vi/7wHl8HzvRcI/default.jpg,120.0,90.0,...,7wHl8HzvRcI,IGN,UCKy1dAqELo0zrOtPkf0eTMw,7wHl8HzvRcI,2023-11-22T16:00:04Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"
4,youtube#playlistItem,QAlTJCXnZfvqHWjZyX0Y0Dh_-_s,UExyYUZid0NvaXNKQlRsMG9YbjhVb1VhbTVIWFdVWjdFUy...,2023-11-21T06:46:53Z,UCKy1dAqELo0zrOtPkf0eTMw,Flashback 2 Review,Flashback 2 reviewed by Tristan Ogilvie on Pla...,https://i.ytimg.com/vi/u0yA9jPNDc8/default.jpg,120.0,90.0,...,u0yA9jPNDc8,IGN,UCKy1dAqELo0zrOtPkf0eTMw,u0yA9jPNDc8,2023-11-21T06:49:06Z,public,youtube#playlistItemListResponse,I0myo1_FOBPYpac5a9VYovdGA_A,EAAaL1BUOkNESWlFRVF6TmpaR1FqUTFNa00wTVVFMFJVWW...,"{'totalResults': 2178, 'resultsPerPage': 50}"


#### Tests

##### Test:
- using the `columns` keyword argument in `to_csv` so that new columns can be added while keeping old ones

In [111]:
# Fixture: a two-column frame, written to test.csv (overwriting it).
df1 = pd.DataFrame({
    "animal": ["cow", "parrot"],
    "name": ["moomoo", "polly"]
})
df1.to_csv("test.csv", index=False)
df1

Unnamed: 0,animal,name
0,cow,moomoo
1,parrot,polly


In [93]:
# Fixture: one row with an extra "legs" column that df1 does not have.
df2 = pd.DataFrame({
    "animal": ["scorpion"],
    "name": ["gliscor"],
    "legs": [6]
})
df2

Unnamed: 0,animal,name,legs
0,scorpion,gliscor,6


In [100]:
# Read test.csv back; head(0) keeps the columns but no rows.
read_df = pd.read_csv("test.csv")
read_df.head(0)

Unnamed: 0,animal,name


In [109]:
# really just need a way to get the right column order.

# first approach
# concat two empty (headers-only) frames: the old columns from the csv first,
# then the new frame's columns, so new columns are added after the old ones
to_concat = [
    read_df.head(0),
    df2.head(0)
]
cat_df = pd.concat(to_concat)
print(list(cat_df.columns))
cat_df

['animal', 'name', 'legs']


Unnamed: 0,animal,name,legs


In [112]:
# Expected to fail: df1 has no "legs" column, so asking to_csv for it raises KeyError.
df1.to_csv('test.csv', mode='a', header=False, index=False, columns=list(cat_df.columns))

KeyError: "['legs'] not in index"

In [110]:
# see if appending to csv with the concatenated column list will add the new columns
# (df2 has all three columns; header=False, so no header line is written by this call)
df2.to_csv('test.csv', columns=list(cat_df.columns), mode='a', header=False, index=False)

##### Test:
- CSV file has 2 headers
- Append to that CSV a dataframe with 3 headers

Does the CSV "properly" update?

In [88]:
# Fixture: a two-column frame, written to test.csv (overwriting it).
df1 = pd.DataFrame({
    "animal": ["cow", "parrot"],
    "name": ["moomoo", "polly"]
})
df1.to_csv("test.csv", index=False)
df1

Unnamed: 0,animal,name
0,cow,moomoo
1,parrot,polly


In [89]:
# Fixture: one row with a third column ("legs") that the csv does not have.
df2 = pd.DataFrame({
    "animal": ["scorpion"],
    "name": ["gliscor"],
    "legs": [8]
})
df2

Unnamed: 0,animal,name,legs
0,scorpion,gliscor,8


In [90]:
# Append df2 together with its own header line; in append mode the existing
# two-column header at the top of the file is not rewritten.
df2.to_csv("test.csv", mode="a", index=False, header=True) # maybe it's the header argument?

Doesn't seem like there's a way to directly add a column to a CSV file. You would have to read the CSV file as a data frame, concatenate it with the new one, then write to CSV (not append). Surprised that a method does not exist to do that efficiently.

For now, if there's a new column, go in write mode. Otherwise, append.

##### Test: Read CSV with fewer headers/columns than in-memory data frames

In [31]:
# Load the current contents of test.csv.
# NOTE(review): the displayed rows depend on what an earlier run left in test.csv — confirm.
df1 = pd.read_csv("test.csv")
df1

Unnamed: 0,animal,name
0,cow,moo
1,parrot,polly


In [32]:
# In-memory copy with one more column ("legs") than the csv has.
df2 = df1.copy()
df2['legs'] = [4, 2]
df2

Unnamed: 0,animal,name,legs
0,cow,moo,4
1,parrot,polly,2


In [35]:
# Stack the frame read from csv on top of the wider one, with a fresh 0..n-1 index.
# Rows coming from df1 have no value for "legs" (shown empty in the output).
df3 = pd.concat([df1, df2], ignore_index=True)
df3

Unnamed: 0,animal,name,legs
0,cow,moo,
1,parrot,polly,
2,cow,moo,4.0
3,parrot,polly,2.0


In [36]:
# Write (not append) the combined frame, so the csv header gains the new column.
df3.to_csv("test.csv", index=False)

In [37]:
# Read the file back to confirm all three columns were saved.
df4 = pd.read_csv("test.csv")
df4

Unnamed: 0,animal,name,legs
0,cow,moo,
1,parrot,polly,
2,cow,moo,4.0
3,parrot,polly,2.0
