In [2]:
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [3]:
#read in initial datasets
data = pd.read_csv("2010-2014.csv", low_memory=False)
data2 = pd.read_csv("2014-2021.csv", low_memory=False)

In [4]:
#examine null prcp stats
print('Overall dataset size: ', data['PRCP'].size + data2['PRCP'].size)
print('Dataset size w/o null PRCP values: ', data.loc[data['PRCP'].notnull()]['PRCP'].size + data2.loc[data2['PRCP'].notnull()]['PRCP'].size)
print('Percentage of null values: ', (data.loc[data['PRCP'].isnull()]['PRCP'].size + data2.loc[data2['PRCP'].isnull()]['PRCP'].size) / (data['PRCP'].size + data2['PRCP'].size) * 100)

Overall dataset size:  141484
Dataset size w/o null PRCP values:  107329
Percentage of null values:  24.140538859517687


In [5]:
#create new dataset w/o null prcp rows
dataFiltered = data[data['PRCP'].notnull()]
data2Filtered = data2[data2['PRCP'].notnull()]

#append to create a single dataset
data = dataFiltered.append(data2Filtered)

#check to make sure this worked correctly
print('New dataset size: ', data['PRCP'].size)

New dataset size:  107329


In [7]:
#create new dataset w/o null prcp rows
dataFiltered = data[data['ELEVATION'].notnull()]

#append to create a single dataset
data = dataFiltered

#check to make sure this worked correctly
print('New dataset size: ', data['PRCP'].size)

New dataset size:  107322


In [8]:
#split date data into a day, month, year column for each row
data['DATE'] = pd.to_datetime(data['DATE'])
data['DAY'] = data['DATE'].dt.day
data['MONTH'] = data['DATE'].dt.month
data['YEAR'] = data['DATE'].dt.year

#check to verify
data[['DATE', 'DAY', 'MONTH', 'YEAR']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DATE'] = pd.to_datetime(data['DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['DAY'] = data['DATE'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['MONTH'] = data['DATE'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

Unnamed: 0,DATE,DAY,MONTH,YEAR
0,2013-01-02,2,1,2013
1,2013-01-03,3,1,2013
2,2013-01-04,4,1,2013
3,2013-01-05,5,1,2013
4,2013-01-06,6,1,2013
...,...,...,...,...
82521,2021-01-01,1,1,2021
82522,2021-01-01,1,1,2021
82523,2021-01-01,1,1,2021
82524,2021-01-01,1,1,2021


In [12]:
#option to load dataframe if this cell was previously run
if input("Enter 'y' to load a csv instead of generating a new dataframe:") == 'y':
    name = input('Enter filename to load (must be in local dir):')
    try: 
        dataUnique = pd.read_csv(name, low_memory=False)
        print('Loaded, dataframe size: ', dataUnique['PRCP'].size)
        #convert date column from string to datetime object
        dataUnique['DATE'] =  pd.to_datetime(dataUnique['DATE'])
    except:
        print('Error loading dataframe, rerun this cell')

else:
    print('Generating dataframe, this may take several minutes...')
    
    #timing to check efficiency
    import time
    startTime = time.time()

    #build new dataframe
    uniqueID = set()
    listRows = []
    update = {}
    rowsToUpdate = {}

    #loop thru each row in the dataset and check for unique combinations of 'DATE' and 'STATION'
    #if the combination has not been found, add new row with all data, else add the 'PRCP' val and id to a dict to update later
    #Based on ShikharDua's answer here: https://stackoverflow.com/questions/10715965/create-pandas-dataframe-by-appending-one-row-at-a-time/17496530#17496530
    for r in data.iterrows():
        id = r[1]['STATION'] + str(r[1]['DATE'])
        if id not in uniqueID:
            newRow = {'ID': id, 'DATE': r[1]['DATE'], 'YEAR': r[1]['YEAR'], 'MONTH': r[1]['MONTH'], 'DAY': r[1]['DAY'], 'LATITUDE': r[1]['LATITUDE'], 'LONGITUDE': r[1]['LONGITUDE'], 'ELEVATION': r[1]['ELEVATION'], 'PRCP': r[1]['PRCP']}
            listRows.append(newRow)
            uniqueID.add(id)
        else:
            update = {id: r[1]['PRCP']}
            rowsToUpdate.update(update)

    #create new dataset with listRows
    dataUnique = pd.DataFrame(listRows)

    #second loop thru rowsToUpdate to update dataframe
    for i, p in rowsToUpdate.items():
        dataUnique.loc[dataUnique['ID'] == i]['PRCP'] += p

    #print total time to run this cell
    print('Time to run: ', time.time() - startTime)

    print('New dataset size: ', dataUnique['PRCP'].size)

    #option to save dataframe to skip rerunning this cell
    if input("Enter 'y' to save the generated dataframe:") == 'y':
        name = input('Enter filename to save as:')
        dataUnique.to_csv(name+'.csv', index=False, header=True)
        print('Saved file to local directory')

Enter 'y' to load a csv instead of generating a new dataframe:n
Generating dataframe, this may take several minutes...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataUnique.loc[dataUnique['ID'] == i]['PRCP'] += p


Time to run:  54.60746955871582
New dataset size:  43136
Enter 'y' to save the generated dataframe:y
Enter filename to save as:filteredData
Saved file to local directory


In [1]:
#graph timeseries vs prcp
plt.scatter(dataUnique['DATE'], dataUnique['PRCP'], s=10)
plt.title('Daily precipitation totals 2010-2021')
plt.ylabel('Precipitation (in)')
plt.show()

plt.plot(np.arange(1, 13), dataUnique.groupby(['MONTH']).mean()['PRCP'])
plt.title('Monthly precipitation averages 2010-2021')
plt.xlabel('Month')
plt.ylabel('Average daily precipitation (in)')
plt.xticks(np.arange(1, 13, step=1))
plt.yticks(np.arange(0, 0.55, step=0.05))
plt.show()

dataUnique.describe()

NameError: name 'plt' is not defined