# Preliminary examination of logged electricity demand and generation data from the SSEN website

## Copy the database, load the sql extension and connect to the copy

In [1]:
# Work on a copy of the original database so that examination and cleaning
# never touch the logger's own file. copyfile replaces an existing destination,
# so re-running this cell resets the copy to the current state of the original.

import os
from shutil import copyfile

SOURCE_DB = '../logging/database/eleclog.db'
WORKING_DB = './database/eleclog_copy.db'

# copyfile does not create missing directories; make sure the target one exists.
os.makedirs(os.path.dirname(WORKING_DB), exist_ok=True)

# copyfile returns the destination path, which is shown as the cell output.
copyfile(SOURCE_DB, WORKING_DB)

'./database/eleclog_copy.db'

In [2]:
# Load the extension that provides the %sql / %%sql magics used in the cells below.
%load_ext sql

In [3]:
# Connect to the working copy (not the original database); later %sql cells run against it.
%sql sqlite:///./database/eleclog_copy.db

'Connected: @./database/eleclog_copy.db'

In [4]:
# Total number of rows in the readings table before any cleaning.
%sql SELECT COUNT(*) FROM readings;

 * sqlite:///./database/eleclog_copy.db
Done.


COUNT(*)
395583


## Get some basic information about the database

In [5]:
# List each column of the readings table with its declared type and constraint flags.
%sql pragma table_info('readings')

 * sqlite:///./database/eleclog_copy.db
Done.


cid,name,type,notnull,dflt_value,pk
0,timestamp,DATETIME,0,,0
1,demand,NUMERIC,0,,0
2,demand_max,NUMERIC,0,,0
3,anm_generated,NUMERIC,0,,0
4,non_anm_generated,NUMERIC,0,,0
5,total_generation_capacity,NUMERIC,0,,0


In [6]:
# Smallest and largest timestamp values, i.e. the period covered by the log.
%sql SELECT min(timestamp), max(timestamp) FROM readings;

 * sqlite:///./database/eleclog_copy.db
Done.


min(timestamp),max(timestamp)
2019-01-16 14:36:03,2019-10-27 21:48:06


In [7]:
# Peek at ten rows to see what individual readings look like.
%sql SELECT * from readings limit 10;

 * sqlite:///./database/eleclog_copy.db
Done.


timestamp,demand,demand_max,anm_generated,non_anm_generated,total_generation_capacity
2019-01-16 14:36:03,23.22,35.7,12.478,17.782,57.1
2019-01-16 14:36:12,23.11,35.7,12.801,17.716,57.1
2019-01-16 14:36:17,23.11,35.7,12.801,17.716,57.1
2019-01-16 14:52:03,23.48,35.7,14.107,17.92,57.1
2019-01-16 14:53:03,23.37,35.7,14.039,17.936,57.1
2019-01-16 14:54:04,23.83,35.7,14.611,17.721,57.10000000000001
2019-01-16 14:55:03,23.48,35.7,14.523,17.698,57.1
2019-01-16 14:56:03,23.5,35.7,14.675,17.816,57.10000000000001
2019-01-16 14:57:03,22.77,35.7,14.562,17.157,57.1
2019-01-16 14:58:03,23.17,35.7,14.622,17.689,57.1


## Search for duplicate entries

In [8]:
%%sql
SELECT timestamp, demand, COUNT(*) AS count
-- one output row per timestamp value that occurs on more than one row
FROM readings
GROUP BY timestamp
HAVING COUNT(*) > 1;

 * sqlite:///./database/eleclog_copy.db
Done.


timestamp,demand,count
2019-01-22 17:48:08,24.06,2
2019-03-13 23:21:05,14.64,2


## Search for multiple readings within single minute periods 

In [9]:
%%sql
SELECT substr(timestamp, 1, 16) AS timestamp, COUNT(*) AS count
-- substr is 1-indexed, so characters 1 to 16 are the date, hour and minute part
FROM readings
GROUP BY substr(timestamp, 1, 16)
HAVING COUNT(*) > 1;

 * sqlite:///./database/eleclog_copy.db
Done.


timestamp,count
2019-01-16 14:36,3
2019-01-17 21:26,2
2019-01-17 21:30,2
2019-01-18 12:35,2
2019-01-18 12:44,2
2019-01-18 12:48,2
2019-01-18 12:50,2
2019-01-18 12:57,2
2019-01-18 13:14,2
2019-01-18 13:27,2


Total number of readings that fall in a minute shared with at least one other reading

In [10]:
%%sql
SELECT sum(count)
FROM (
    SELECT substr(timestamp, 1, 16) AS timestamp, COUNT(*) AS count
    -- substr is 1-indexed, so characters 1 to 16 are the date, hour and minute part
    FROM readings
    GROUP BY substr(timestamp, 1, 16)
    HAVING COUNT(*) > 1
);

 * sqlite:///./database/eleclog_copy.db
Done.


sum(count)
109


count of minute periods with multiple entries

In [11]:
%%sql
SELECT count(*)
FROM (
    SELECT substr(timestamp, 1, 16) AS timestamp, COUNT(*) AS count
    -- substr is 1-indexed, so characters 1 to 16 are the date, hour and minute part
    FROM readings
    GROUP BY substr(timestamp, 1, 16)
    HAVING COUNT(*) > 1
);

 * sqlite:///./database/eleclog_copy.db
Done.


count(*)
51


In [12]:
%%sql
SELECT timestamp
FROM readings
-- keep a row when its minute prefix belongs to a minute that holds several readings
WHERE substr(timestamp, 1, 16) IN (
    SELECT substr(timestamp, 1, 16)
    FROM readings
    GROUP BY substr(timestamp, 1, 16)
    HAVING COUNT(*) > 1
);

 * sqlite:///./database/eleclog_copy.db
Done.


timestamp
2019-01-16 14:36:03
2019-01-16 14:36:12
2019-01-16 14:36:17
2019-01-17 21:26:17
2019-01-17 21:26:21
2019-01-17 21:30:13
2019-01-17 21:30:54
2019-01-18 12:35:06
2019-01-18 12:35:23
2019-01-18 12:44:09


get readings with row id numbers


In [13]:
%%sql
SELECT ROWID, substr(timestamp, 1, 16) AS datetime, substr(timestamp, 18, 2) AS seconds
-- the third argument of substr is a length, so 18, 2 picks the two seconds digits
FROM readings
WHERE substr(timestamp, 1, 16) IN (
    SELECT substr(timestamp, 1, 16)
    FROM readings
    GROUP BY substr(timestamp, 1, 16)
    HAVING COUNT(*) > 1
);

 * sqlite:///./database/eleclog_copy.db
Done.


rowid,datetime,seconds
7,2019-01-16 14:36,3
8,2019-01-16 14:36,12
9,2019-01-16 14:36,17
1842,2019-01-17 21:26,17
1843,2019-01-17 21:26,21
1845,2019-01-17 21:30,13
1846,2019-01-17 21:30,54
2744,2019-01-18 12:35,6
2745,2019-01-18 12:35,23
2753,2019-01-18 12:44,9


## obtain records for deletion

In [14]:
query ='''SELECT ROWID, substr(timestamp, 0, 17) as datetime, substr(timestamp, 18, 19) as seconds 
FROM readings 
WHERE substr(timestamp, 0, 17)
IN (SELECT timestamp 
    FROM (SELECT substr(timestamp, 0, 17) as timestamp, COUNT(*) c  
          FROM readings
          GROUP BY substr(timestamp, 0, 17)
          HAVING c > 1
         )
   )
;'''
problem_items = %sql $query

 * sqlite:///./database/eleclog_copy.db
Done.


In [15]:
 # Show the first problem row as (rowid, minute prefix, seconds) to check the result's shape.
 tuple(problem_items[0][column] for column in ('rowid', 'datetime', 'seconds'))

(7, '2019-01-16 14:36', '03')

In [16]:
# Keep the first reading encountered for each minute and mark every further
# reading in that minute for deletion. Tracking the minutes already seen in a
# set (rather than comparing with the previous row only) gives the same result
# even if rows belonging to the same minute are not adjacent in problem_items.
seen_minutes = set()
marked = []
for row in problem_items:
    minute = row['datetime']
    if minute in seen_minutes:
        marked.append(row['rowid'])
    else:
        seen_minutes.add(minute)

In [17]:
# Sanity check: 109 readings sit in multi-reading minutes and one is kept in
# each of the 51 such minutes, so 109 - 51 = 58 rows should be marked.
len(marked)

58

In [18]:
# Freeze the row ids as a tuple: its text form "(8, 9, ...)" reads as a
# parenthesised SQL list when interpolated into a query.
# NOTE(review): a one-element tuple would render as "(8,)", which is not valid SQL.
marked = tuple(marked)

In [19]:
%sql select rowid, timestamp from readings where rowid in $marked;

 * sqlite:///./database/eleclog_copy.db
Done.


rowid,timestamp
8,2019-01-16 14:36:12
9,2019-01-16 14:36:17
1843,2019-01-17 21:26:21
1846,2019-01-17 21:30:54
2745,2019-01-18 12:35:23
2754,2019-01-18 12:44:15
2758,2019-01-18 12:48:09
2760,2019-01-18 12:50:10
2767,2019-01-18 12:57:45
2783,2019-01-18 13:14:16


In [20]:
%sql delete from readings where rowid in $marked;

 * sqlite:///./database/eleclog_copy.db
58 rows affected.


[]

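eleclog_copy.db now holds at most one reading per minute: the 58 surplus readings (including the repeats of the two duplicated timestamps) have been removed, keeping the first-listed reading in each minute.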
