
# Data Sampling
This notebook draws a random sample of rows from a large CSV file without loading the entire file into memory.

In [None]:
from google.colab import drive
import csv
import sys
import pandas as pd
import random

from IPython.display import clear_output #clear print during loops

In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Path of the source CSV on the mounted Drive; the sample is drawn from this file
filename = '/content/drive/MyDrive/data/guardian_articles.csv' # large dataset

In [None]:
# Raise the csv parser's maximum field size; the call returns the limit that was
# in force before the change, which is what the cell displays.
csv.field_size_limit(sys.maxsize) #avoids _csv.Error: field larger than field limit (131072)

131072

In [None]:
# First pass over the CSV: show the header row, then count the remaining rows.
# newline='' hands newline handling to the csv module, which is what the csv
# documentation asks for so that line breaks inside quoted fields parse correctly.
with open(filename, 'r', newline='') as file:

    # Create a CSV reader object
    reader = csv.reader(file)

    # Peek at the first row to understand the headers (None if the file is empty)
    header = next(reader, None)
    if header is not None:
        for column in header:
            # Each header cell is printed as a list; a cell that itself
            # contains ", " is broken into its parts.
            print(column.split(", "))

    # The header has already been consumed, so this counts data rows only
    row_count = sum(1 for _ in reader) #may take some time with big files

['article_id']
['sectionName']
['webTitle']
['webUrl']
['bodyContent']
['webPublicationDate']
['id']


In [None]:
# Report how many rows the counting pass found
print(f"{row_count} rows found in the file.")

149839 rows found in the file.


In [None]:
# Fraction of the counted rows to sample (0.1 means 10%)
sample_size_perc = 0.1 # 10%

In [None]:
# Number of rows to draw: the chosen fraction of the row count, truncated to an int
sample_size = int(sample_size_perc * row_count)
print(f"{sample_size} rows will be sampled")

14983 rows will be sampled


In [None]:
# Start from an empty frame that only declares the column names;
# the sampled rows are added to it later.
column_names = [
    'article_id',
    'sectionName',
    'webTitle',
    'webUrl',
    'bodyContent',
    'webPublicationDate',
    'id',
]
df = pd.DataFrame(columns=column_names)
df.head()

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id


In [None]:
# Draw `sample_size` distinct row ids from 1..row_count (inclusive).
# random.sample picks without replacement, so no duplicate check is needed
# (re-drawing on collision with a list membership test is quadratic in the
# sample size), and it raises ValueError instead of looping forever if more
# ids are requested than rows exist.
# NOTE(review): no random seed is set in the cells shown, so each run draws a
# different sample — seed `random` first if the sample must be reproducible.
sample_ids = random.sample(range(1, row_count + 1), sample_size)
sample_rows = sorted(sample_ids)  # ascending, i.e. in the order rows appear in the file
print(str(len(sample_rows)) + " row ids to sample.")

14983 row ids to sample.


In [None]:
# Check some of the sample row ids; slicing (rather than indexing 0..9)
# also works when fewer than 10 ids were sampled.
for row_id in sample_rows[:10]:
  print(row_id)

8
23
26
45
59
79
95
152
160
171


In [None]:
# Open the CSV file again in read-only mode, this time to collect the sampled rows
wanted_ids = set(sample_rows)             # set lookup per row instead of scanning a list
progress_step = max(1, row_count // 100)  # refresh the progress line roughly once per 1%
sampled_records = []                      # sampled rows; turned into a frame once at the end

with open(filename, 'r', newline='') as file:

    # Create a CSV reader object
    reader = csv.reader(file)

    # Skip the header row so that data rows are numbered 1..row_count —
    # the same range the sample ids were drawn from. (Numbering from 0 with
    # the header included would leave the last data row unreachable.)
    next(reader, None)

    for i, row in enumerate(reader, start=1):
        if i > row_count:
            break  # ignore anything beyond the rows counted earlier
        if i in wanted_ids:
            sampled_records.append(row)
        # Clearing and printing on every row is very slow; update periodically
        # and once more on the final row so the display ends at 100%.
        if i % progress_step == 0 or i == row_count:
            clear_output(wait=True)
            print(str(round((i/row_count)*100, 2)) + "% ("+ str(i) + "/" + str(row_count) + ") complete") #may take some time

# Build the frame in a single step (much cheaper than enlarging it one row at a
# time, and re-running the cell no longer appends duplicates), keeping the
# column names already declared on df.
df = pd.DataFrame(sampled_records, columns=df.columns)

100.0% (149838/149839) complete


In [None]:
# Number of rows collected into the sample frame
len(df)

14983

In [None]:
# Preview the first few sampled rows
df.head()

Unnamed: 0,article_id,sectionName,webTitle,webUrl,bodyContent,webPublicationDate,id
0,football/2016/jan/31/saido-berahino-west-brom-...,Football,Saido Berahino has right attitude but he’s not...,https://www.theguardian.com/football/2016/jan/...,Tony Pulis hopes his only transfer business be...,2016-01-31T22:30:10Z,8
1,sport/2016/jan/31/angelique-kerber-serena-will...,Sport,Angelique Kerber now aims to dislodge Serena W...,https://www.theguardian.com/sport/2016/jan/31/...,Gone midnight and Angelique Kerber was conduct...,2016-01-31T21:59:09Z,23
2,world/2016/feb/01/the-australian-family-who-ha...,World news,The family building a refugee haven in the sha...,https://www.theguardian.com/world/2016/feb/01/...,On 9 June 2014 Queenslander and charity worker...,2016-01-31T21:27:40Z,26
3,sport/2016/jan/31/worcester-exeter-premiership...,Sport,Exeter keep Saracens in their sights with bonu...,https://www.theguardian.com/sport/2016/jan/31/...,There is no need for calculators this week but...,2016-01-31T19:12:58Z,45
4,artanddesign/2016/jan/31/exposed-photographys-...,Art and design,Exposed: photography's fabulous fakes,https://www.theguardian.com/artanddesign/2016/...,"In 1840, Hippolyte Bayard, a pioneer of early ...",2016-01-31T18:00:03Z,59


In [None]:
# List the columns currently held in df
df.columns

Index(['webTitle', 'bodyContent', 'sectionName'], dtype='object')

In [None]:
#drop the columns we don't need for our analysis/modelling
# errors='ignore' skips any listed column that is already gone, so re-running
# this cell no longer raises a KeyError.
df = df.drop(columns=['article_id', 'webUrl', 'webPublicationDate', 'id'], errors='ignore')

KeyError: ignored

In [None]:
# Confirm which columns remain in df
df.columns

Index(['webTitle', 'bodyContent', 'sectionName'], dtype='object')

In [None]:
# Select the columns in the order wanted for modelling (our features X and our target y)
ordered_columns = ['webTitle', 'bodyContent', 'sectionName']
df = df[ordered_columns]

In [None]:
# Confirm the column order after the selection above
df.columns

Index(['webTitle', 'bodyContent', 'sectionName'], dtype='object')

In [None]:
#check missing data
# Fields read through csv.reader arrive as strings, so an empty field is ''
# rather than NaN and isnull() on its own would never flag it; count both.
(df.isnull() | (df == '')).sum()

webTitle       0
bodyContent    0
sectionName    0
dtype: int64

In [None]:
# Spot-check one sampled article: the title stored under row label 50
df.loc[50, 'webTitle']

'How do I ... know if I have a mental illness?'

In [None]:
# Spot-check the body text stored under row label 50
df.loc[50, 'bodyContent']

'Mental health problems affect one in four of us at any one time. Though accurate figures can be difficult to obtain, it is estimated that 450 million people worldwide have a mental health problem. What is mental illness? There are more than 200 clinically diagnosable mental health conditions, very roughly organised into five major categories. These are: mood disorders, anxiety disorders, schizophrenia and psychotic disorders, eating disorders and dementia. Depression is the most common mental illness. The World Health Organisation estimates that by 2020 depression will be the second leading cause of disability globally, after heart disease. Other common mental illnesses include: general anxiety disorder, bipolar disorder, schizophrenia and anorexia. Lesser known, but just as debilitating, conditions include trichotillomania (a compulsion to pull out one’s hair) and pica (the eating of non-edible items). Different conditions are more prevalent in different parts of the globe. Obsessive

In [None]:
# Spot-check the section name stored under row label 50
df.loc[50, 'sectionName']

'UK news'

In [None]:
# Name the output after the source file (minus its last 4 characters, i.e. ".csv")
# followed by the sampling percentage as a whole number
percent_label = str(int(round(sample_size_perc * 100, 0)))
savefilename = f"{filename[:-4]}_{percent_label}_perc.csv"
print(savefilename)

/content/drive/MyDrive/data/guardian_articles_10_perc.csv


In [None]:
#save sample to csv file
# index=False keeps the DataFrame's row index out of the written file
df.to_csv(savefilename, sep=',', encoding='utf-8', index=False)