In [1]:
import psutil, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Memory Usage

Memory function that returns memory being used by a Python Process

In [2]:
def memory_footprint():
    '''Returns memory (in MB) being used by Python process'''
    mem = psutil.Process(os.getpid()).memory_info().rss
    return (mem / 1024 ** 2)

In [3]:
before = memory_footprint()

In [4]:
N = (1024 ** 2) // 8 # Number of floats that fill 1 MB
x = np.random.randn(50*N) # Random array filling 50 MB

In [5]:
after = memory_footprint()

In [6]:
print('Memory before: {} MB'.format(before))

Memory before: 97.7421875 MB


In [7]:
print('Memory after: {} MB'.format(after))

Memory after: 147.79296875 MB


In [8]:
before = memory_footprint()

In [9]:
x ** 2 # Computes, but doesn't bind results to a variable

array([8.21025134e-01, 1.16550989e-04, 2.43195604e-01, ...,
       7.39721027e-02, 8.14305838e-01, 1.96289492e-03])

In [10]:
after = memory_footprint()

In [11]:
print('Extra memory obtained: {} MB'.format(after - before))

Extra memory obtained: 50.1015625 MB


In [12]:
x.nbytes

52428800

In [13]:
x.nbytes // (1024 ** 2)

50

In [14]:
df = pd.DataFrame(x)

In [15]:
df.memory_usage(index=False)

0    52428800
dtype: int64

In [16]:
df.memory_usage(index=False) // (1024 ** 2)

0    50
dtype: int64

### Data in Chunks

In [25]:
dfs = []

In [26]:
for chunk in pd.read_csv('WDI_csv/WDIData.csv', chunksize=1000):
    is_urban = chunk['Indicator Name'] == 'Urban population (% of total)'
    is_AUS = chunk['Country Code'] == 'AUS'
    filtered = chunk.loc[is_AUS & is_urban]
    dfs.append(filtered)

In [27]:
print(len(dfs))

423


In [31]:
print(filtered)

Empty DataFrame
Columns: [Country Name, Country Code, Indicator Name, Indicator Code, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, Unnamed: 63]
Index: []

[0 rows x 64 columns]


In [29]:
df = pd.concat(dfs)

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1 entries, 92713 to 92713
Data columns (total 64 columns):
Country Name      1 non-null object
Country Code      1 non-null object
Indicator Name    1 non-null object
Indicator Code    1 non-null object
1960              1 non-null float64
1961              1 non-null float64
1962              1 non-null float64
1963              1 non-null float64
1964              1 non-null float64
1965              1 non-null float64
1966              1 non-null float64
1967              1 non-null float64
1968              1 non-null float64
1969              1 non-null float64
1970              1 non-null float64
1971              1 non-null float64
1972              1 non-null float64
1973              1 non-null float64
1974              1 non-null float64
1975              1 non-null float64
1976              1 non-null float64
1977              1 non-null float64
1978              1 non-null float64
1979              1 non-null float64
1980   

In [70]:
df.plot.line(x='Year', y='value')
plt.ylabel('% Urban population')
plt.show()

KeyError: 'Year'