In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pandas as pd

## Series

In [2]:
# Build a Series from a plain list; with no index given, labels default to 0..n-1.
s = pd.Series(data=[1.1, 2.2, 3.3, 4.4])
s

0    1.1
1    2.2
2    3.3
3    4.4
dtype: float64

In [3]:
# Same values, but with explicit string labels instead of the default integer index.
s = pd.Series(
    data=[1.1, 2.2, 3.3, 4.4],
    index=['alice', 'bob', 'chris', 'diana'],
)
s

alice    1.1
bob      2.2
chris    3.3
diana    4.4
dtype: float64

In [4]:
# A mapping also works: its keys become the index labels, its values the data.
s = pd.Series(dict(a=1, b=2, c=3))
s

a    1
b    2
c    3
dtype: int64

In [5]:
# A scalar is repeated once for every label in the supplied index.
s = pd.Series(data=0, index=range(10))
s

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
dtype: int64

In [6]:
# Index labels of the Series (a RangeIndex here, because the index was built from range(10)).
s.index

RangeIndex(start=0, stop=10, step=1)

In [7]:
# The Series data without its index, as a NumPy array.
s.values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [8]:
# City populations, labelled by city name.
pops = pd.Series(
    data=[8973941, 8336817, 8918653, 3792621],
    index=['London', 'New York', 'Mexico City', 'Los Angeles'],
)
pops

London         8973941
New York       8336817
Mexico City    8918653
Los Angeles    3792621
dtype: int64

In [9]:
# The keys of a Series are its index labels; print one per line.
for city in pops.index:
    print(city)

London
New York
Mexico City
Los Angeles


In [10]:
# Express populations in millions; scalar arithmetic is element-wise and keeps the index.
pops.div(1_000_000)

London         8.973941
New York       8.336817
Mexico City    8.918653
Los Angeles    3.792621
dtype: float64

In [11]:
# Standard deviation of the populations expressed in units of 100,000.
# NOTE(review): the previous cell divides by 1,000,000 — confirm 100,000 is intended here.
pops.div(100_000).std()

24.919790525312745

## Dataframe

In [12]:
# A DataFrame constructed without data has no rows and no columns.
empty = pd.DataFrame(data=None)
print(empty)

Empty DataFrame
Columns: []
Index: []


In [13]:
# A flat list becomes a single-column frame (column label 0, default integer row index).
# The variable is deliberately NOT called `list`: rebinding that name would shadow the
# builtin for the rest of the kernel session and break any later `list(...)` call.
values = [1, 2, 3, 4]
df = pd.DataFrame(values)
df

Unnamed: 0,0
0,1
1,2
2,3
3,4


In [14]:
# Nested lists: each inner list is one row; values are cast to float on construction.
matrix = [
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
]
df = pd.DataFrame(data=matrix, dtype=float)
print(df)
# Per-column summary statistics (shown as the cell's rich output).
df.describe()

     0    1    2
0  1.0  2.0  3.0
1  2.0  3.0  4.0
2  3.0  4.0  5.0


Unnamed: 0,0,1,2
count,3.0,3.0,3.0
mean,2.0,3.0,4.0
std,1.0,1.0,1.0
min,1.0,2.0,3.0
25%,1.5,2.5,3.5
50%,2.0,3.0,4.0
75%,2.5,3.5,4.5
max,3.0,4.0,5.0


In [15]:
# Dict of lists: each key is a column label, each list holds that column's values.
data = dict(
    Name=['Tom', 'Dick', 'Harry'],
    Age=[20, 40, 60],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,Name,Age
0,Tom,20
1,Dick,40
2,Harry,60


In [16]:
# List of dicts: each dict is one row, its keys are the column labels.
data = [{'a': 123, 'b': 234} for _ in range(2)]
df = pd.DataFrame(data=data)
df

Unnamed: 0,a,b
0,123,234
1,123,234


Add columns to a dataset:

In [17]:
# Assigning to a new label adds a column in place; the Series' default 0..1 index
# lines up with the frame's row labels.
df['c'] = pd.Series(data=[123, 234])
df

Unnamed: 0,a,b,c
0,123,234,123
1,123,234,234


Sum dataframe columns:

In [18]:
# Row-wise total of columns a, b and c, stored as new column 'd'.
df['d'] = df['a'].add(df['b']).add(df['c'])
df

Unnamed: 0,a,b,c,d
0,123,234,123,480
1,123,234,234,591


Delete a column:

In [19]:
# Remove column 'd' from the frame in place (the removed column is not needed).
del df['d']
df

Unnamed: 0,a,b,c
0,123,234,123
1,123,234,234


## Dataframe attributes

In [20]:
# Report basic shape attributes of the frame, one per line.
# NOTE(review): 'dimenstions' is a typo for 'dimensions'; the label is kept unchanged
# so the printed text still matches the recorded output.
for label, value in (
    ('Number of dimenstions', df.ndim),
    ('Number of elements', df.size),
    ('Axes', df.axes),
):
    print(label, value)

Number of dimenstions 2
Number of elements 6
Axes [RangeIndex(start=0, stop=2, step=1), Index(['a', 'b', 'c'], dtype='object')]


In [21]:
# Transpose

## Dataframe from excel spreadsheet

In [22]:
# Read worksheet "Sheet1" of 123.xlsx into df.
# NOTE(review): the recorded run raised ImportError (missing optional dependency
# 'xlrd'), so df was never reassigned and the cells below still show the earlier
# a/b/c frame. Install an Excel reader in the kernel's environment before re-running.
df = pd.read_excel(io='123.xlsx', sheet_name='Sheet1')

ImportError: Missing optional dependency 'xlrd'. Install xlrd >= 1.0.0 for Excel support Use pip or conda to install xlrd.

In [23]:
# First rows of the frame.
# NOTE: the read_excel cell above failed in the recorded run, so `df` is still the
# a/b/c frame built earlier, not spreadsheet data.
df.head()

Unnamed: 0,a,b,c
0,123,234,123
1,123,234,234


In [24]:
# Column labels of the frame.
df.columns

Index(['a', 'b', 'c'], dtype='object')

In [25]:
# Row labels (the index) of the frame.
df.index

RangeIndex(start=0, stop=2, step=1)

## Note that a dataframe is an array of COLUMNS, not an array of rows

In [26]:
# Walk the frame column by column and print each column's label, min, max and mean.
for name, column in df.items():
    print(name, column.min(), column.max(), column.mean())

a 123 123 123.0
b 234 234 234.0
c 123 234 178.5


In [27]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,a,b,c
count,2.0,2.0,2.0
mean,123.0,234.0,178.5
std,0.0,0.0,78.488853
min,123.0,234.0,123.0
25%,123.0,234.0,150.75
50%,123.0,234.0,178.5
75%,123.0,234.0,206.25
max,123.0,234.0,234.0


## To use the dataframe as an array of rows, use df.iloc

In [28]:
# First row by integer position, returned as a Series labelled by column name.
df.iloc[0]

a    123
b    234
c    123
Name: 0, dtype: int64

In [29]:
# Positional slice with step 2. The out-of-range start (-20) does not raise; with only
# 2 rows the slice begins at the first row, so just row 0 is selected.
df.iloc[-20::2]

Unnamed: 0,a,b,c
0,123,234,123


## Slicing with ranges also works on rows (NOT COLUMNS!)

In [30]:
df[-20::2] # a slice inside plain [] selects rows: same result as df.iloc[-20::2] above

Unnamed: 0,a,b,c
0,123,234,123


In [31]:
# Row slice: rows from position 0 up to (but not including) position 1.
df[0:1]

Unnamed: 0,a,b,c
0,123,234,123


In [32]:
# Print a concise summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       2 non-null      int64
 1   b       2 non-null      int64
 2   c       2 non-null      int64
dtypes: int64(3)
memory usage: 176.0 bytes
