# Using Pandas

Pandas is a powerful and easy-to-use library for data analysis. It has two main objects to represent data:

- Series
- DataFrame


# Import libraries

In [1]:
import math

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

# Working with Series

A Series is an array-like object.

In [3]:
# Build a Series from a plain Python list; no index is given, so pandas
# supplies the default one.
x = pd.Series(data=[1, 2, 3, 4, 5])
x

0    1
1    2
2    3
3    4
4    5
dtype: int64

Notice that pandas generated an index (the left-hand column, 0 to 4) for your items.

## Basic Operation

In [4]:
# The scalar is applied to every element; the result is a new Series.
x + 100


0    101
1    102
2    103
3    104
4    105
dtype: int64

In [5]:
# Operators compose element-wise: square each value, then add 100.
(x ** 2) + 100

0    101
1    104
2    109
3    116
4    125
dtype: int64

In [6]:
# Comparisons are element-wise too and produce a boolean Series.
x > 2

0    False
1    False
2     True
3     True
4     True
dtype: bool

## `any()` and `all()`

In [7]:
# Keep the boolean mask under a name so the next cells can reuse it.
larger_than_2 = x > 2
larger_than_2

0    False
1    False
2     True
3     True
4     True
dtype: bool

In [8]:
# True if at least one element of the mask is True.
larger_than_2.any()

True

In [9]:
# True only if every element of the mask is True.
larger_than_2.all()

False

## `apply()`

In [10]:
def f(x):
    """Return ``x * 2`` when ``x`` is even and ``x * 3`` when it is odd."""
    # A non-zero remainder (odd value) selects the factor 3.
    factor = 3 if x % 2 else 2
    return x * factor

# Call f once per element and collect the results in a new Series.
x.apply(f)

0     3
1     4
2     9
3     8
4    15
dtype: int64

**Avoid looping over your data**

These are `%%timeit` results comparing a `for` loop with `apply()`.

In [11]:
%%timeit
# Element-by-element version: every read and every write goes through
# Series indexing from a Python-level loop.
ds = pd.Series(range(10000))

for counter in range(len(ds)):
    ds[counter] = f(ds[counter])

1 loop, best of 3: 327 ms per loop


In [12]:
%%timeit

ds = pd.Series(range(10000))

# Same transformation as the loop version, expressed as a single apply() call.
ds = ds.apply(f)

10 loops, best of 3: 10.5 ms per loop


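## `copy()`

Plain assignment (`y = x`) does not copy a Series: both names refer to the same object, so a change made through `y` is also visible through `x`. Call `x.copy()` when you need an independent copy.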

In [13]:
# Binds a second name to the same Series object; no data is copied.
y = x

In [14]:
# Value stored under index label 0 before it is modified.
y[0]

1

In [15]:
# Write through the name y only.
y[0] = 100

In [16]:
# y shows the new value, as expected.
y

0    100
1      2
2      3
3      4
4      5
dtype: int64

In [17]:
# x shows the new value as well: x and y are the same object.
x

0    100
1      2
2      3
3      4
4      5
dtype: int64

# DataFrame

In [18]:
# One-column frame: every list element becomes a row of column "x".
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
df = pd.DataFrame({"x": data})

In [19]:
# Display the frame: the index first, then the single column "x".
df

Unnamed: 0,x
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9


## Selecting Data

In [20]:
# Selecting a single column by label returns it as a Series named "x".
df["x"]

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
Name: x, dtype: int64

In [21]:
# Select column "x" first, then the entry with index label 0 within it.
df["x"][0]

1

## Adding extra columns

In [22]:
# Assigning to a new label adds a column; the right-hand side is computed
# element-wise from the existing column.
df["x_plus_2"] = df["x"] + 2
df

Unnamed: 0,x,x_plus_2
0,1,3
1,2,4
2,3,5
3,4,6
4,5,7
5,6,8
6,7,9
7,8,10
8,9,11


In [23]:
# The square is plain element-wise arithmetic.
df["x_square"] = df["x"] ** 2
# Factorial has no element-wise operator, so it is applied value by value.
# `np.math` was only an undocumented alias of the standard-library `math`
# module and is no longer available in current NumPy releases, so
# `math.factorial` is called directly.
df["x_factorial"] = df["x"].apply(math.factorial)
df

Unnamed: 0,x,x_plus_2,x_square,x_factorial
0,1,3,1,1
1,2,4,4,2
2,3,5,9,6
3,4,6,16,24
4,5,7,25,120
5,6,8,36,720
6,7,9,49,5040
7,8,10,64,40320
8,9,11,81,362880


In [None]:
# NOTE: despite its name, this column holds the remainder x % 2,
# i.e. 1 for odd values and 0 for even ones.
df["is_even"] = df["x"] % 2
df

### `map()`

In [None]:
# Translate each remainder into a label through a lookup dict
# (1 -> "odd", 0 -> "even").
df["odd_even"] = df["is_even"].map({1:"odd", 0:"even"})
df

### `drop()`

In [None]:
# Remove the helper column now that "odd_even" carries the label.
# The axis is named explicitly through `columns=`: passing it as a bare
# positional `1` is rejected by pandas 2.x.
df = df.drop(columns="is_even")
df

## Multi Column Select

In [None]:
# A list of labels inside [] selects several columns at once.
df[["x", "odd_even"]]

## Controlling display options

In [None]:
# Cap how many columns and rows are printed, and switch the notebook
# representation of frames from HTML tables to plain text.
pd.set_option("display.max_columns", 60)
pd.set_option("display.max_rows", 6)
pd.set_option("display.notebook_repr_html", False)
# Show the frame again, now rendered with the display options set above.
df

## Filtering

In [None]:
# A boolean Series inside [] keeps only the rows where it is True.
df[df["odd_even"] == "odd"]

In [None]:
# Attribute access (df.odd_even) is shorthand for df["odd_even"].
df[df.odd_even == "even"]

### Chaining Filters

#### `|` OR

In [None]:
# Each condition needs its own parentheses: `|` binds more tightly than
# `==` and `<`.
df[(df.odd_even == "even") | (df.x_square < 20)]

#### `&` AND

In [None]:
# Each condition needs its own parentheses: `&` binds more tightly than
# `==` and `<`.
df[(df.odd_even == "even") & (df.x_square < 20)]

# `scatter_matrix()`

In [None]:
# Pairwise scatter plots of the numeric columns, with a kernel density
# estimate of each column on the diagonal.
# The function lives in `pd.plotting`; the old top-level `pd.scatter_matrix`
# alias has been removed from pandas. The trailing `;` suppresses the
# array-of-axes repr.
pd.plotting.scatter_matrix(df, diagonal="kde", figsize=(10, 10));

In [None]:
# Summary statistics for the frame's columns.
df.describe()

# Reading Data from CSV/TSV Files

In [None]:
# Alternative source, currently disabled: read the CSV straight from a URL.
# NOTE(review): confirm the endpoint still responds before re-enabling.
#url = "http://www.google.com/finance/historical?q=TADAWUL:TASI&output=csv"
#stocks_data = pd.read_csv(url)

# Active source: a CSV file stored next to the notebook.
stocks_data = pd.read_csv('./Datasets/stocks.csv')

In [None]:
# Inspect the loaded frame (output is shortened by the max_rows option set earlier).
stocks_data

In [None]:
# Absolute move over the session: closing price minus opening price.
stocks_data["change_amount"] = stocks_data["Close"] - stocks_data["Open"]
# Relative move, measured against the starting point (the opening price).
# Dividing by "Close" instead would express the move relative to the value
# it ended at, which understates gains and overstates losses.
# The result is a fraction (0.01 == 1 %), not a value scaled by 100.
stocks_data["change_percentage"] = stocks_data["change_amount"] / stocks_data["Open"]
stocks_data