In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

* Pandas has two important data structures: **Series** and **DataFrame**

### Series

**Series** is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.)

```python
>>> s = pd.Series(data, index=index)
```

<img src="images/fig_pd_series.png" alt="Pandas Series Example" height="250" width="250" align = "left">

In [None]:
# Heights in feet, keyed by person label (dict insertion order becomes the index order).
person_height_ft = pd.Series(
    {
        'person_a': 5.5,
        'person_b': 5.2,
        'person_c': 5.8,
        'person_d': 6.1,
        'person_e': 4.8,
    },
    name='height',
    dtype=np.float64,
)
person_height_ft

In [None]:
person_height_ft.values

In [None]:
person_height_ft.index

* A Series is like a fixed-size dict in that you can get and set values by index label

In [None]:
person_height_ft['person_c']

* You can also use the index position to get and set the values

In [None]:
# Positional access goes through .iloc. The index holds string labels, and
# recent pandas treats an integer key inside plain [] as a label lookup
# (the positional fallback is deprecated), so [3] is not reliable here.
person_height_ft.iloc[3]

In [None]:
# First three entries, selected by position.
person_height_ft.iloc[:3]

####  Vectorized operations and label alignment with Series

In [None]:
# feet -> centimetres (12 inches * 2.54 cm) -> metres, applied to every element at once.
person_height_mtr = person_height_ft.mul(12 * 2.54).div(100)
person_height_mtr

In [None]:
# Note: the labels here are deliberately in a different order than in the height Series.
person_weight_kg = pd.Series(
    {
        'person_b': 70,
        'person_d': 55,
        'person_e': 73,
        'person_c': 68,
        'person_a': 66,
    },
    name='weight',
    dtype=np.float64,
)
person_weight_kg

In [None]:
# BMI = weight (kg) / height (m) squared; the two Series are matched up by index label.
bmi = person_weight_kg.div(person_height_mtr.pow(2))
bmi

In [None]:
# Cross-check the BMI of person_c by looking both inputs up by label.
# Using the Series instead of hand-copied numbers keeps this check valid
# if the sample data above is edited.
person_weight_kg['person_c'] / person_height_mtr['person_c'] ** 2

### DataFrame

* DataFrame is a 2-dimensional labeled data structure with columns of potentially different types. You can think of it like a spreadsheet or SQL table, or a dict of Series objects. It is generally the most commonly used pandas object. 

<img src="images/fig_pd_data_frame.png" alt="Pandas DataFrame Example" height="400" width="400" align = "left">

In [None]:
# Row labels, reused later when new columns are added.
index = ['person_a', 'person_b', 'person_c', 'person_d', 'person_e']

# One dict entry per column; every list has one value per person.
df_person = pd.DataFrame(
    data={
        'height': [5.5, 5.2, 5.8, 6.1, 4.8],
        'weight': [66, 70, 68, 55, 73],
        'gender': ['male', 'male', 'female', 'male', 'female'],
    },
    index=index,
)

In [None]:
df_person.describe(include='all')

In [None]:
# Print the basic structural attributes of the frame, then its row count.
for attribute in ('ndim', 'shape', 'dtypes', 'columns', 'index'):
    print(getattr(df_person, attribute))
print(len(df_person))

In [None]:
df_person.info()

In [None]:
df_person.reset_index()

In [None]:
df_person.reset_index().set_index('gender')

In [None]:
# Move the person labels into a regular column, then attach a new set of row labels.
another_index = list('abcde')
df1 = df_person.reset_index().set_axis(another_index, axis=0)
df1

### Accessing Data

In [None]:
# Accessing the column data
height = df_person['height']
height

In [None]:
type(height)

In [None]:
# Accessing multiple columns
df_person[['height','weight']]

In [None]:
# Accessing one individual cell.
# A single .loc[row_label, column_label] lookup replaces chained indexing
# (df[col][row]), which does two separate lookups and is unsafe for assignment.
df_person.loc['person_a', 'height']

In [None]:
# Accessing rows by index keys
df_person.loc['person_a']

In [None]:
# Accessing multiple rows using a label range.
# .loc makes it explicit that rows are being sliced by label; unlike positional
# slices, a label slice includes both end points.
df_person.loc['person_a':'person_c']

In [None]:
# Positional access with .iloc: the whole first row, a separator, then one cell
# (first row, third column).
first_row = df_person.iloc[0]
print(first_row, "--------------", df_person.iloc[0, 2], sep="\n")

In [None]:
# Boolean indexing:
# keep only the persons whose height is above 5.2 feet.
df_person.loc[df_person['height'].gt(5.2)]

In [None]:
# Boolean indexing with two conditions combined by &:
# height above 5.2 feet and weight above 60 kg.
df_person.loc[df_person['height'].gt(5.2) & df_person['weight'].gt(60)]

In [None]:
df_person

In [None]:
# Add a new "age" column; the values are paired with the existing row labels.
df_person['age'] = pd.Series(dict(zip(index, [30, 28, 26, 19, 42])))

In [None]:
df_person

In [None]:
# Find all persons with age > 28

In [None]:
# Find females with age > 28

In [None]:
# Row(s) whose age equals the maximum age in the frame.
df_person.loc[df_person['age'].eq(df_person['age'].max())]

In [None]:
# Find max aged male person

In [None]:
# Find all persons having height > average height of the group

### Handling missing values

In [None]:
df_backup = df_person.copy()

In [None]:
df_person = df_backup.copy()

In [None]:
# Let's introduce a few NaN values.
# Integer columns cannot hold NaN, so cast them to float up front instead of
# relying on silent upcasting during assignment (deprecated in recent pandas).
df_person = df_person.astype({'weight': 'float64', 'age': 'float64'})

# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df_person.loc['person_a', 'age'] = np.nan
# Address the weight cell by label. The positional form .iloc[2, 2] depends on
# column order and lands on 'gender' when the dict order of the constructor is kept.
df_person.loc['person_c', 'weight'] = np.nan
df_person.loc['person_e', 'height'] = np.nan
df_person.loc['person_f'] = np.nan   # new row that is entirely NaN
df_person['married'] = np.nan        # new column that is entirely NaN

In [None]:
df_person

In [None]:
# how takes 'all' or 'any'
# dropping all of the rows if all of the values are np.NaN
df_person.dropna(how='all')

In [None]:
# how takes 'all' or 'any'
# dropping all of the columns if all of the values are np.NaN
df_person.dropna(axis=1,how='all')

In [None]:
# Filling all of the NaN values with zero
df_person.fillna(0)

In [None]:
# Replace missing weights with the group's average weight.
# The result is a new Series; df_person itself is not modified.
mean_weight = df_person['weight'].mean()
df_person['weight'].fillna(mean_weight)

In [None]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
df_person.ffill()

In [None]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# bfill() replaces fillna(method='bfill'), whose `method` argument is deprecated.
df_person.bfill()

In [None]:
df_person[df_person['weight'].notnull()]

In [None]:
# Keep only the rows where 'age' is present.
df_person.loc[df_person['age'].notnull()]

In [None]:
# Drop rows that are entirely NaN, then columns that are entirely NaN.
df_person = (
    df_person
    .dropna(axis=0, how='all')
    .dropna(axis=1, how='all')
)
df_person

In [None]:
# Fill the remaining gaps with the previous row's value in each column.
# ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated.
df_person = df_person.ffill()
df_person

In [None]:
# BMI = weight (kg) / height (m) squared, with height converted from feet
# (12 inches * 2.54 cm per foot, then / 100 for metres).
df_person['bmi'] = df_person['weight'].div(
    df_person['height'].mul(12 * 2.54).div(100).pow(2)
)
df_person

### GroupBy function

* Splitting the data into groups based on some criteria
* Applying a function to each group independently
* Combining the results into a data structure

<img src="images/fig_pd_groupby.jpg" alt="Pandas GroupBy" height="350" width="350" align="left">

In [None]:
df_person

In [None]:
df_person_grp = df_person.groupby('gender')
print(type(df_person_grp))

In [None]:
# Iterating a GroupBy yields (group key, sub-frame) pairs.
for gender, members in df_person_grp:
    print(gender, members)
    print("--------------------------------")

In [None]:
df_person_grp.mean()

In [None]:
df_person_grp.mean().plot(kind='bar')

### Working with Text Data

In [None]:
# Row-wise apply (axis=1): the lambda receives one row at a time and returns
# the first character of that row's upper-cased 'gender' value.
df_person.apply(lambda x: x['gender'].upper()[0], axis=1)

In [None]:
df_person.columns

In [None]:
df_person[['weight','height']].apply(lambda x: x.dtype)

In [None]:
# Column means. The frame also holds the string column 'gender', and
# pandas >= 2.0 raises TypeError for it instead of skipping it silently,
# so restrict the reduction to numeric columns.
df_person.mean(numeric_only=True)

In [None]:
df_person['gender'].str.upper().str[0]

### Working with Dates and TimeSeries Data

In [None]:
# Seed NumPy's global random generator so the sample prices are the same on every run.
np.random.seed(5)
random_prices = np.random.randint(100, high=150, size=150)
business_days = pd.date_range('2000-1-1', periods=150, freq='B')
price = pd.Series(random_prices, index=business_days, name='col1')

In [None]:
price.head()

In [None]:
# Monthly maximum price.
# pd.TimeGrouper was removed from pandas, so group on the calendar month of each
# timestamp instead. to_period('M') also sidesteps the 'M' vs 'ME' offset-alias
# difference between pandas versions.
price.groupby(price.index.to_period('M')).max().plot(ylim=(146,150))

In [None]:
# Second sample series: one value per calendar day, reproducible via the same seed.
np.random.seed(5)
all_days = pd.date_range('2000-1-1', periods=500, freq='D')
price1 = pd.Series(
    np.random.randint(200, high=250, size=500),
    index=all_days,
    name='col2',
)

In [None]:
df_time = pd.DataFrame({'col1':price,'col2':price1})

In [None]:
df_time.head()

In [None]:
len(df_time)

In [None]:
df_time.dtypes

In [None]:
df_time.plot(figsize=(16,8))

In [None]:
# Monthly mean of both columns.
# pd.TimeGrouper was removed from pandas, so group on the calendar month of each
# timestamp instead. to_period('M') also sidesteps the 'M' vs 'ME' offset-alias
# difference between pandas versions.
df_time.groupby(df_time.index.to_period('M')).mean().plot()

In [None]:
pd.Categorical?

In [None]:
pd.CategoricalIndex?

In [None]:
# NOTE(review): no cell visible in this notebook assigns `df`; on a fresh kernel
# this would raise NameError. Possibly `df_time` was meant — confirm.
df

In [None]:
# NOTE(review): the visible cells only give df_time the columns 'col1' and 'col2';
# a 'label' column is never created, so this lookup presumably fails — confirm.
x = pd.Categorical(df_time['label'],ordered=True)

In [None]:
# NOTE(review): `df` is not defined in the visible cells. Also, `x` (built with
# pd.Categorical in the previous cell) is passed as `dtype`; presumably
# `x.dtype` was intended — confirm.
cat = pd.Series(df['label'], dtype=x)

In [None]:
cat

In [None]:
pd.merge?

In [None]:
pd.concat?

In [None]:
pd.Timestamp?

In [None]:
# Dashed green line with blue-filled circular markers at each data point.
x_points = [1, 2, 3, 3.5, 4.0]
y_points = [1, 2, 3, 3.2, 3.8]
plt.plot(
    x_points,
    y_points,
    color='green',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=8,
)