## Pandas Data Frames

- Applying functions, working with NumPy functions
- Concatenating & Merging
- Dealing with Missing values
- Dummy Variables
- I/O operations

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from IPython.display import Image
from IPython.display import HTML
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the cell output."""
    rendered = Markdown(string)
    display(rendered)


In [None]:
from IPython.display import display, HTML

# Custom notebook CSS: centre the output container and cap the output
# area at 80% of the available width.
CSS = """
.output {
    align-items: center;
}
div.output_area {
    width: 80%;
}
"""
# Wrap the rules in a <style> tag. As the last expression of the cell the
# HTML object is rendered, which is what applies the stylesheet.
style_tag = f'<style>{CSS}</style>'
HTML(style_tag)

In [None]:
# One tuple per student, in the column order given by `student_columns`.
student_columns = ('name', 'city', 'age', 'py-score', 'js-score')
student_rows = [
    ('Xavier', 'Mexico City', 41, 88.0, 71.0),
    ('Ann', 'Toronto', 28, 79.0, 95.0),
    ('Jana', 'Prague', 33, 81.0, 88.0),
    ('Yi', 'Shanghai', 34, 80.0, 79.0),
    ('Robin', 'Manchester', 38, 68.0, 91.0),
    ('Amal', 'Cairo', 31, 61.0, 91.0),
    ('Nori', 'Osaka', 37, 84.0, 80.0),
]

# Transpose the rows into the column-oriented dict that pd.DataFrame expects.
data = {
    column: list(values)
    for column, values in zip(student_columns, zip(*student_rows))
}

students_df = pd.DataFrame(data=data)
students_df

### Applying functions

### Applying Arithmetic functions

In [None]:
# Display the frame before any derived columns are added.
students_df

In [None]:
# Feature engineering: add a computed 'total' column.
# Weighted mean of the two scores (weights 0.4 and 0.3), normalised by the
# sum of the weights (0.7).
weighted_sum = 0.4 * students_df['py-score'] + 0.3 * students_df['js-score']
students_df['total'] = weighted_sum / 0.7
students_df


In [None]:
# Positional selection: all rows, columns at positions 3 and 4
# ('py-score' and 'js-score' in the frame built above).
students_df.iloc[:, 3:5]

### Using an existing NumPy function instead of writing your own

In [None]:
# Row-wise (axis=1) weighted average of the two score columns.
# np.average divides by the sum of the weights, so this matches the manual
# (0.4 * py + 0.3 * js) / 0.7 formula used for 'total'.
np.average(students_df.iloc[:, 3:5], axis=1,
                         weights=[0.4, 0.3])

In [None]:
# Store the NumPy weighted average as the 'total' column.
# Positions 3:5 select 'py-score' and 'js-score'; axis=1 averages per row.
score_columns = students_df.iloc[:, 3:5]
students_df['total'] = np.average(score_columns, axis=1, weights=[0.4, 0.3])

In [None]:
# Single-bracket selection returns the column as a Series.
students_df['py-score']

### Apply function

In [None]:
# Double brackets keep a one-column DataFrame; DataFrame.apply then passes
# each column to the lambda, which scales it by 10. The result is only
# displayed - students_df itself is not modified.
students_df[["py-score"]].apply(lambda x :  x* 10)

In [None]:
# Apply the NumPy ufunc np.sqrt to both score columns; the result is
# displayed only, the original frame is left unchanged.
students_df[["py-score","js-score"]].apply(np.sqrt)

In [None]:
# Confirm that the apply() calls above did not alter students_df.
students_df

In [None]:
# Assigning a scalar sets every row of the column to that value.
# NOTE: this overwrites whatever 'total' held before with a constant 50.
students_df['total'] = 50
students_df

## Map function with Lambda

In [None]:
# Add 10 points to every py-score with Series.map, which applies the lambda
# element-wise and returns a Series that keeps the original index (no need
# to round-trip through the builtin map() and a plain Python list).
# NOTE: the column is overwritten in place, so re-running this cell adds
# another 10 points each time.
students_df["py-score"] = students_df["py-score"].map(lambda score: score + 10)
students_df

In [None]:
# Same +10 adjustment, this time via Series.apply, which calls the lambda on
# each element.
# NOTE: the column is overwritten in place, so every execution of this cell
# adds a further 10 points to 'py-score'.
students_df["py-score"]= students_df["py-score"].apply(lambda x: x+10)
students_df

### Concatenating
- Concatenate multiple data frames along the same axis.
- pd.concat()

In [None]:
# Show the concat illustration; the path is relative, so the file is
# presumably expected under res/ next to the notebook - verify it exists.
Image("res/merging_concat_dict_keys.png")

Source: pandas.pydata.org

In [None]:
# Build a small random frame, then slice it row-wise into three pieces that
# the following cell stitches back together with pd.concat().
# A seeded generator keeps the numbers identical on every Restart & Run All.
rng = np.random.default_rng(seed=0)
# The echoed statement matches the code that actually runs (7 rows, 4 columns).
print('df = pd.DataFrame(rng.standard_normal((7, 4)))')
df = pd.DataFrame(rng.standard_normal((7, 4)))
print(df)
print("\npieces = [df[:2], df[2:4], df[4:]]")
pieces = [df[:2], df[2:4], df[4:]]  # rows 0-1, rows 2-3, rows 4-6
pieces

In [None]:
# Echo the statement, then concatenate the row slices back into one frame
# (default axis=0 stacks them vertically, keeping their original index labels).
print("\npd.concat(pieces)")
pd.concat(pieces)

### Working with Missing Data

### np.nan is used to represent missing values

In [None]:
# Column 'x' with two missing entries (np.nan) at positions 2 and 4.
x_values = [1, 2, np.nan, 4, np.nan]
print("df_ = pd.DataFrame({'x': [1, 2, np.nan, 4, np.nan]})")
df_ = pd.DataFrame({'x': x_values})
df_

In [None]:
# Add a second column 'y' with its own missing values; the list must have
# one entry per existing row (five here).
df_["y"]=[2,np.nan,4,5, np.nan]
df_

### Dropping rows that contain missing values 

In [None]:
# Drop only the rows where 'x' is missing; NaNs in other columns are kept.
# The result is displayed, not assigned, so df_ itself is unchanged.
df_.dropna(subset=['x'])


### You can also fill the missing values with `fillna`

<b>Filling NAs with the mean</b>

In [None]:
# Per-column mean; missing values are skipped by default.
df_.mean()

In [None]:
# Replace missing 'x' values with the mean of 'x'.
# The filled frame is only displayed: nothing is assigned back, so df_
# still contains its NaNs afterwards.
df_[["x"]].fillna(value=df_.x.mean())

In [None]:
# Show df_ again: the fillna call above returned a new frame and left df_ as is.
df_

In [None]:
# Rebuild a single-column frame with one missing value and compare three
# ways of filling it. Each call returns a new frame; df_ is not modified.
df_ = pd.DataFrame({'x': [1, 2, np.nan, 4]})
filled_with_zero = df_.fillna(value=0)
# fillna(method=...) is deprecated in recent pandas releases; the dedicated
# ffill()/bfill() methods are the supported spelling.
forward_filled = df_.ffill()    # carry the last valid value forward
backward_filled = df_.bfill()   # pull the next valid value backward
print('df_.fillna(value=0)\n', filled_with_zero)
print("\ndf_.ffill()\n", forward_filled)
print("\ndf_.bfill()\n", backward_filled)

## Dummy variables
- Dummy variables are binary variables that encode the values a categorical variable can take.
- They are a convenient representation for programming, preprocessing, analysis, and ML tasks.

#### Here is an example...

In [None]:
# Two string-valued columns (A, B) and one integer column (C) used as the
# input for the dummy-variable example.
categorical_example = {
    'A': ['a', 'b', 'a'],
    'B': ['b', 'a', 'c'],
    'C': [1, 2, 3],
}
df = pd.DataFrame(categorical_example)
df

In [None]:
# One-hot encode the string columns (A, B) into indicator columns such as
# A_a / B_c; the numeric column C is passed through unchanged.
pd.get_dummies(df)

In [None]:
# Apply np.unique to each column to list its distinct values.
df.apply(np.unique)

### I/O operations
- reading data directly from csv, text, Excel files, DFS
- Pandas Data Frames are not persistent storage!
- writing data directly to file storage.

In [None]:
# Inspect the frame that is about to be written to disk.
students_df

In [None]:
# Persist the frame as CSV in the current working directory.
# index=False omits the row index, so only the data columns are written.
students_df.to_csv("students_df.csv", index=False)

In [None]:
# Show the Q&A slide image; the path is relative, so the file is presumably
# expected under res/ next to the notebook - verify it exists.
Image("res/Q&A.png")

In [None]:
# Read the CSV written earlier back into a new DataFrame to check the round
# trip. The result is displayed only; it is not assigned to a variable.
pd.read_csv('students_df.csv')