## Pandas Data Frames

- Applying functions, working with Numpy functions
- Concatenating & Merging
- Dealing with Missing values
- Dummy Variables
- I/O operations

In [19]:
%matplotlib inline
import numpy as np
import pandas as pd
from IPython.display import Image
from IPython.display import HTML
from IPython.display import Markdown, display
def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)


In [20]:
from IPython.display import display, HTML

# Stylesheet for the notebook output: centre the output and limit the
# output area to 80% of the available width.
CSS = """
.output {
    align-items: center;
}
div.output_area {
    width: 80%;
}
"""
# The HTML object is the cell's last expression, so the <style> tag is rendered.
style_tag = '<style>' + CSS + '</style>'
HTML(style_tag)

In [21]:
# One record per student, in the order of `student_columns`.
student_columns = ['name', 'city', 'age', 'py-score', 'js-score']
student_records = [
    ('Xavier', 'Mexico City', 41, 88.0, 71.0),
    ('Ann', 'Toronto', 28, 79.0, 95.0),
    ('Jana', 'Prague', 33, 81.0, 88.0),
    ('Yi', 'Shanghai', 34, 80.0, 79.0),
    ('Robin', 'Manchester', 38, 68.0, 91.0),
    ('Amal', 'Cairo', 31, 61.0, 91.0),
    ('Nori', 'Osaka', 37, 84.0, 80.0),
]

# Turn the records into the column -> list-of-values mapping used to build the frame.
data = {
    column: [record[position] for record in student_records]
    for position, column in enumerate(student_columns)
}

students_df = pd.DataFrame(data=data)
students_df

Unnamed: 0,name,city,age,py-score,js-score
0,Xavier,Mexico City,41,88.0,71.0
1,Ann,Toronto,28,79.0,95.0
2,Jana,Prague,33,81.0,88.0
3,Yi,Shanghai,34,80.0,79.0
4,Robin,Manchester,38,68.0,91.0
5,Amal,Cairo,31,61.0,91.0
6,Nori,Osaka,37,84.0,80.0


### Applying functions

### Applying Arithmetic functions

In [22]:
# Show the frame as it stands before the computed 'total' column is added.
students_df

Unnamed: 0,name,city,age,py-score,js-score
0,Xavier,Mexico City,41,88.0,71.0
1,Ann,Toronto,28,79.0,95.0
2,Jana,Prague,33,81.0,88.0
3,Yi,Shanghai,34,80.0,79.0
4,Robin,Manchester,38,68.0,91.0
5,Amal,Cairo,31,61.0,91.0
6,Nori,Osaka,37,84.0,80.0


In [26]:
## Feature engineering: add a computed column holding the weighted mean of the
## two scores (weight 0.4 for py-score, 0.3 for js-score, divided by 0.7).
py_weighted = 0.4 * students_df['py-score']
js_weighted = 0.3 * students_df['js-score']
students_df['total'] = (py_weighted + js_weighted) / 0.7
students_df


Unnamed: 0,name,city,age,py-score,js-score,total
0,Xavier,Mexico City,41,88.0,71.0,80.714286
1,Ann,Toronto,28,79.0,95.0,85.857143
2,Jana,Prague,33,81.0,88.0,84.0
3,Yi,Shanghai,34,80.0,79.0,79.571429
4,Robin,Manchester,38,68.0,91.0,77.857143
5,Amal,Cairo,31,61.0,91.0,73.857143
6,Nori,Osaka,37,84.0,80.0,82.285714


In [None]:
# Positional slice: every row, columns at positions 3 and 4 (py-score, js-score).
students_df.iloc[:, 3:5]

### using existing NumPy function instead of writing your own function

In [24]:
# Row-wise weighted mean of the two score columns (positions 3 and 4),
# computed by NumPy instead of a hand-written formula.
score_columns = students_df.iloc[:, 3:5]
np.average(score_columns, axis=1, weights=[0.4, 0.3])

array([80.71428571, 85.85714286, 84.        , 79.57142857, 77.85714286,
       73.85714286, 82.28571429])

In [None]:
# Store NumPy's row-wise weighted mean of the score columns as 'total'.
row_weighted_mean = np.average(students_df.iloc[:, 3:5], weights=[0.4, 0.3], axis=1)
students_df['total'] = row_weighted_mean

In [None]:
# Selecting a single column by label returns it as a Series.
students_df['py-score']

### Apply function

In [27]:
# DataFrame.apply passes each selected column to the function; the result is
# only displayed, students_df is not modified.
students_df[["py-score"]].apply(lambda column: column * 10)

Unnamed: 0,py-score
0,880.0
1,790.0
2,810.0
3,800.0
4,680.0
5,610.0
6,840.0


In [28]:
# The apply() call above was not assigned back, so the frame is unchanged.
students_df

Unnamed: 0,name,city,age,py-score,js-score,total
0,Xavier,Mexico City,41,88.0,71.0,80.714286
1,Ann,Toronto,28,79.0,95.0,85.857143
2,Jana,Prague,33,81.0,88.0,84.0
3,Yi,Shanghai,34,80.0,79.0,79.571429
4,Robin,Manchester,38,68.0,91.0,77.857143
5,Amal,Cairo,31,61.0,91.0,73.857143
6,Nori,Osaka,37,84.0,80.0,82.285714


In [29]:
# Flag students whose Python score is at least 80. The comparison already
# evaluates to a bool, so no `True if ... else False` wrapper is needed.
students_df['good_student'] = students_df['py-score'].apply(lambda score: score >= 80)

In [32]:
# Square root of both score columns, using a NumPy function with apply().
score_labels = ["py-score", "js-score"]
students_df[score_labels].apply(np.sqrt)

Unnamed: 0,py-score,js-score
0,9.380832,8.42615
1,8.888194,9.746794
2,9.0,9.380832
3,8.944272,8.888194
4,8.246211,9.539392
5,7.81025,9.539392
6,9.165151,8.944272


In [None]:
# Inspect the frame, which now also carries the 'good_student' column.
students_df

In [None]:
# A scalar assigned to a column is broadcast to every row. The demo goes
# through assign(), which returns a modified copy, so the weighted 'total'
# stored in students_df is not overwritten (later cells still show and export it).
students_df.assign(total=50)

## Map function with Lambda

In [None]:
# Add 10 to every Python score with the built-in map() and a lambda.
# The result is shown via assign(), which returns a modified copy, instead of
# being written back: writing back makes the cell non-idempotent (every run,
# and the equivalent apply() cell, would add another 10 to the stored scores).
bumped_scores = list(map(lambda score: score + 10, students_df["py-score"]))
students_df.assign(**{"py-score": bumped_scores})

In [None]:
# Same +10 shift, this time with Series.apply. Displayed through assign()
# (a modified copy) so that re-running the cell does not keep adding 10 to
# the scores stored in students_df.
students_df.assign(
    **{"py-score": students_df["py-score"].apply(lambda score: score + 10)}
)

Create a new column called 'py-score-half' which stores the python score divided by 2, using list(map())

In [33]:
# Halve every Python score with map() + lambda and store the result as a new column.
halved_scores = map(lambda score: score / 2, students_df['py-score'])
students_df['py-score-half'] = list(halved_scores)

In [34]:
# Inspect the frame with the new 'py-score-half' column.
students_df

Unnamed: 0,name,city,age,py-score,js-score,total,good_student,py-score-half
0,Xavier,Mexico City,41,88.0,71.0,80.714286,True,44.0
1,Ann,Toronto,28,79.0,95.0,85.857143,False,39.5
2,Jana,Prague,33,81.0,88.0,84.0,True,40.5
3,Yi,Shanghai,34,80.0,79.0,79.571429,True,40.0
4,Robin,Manchester,38,68.0,91.0,77.857143,False,34.0
5,Amal,Cairo,31,61.0,91.0,73.857143,False,30.5
6,Nori,Osaka,37,84.0,80.0,82.285714,True,42.0


### Concatenating
- Concatenate multiple DataFrames along the same axis.
- pd.concat()

In [None]:
# Display the concat illustration stored in the local res/ folder.
Image("res/merging_concat_dict_keys.png")

source:pandas.pydata.org

In [35]:
# Build a 7x4 frame of standard-normal draws and cut it into three row-wise
# pieces. A seeded generator makes the example reproducible across runs, and
# the printed caption now matches the code (it used to claim a 10x4 frame).
print('df = pd.DataFrame(rng.standard_normal((7, 4)))')
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((7, 4)))
print(df)
print("\npieces = [df[:2], df[2:4], df[4:]]")
pieces = [df[:2], df[2:4], df[4:]]
pieces

df = pd.DataFrame(np.random.randn(10, 4))
          0         1         2         3
0  1.405395  0.851792  0.259943  2.272622
1  1.249258 -1.316489  1.882245 -0.052090
2  0.221737 -1.504714  1.020847  0.896393
3  0.179393 -0.697228  0.440300  0.812013
4 -0.081299  0.730563  0.419408 -0.852893
5 -1.280963 -0.672726 -0.195521 -1.241587
6 -0.885539 -1.244709 -1.571865 -0.016223

pieces = [df[:2], df[2:4], df[4:]]


[          0         1         2         3
 0  1.405395  0.851792  0.259943  2.272622
 1  1.249258 -1.316489  1.882245 -0.052090,
           0         1         2         3
 2  0.221737 -1.504714  1.020847  0.896393
 3  0.179393 -0.697228  0.440300  0.812013,
           0         1         2         3
 4 -0.081299  0.730563  0.419408 -0.852893
 5 -1.280963 -0.672726 -0.195521 -1.241587
 6 -0.885539 -1.244709 -1.571865 -0.016223]

In [36]:
# Stitch the row-wise pieces back together into a single frame.
print("\npd.concat(pieces)")
reassembled = pd.concat(pieces)
reassembled


pd.concat(pieces)


Unnamed: 0,0,1,2,3
0,1.405395,0.851792,0.259943,2.272622
1,1.249258,-1.316489,1.882245,-0.05209
2,0.221737,-1.504714,1.020847,0.896393
3,0.179393,-0.697228,0.4403,0.812013
4,-0.081299,0.730563,0.419408,-0.852893
5,-1.280963,-0.672726,-0.195521,-1.241587
6,-0.885539,-1.244709,-1.571865,-0.016223


### Working with Missing Data

### np.nan is used to represent missing values

Missing values can be represented by: null, NaN, na

In [43]:
# np.nan marks the missing entries; the constructor call is printed as a caption.
print("df_ = pd.DataFrame({'x': [1, 2, np.nan, 4, np.nan]})")
x_values = [1, 2, np.nan, 4, np.nan]
df_ = pd.DataFrame({'x': x_values})
df_

df_ = pd.DataFrame({'x': [1, 2, np.nan, 4, np.nan]})


Unnamed: 0,x
0,1.0
1,2.0
2,
3,4.0
4,


In [44]:
# Add a second column 'y' with its own missing entries.
y_values = [2, np.nan, 4, 5, np.nan]
df_["y"] = y_values
df_

Unnamed: 0,x,y
0,1.0,2.0
1,2.0,
2,,4.0
3,4.0,5.0
4,,


In [46]:
# Replace the missing entries of 'x' with 0. The original call `fillna(value=)`
# had no argument and was a syntax error; 0 is the fill value visible in the
# recorded output of the following cell.
df_['x'] = df_['x'].fillna(value=0)

In [50]:
# Double brackets select a list of columns, so this is a one-column DataFrame.
df_[['x']]

Unnamed: 0,x
0,1.0
1,2.0
2,0.0
3,4.0
4,0.0


In [53]:
# Take column 'x' as a one-column DataFrame (double brackets) and copy it, so
# that x_df supports column assignment and is independent of df_.
# `df_['x']` alone would give a Series, where ['x'] addresses an element, not a column.
x_df = df_[['x']].copy()

In [54]:
# NOTE(review): this reads as a column assignment, which needs x_df to be a
# DataFrame; on a Series, ['x'] addresses an element labelled 'x' instead — confirm intent.
x_df['x'] = df_['x']

### Dropping rows that contain missing values 

In [56]:
# Print the docstring of DataFrame.dropna (signature and parameters).
help(pd.DataFrame.dropna)

Help on function dropna in module pandas.core.frame:

dropna(self, *, axis: 'Axis' = 0, how: 'str | NoDefault' = <no_default>, thresh: 'int | NoDefault' = <no_default>, subset: 'IndexLabel' = None, inplace: 'bool' = False) -> 'DataFrame | None'
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at l

In [55]:
# Drop every row that contains at least one missing value. how='any' matches
# the section heading and the recorded output; how='all' would only drop rows
# in which every value is missing.
df_.dropna(how='any')


Unnamed: 0,x,y
0,1.0,2.0
2,0.0,4.0
3,4.0,5.0


### and you can fill the missing values with fillna..

<b>Filling NAs with the mean</b>

In [10]:
# Column-wise mean; missing values are skipped by default.
df_.mean()

x    2.333333
y    3.666667
dtype: float64

In [37]:
# Fill each column's missing values with that column's mean. The result is
# only displayed; df_ itself is left as it was.
column_means = df_.mean()
df_.fillna(value=column_means)

Unnamed: 0,x
0,1.0
1,2.0
2,2.333333
3,4.0


In [38]:
# df_ still has its missing values: the fillna() result above was not assigned back.
df_

Unnamed: 0,x
0,1.0
1,2.0
2,
3,4.0


In [41]:
# Display df_ once more.
df_

Unnamed: 0,x
0,1.0
1,2.0
2,
3,4.0


In [40]:
# Print the docstring of DataFrame.fillna (signature and parameters).
help(pd.DataFrame.fillna)

Help on function fillna in module pandas.core.frame:

fillna(self, value: 'Hashable | Mapping | Series | DataFrame' = None, *, method: 'FillnaOptions | None' = None, axis: 'Axis | None' = None, inplace: 'bool' = False, limit: 'int | None' = None, downcast: 'dict | None' = None) -> 'DataFrame | None'
    Fill NA/NaN values using the specified method.
    
    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame).  Values not
        in the dict/Series/DataFrame will not be filled. This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use next valid observation to f

In [18]:
# Compare three ways of filling the single missing value in 'x':
# a constant, the previous valid value (forward fill), the next valid value (backward fill).
df_ = pd.DataFrame({'x': [1, 2, np.nan, 4]})
print('df_.fillna(value=0)\n', df_.fillna(value=0))
# fillna(method=...) is deprecated since pandas 2.1; ffill() / bfill() are the
# direct equivalents, and the printed captions name the calls actually made.
print("\ndf_.ffill()\n", df_.ffill())
print("\ndf_.bfill()\n", df_.bfill())

df_.fillna(value=0)
      x
0  1.0
1  2.0
2  0.0
3  4.0

df_.fillna(method=ffill)
      x
0  1.0
1  2.0
2  2.0
3  4.0

df_.fillna(method=bfill)
      x
0  1.0
1  2.0
2  4.0
3  4.0


## Dummy variables
- Dummy variables are binary variables that represent the values that categorical variables hold. 
- They are an efficient representation for programming, preprocessing, analysis, and machine-learning tasks.

#### Here is an example...

In [None]:
# Small example frame: two string columns (A, B) and one integer column (C).
df = pd.DataFrame(
    {
        'A': list('aba'),
        'B': list('bac'),
        'C': [1, 2, 3],
    }
)
df

In [None]:
# One-hot encode df: by default get_dummies converts the string columns
# (A and B) into indicator columns and leaves the numeric column C as is.
pd.get_dummies(df)

In [None]:
# Distinct values of each column (apply passes the columns to np.unique one by one).
df.apply(np.unique)

### I/O operations
- reading data directly from csv, text, Excel files, DFS
- Pandas Data Frames are not persistent storage!
- writing data directly to file storage.

In [None]:
# The frame that is written to disk in the next cell.
students_df

In [None]:
# Write the frame to a CSV file in the working directory; index=False leaves
# the row index out of the file.
students_df.to_csv("students_df.csv", index=False)

In [None]:
# Display the Q&A image stored in the local res/ folder.
Image("res/Q&A.png")

In [None]:
# Load students_df.csv from the working directory into a new DataFrame.
pd.read_csv('students_df.csv')

In [61]:
# Toy sales log: one entry per sale (seller name, sale price, weekday).
name = ['Francesco', 'Philipp', 'Joanna', 'Francesco', 'Joanna', 'Francesco']
sale_price = [50, 80, 65, 70, 95, 100]
day_of_the_week = ['Monday', 'Friday', 'Tuesday', 'Friday', 'Tuesday', 'Friday']

# Column label -> values; the weekday list is exposed under the shorter label 'day'.
dict_ = dict(name=name, sale_price=sale_price, day=day_of_the_week)
sales_df = pd.DataFrame(dict_)
sales_df

Unnamed: 0,name,sale_price,day
0,Francesco,50,Monday
1,Philipp,80,Friday
2,Joanna,65,Tuesday
3,Francesco,70,Friday
4,Joanna,95,Tuesday
5,Francesco,100,Friday


In [64]:
# Total sale price per seller (rows) and weekday (columns); seller/day
# combinations without any sale show up as NaN.
sales_df.pivot_table(
    values='sale_price',
    index='name',
    columns='day',
    aggfunc='sum',
)

day,Friday,Monday,Tuesday
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Francesco,170.0,50.0,
Joanna,,,160.0
Philipp,80.0,,


In [65]:
# Print the docstring of DataFrame.pivot_table (signature and parameters).
help(pd.DataFrame.pivot_table)

Help on function pivot_table in module pandas.core.frame:

pivot_table(self, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True) -> 'DataFrame'
    Create a spreadsheet-style pivot table as a DataFrame.
    
    The levels in the pivot table will be stored in MultiIndex objects
    (hierarchical indexes) on the index and columns of the result DataFrame.
    
    Parameters
    ----------
    values : column to aggregate, optional
    index : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The
        list can contain any of the other types (except list).
        Keys to group by on the pivot table index.  If an array is passed,
        it is being used as the same manner as column values.
    columns : column, Grouper, array, or list of the previous
        If an array is passed, it must be the same length as the data. The

In [67]:
# (number of rows, number of columns) of the sales frame.
sales_df.shape

(6, 3)