# <u>Operations.

There are lots of operations with pandas that will be really useful to us, but don't fall into any distinct category. Let's show them here in this lecture:

# <u>Creating a DataFrame.

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Sample frame used throughout this lecture; note that 444 appears twice in col2.

df = pd.DataFrame(
    data={
        'col1': [1, 2, 3, 4],
        'col2': [444, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


### df.head() → This is a method that returns only the first 5 rows of the DataFrame by default.

    Useful when the dataset is large, and we only want to quickly inspect the top few rows.

    We can also specify how many rows to see, e.g. df.head(2) → shows the first 2 rows.

---

# <u>Info on Unique Values.

### unique():

    This method returns a numpy array of unique values in the indexed column.

In [9]:
df['col2'].unique()

array([444, 555, 666], dtype=int64)

### nunique():

    This method returns number of unique values in the indexed column.

In [11]:
df['col2'].nunique()

3

In [12]:
# Or we could pass the array returned by unique() to the built-in len() function.

len(df['col2'].unique())

3

### value_counts():

    This method returns a Series showing how many times each unique value occurs in the indexed column.

In [14]:
df['col2'].value_counts()

col2
444    2
555    1
666    1
Name: count, dtype: int64

---

# <u>Selecting Data using Conditional Selection (Quick Reminder from previous lectures).

In [17]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [18]:
df['col1'] > 2

0    False
1    False
2     True
3     True
Name: col1, dtype: bool

In [19]:
df['col2'] == 444

0     True
1    False
2    False
3     True
Name: col2, dtype: bool

In [20]:
#Select from DataFrame using criteria from multiple columns

newdf = df[(df['col1'] > 2) & (df['col2'] == 444)]

In [21]:
newdf

Unnamed: 0,col1,col2,col3
3,4,444,xyz


In [22]:
newdf1 = df[(df['col1'] > 2) | (df['col2'] == 444)]
newdf1

Unnamed: 0,col1,col2,col3
0,1,444,abc
2,3,666,ghi
3,4,444,xyz


---

# <u>Applying Functions.

In [25]:
# A function that returns its argument multiplied by 2.

def times2(x):
    """Return ``x * 2``: numbers are doubled, sequences such as str/list are repeated twice."""
    doubled = x * 2
    return doubled

In [26]:
times2(2)

4

In [27]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [28]:
# Built-in function.
# We already know this.

df['col1'].sum()

10

### apply(func_1, axis = 0):

    This method allows us to apply a function along an axis of the DataFrame.

In [30]:
df['col1'].apply(times2)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

**We can also apply built-in functions such as len():**

In [32]:
# Returns the length of string of every value in col3.

df['col3'].apply(len)

0    3
1    3
2    3
3    3
Name: col3, dtype: int64

### apply is going to be especially powerful when we combine it with lambda expressions.

**QUICK REVISIT OF LAMBDA EXPRESSION**

map(): Applies a specified function to every item in an iterable.
Syntax: 
    
    for item in map(func, iterables):
        print(item)

    OR

    list(map(func, iterables))

filter(): Returns an iterator yielding those items of the iterable for which func(item) is true.
Syntax:

    for item in filter(func, iterables):
        print(item)    

    OR

    list(filter(func, iterables))

lambda expressions: Anonymous functions.
Syntax:

    lambda arguments: expression

    Ex:

    def square(num):
        return num**2

    Convert into lambda expression:

    def square(num): return num**2 ---> lambda num: num**2 ---> square = lambda num: num**2
    Run: square(5)

map() with a lambda expression:

    my_nums = [1,2,3,4,5]

    list(map(lambda num: num**2, my_nums))
    
filter() with a lambda expression:

    list(filter(lambda num: num%2==0, my_nums))

### Example: Return the values in column 2 of the DataFrame (df) times 2 using a lambda expression.

In [36]:
df['col2'].apply(lambda num: num * 2)

0     888
1    1110
2    1332
3     888
Name: col2, dtype: int64

- NOTE:

    - Don't use lambda expressions for complex functions.

---

# <u>Permanently Removing a Column.

In [40]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [41]:
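# For a column (axis = 1).
# With inplace = False (the default) drop() returns a new DataFrame and df itself is unchanged;
# pass inplace = True, or reassign the result, to remove the column permanently.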

df.drop('col3', axis = 1, inplace = False)

Unnamed: 0,col1,col2
0,1,444
1,2,555
2,3,666
3,4,444


In [42]:
# For a row.

df.drop(3, axis = 0, inplace = False)

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi


In [43]:
# Or We can use the del keyword.
# Commented out since we want to keep our original DataFrame.

# del df['col1']

In [44]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


---

# <u>Get column and index names:

This is going to be especially useful when we're trying to index a column or know the number of rows if the DataFrame is quite large.

In [48]:
# .columns = attribute

df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [49]:
# start = inclusive, stop = exclusive

df.index

RangeIndex(start=0, stop=4, step=1)

---

# <u>Sorting and Ordering a DataFrame:

Sort by the values.

Syntax: df.sort_values(by = 'column_label', inplace = False)

In [52]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [53]:
df.sort_values(by = 'col2') # inplace = False by default

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


In [54]:
df.sort_values('col2')

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


In [55]:
df.sort_values('col1')

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [56]:
df.sort_values('col3')

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


- <u>NOTE:

    - Notice how the index stays attached to the row.
    - So we don't ever lose that information there.

---

# <u>Find Null Values or Check for Null Values.

isnull():

    This method returns a DataFrame of boolean values of the same shape: True where the value is NaN (missing), False otherwise.

In [61]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [62]:
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


---

# <u>dropna and fillna

In [65]:
df

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [66]:
# Drop rows with NaN Values
# Syntax: df.dropna(axis = 0, thresh = <optional int>, inplace = False)
# thresh = Minimum number of non-NaN values a row (or column) must contain to be kept (threshold).

df.dropna()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [67]:
# Rebuild df with one missing (NaN) value in each of the two numeric columns.
df = pd.DataFrame(
    data={
        'col1': [1, 2, 3, np.nan],
        'col2': [np.nan, 555, 666, 444],
        'col3': ['abc', 'def', 'ghi', 'xyz'],
    }
)
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [68]:
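# Syntax: df.fillna(value = 'FILL VALUE', method = None, axis = 0, inplace = False)
# OR Syntax: indexed_column_or_row.fillna(value = 'FILL VALUE', method = None, inplace = False)
# NOTE: `method` is deprecated in recent pandas (2.1+); prefer df.ffill() / df.bfill() instead.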

df.fillna('FILL')

Unnamed: 0,col1,col2,col3
0,1.0,FILL,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,FILL,444.0,xyz


---

# <u>pivot_table(): 

    Creates a spreadsheet-style pivot table as a DataFrame.

    Syntax: df.pivot_table(values, index = ['A', 'B'], columns = ['C'])
            where;
                    values = Column or columns to aggregate.

In [71]:
# Sample records: A, B and C hold repeating labels, D holds the numeric values.
data = dict(
    A=['foo', 'foo', 'foo', 'bar', 'bar', 'bar'],
    B=['one', 'one', 'two', 'two', 'one', 'one'],
    C=['x', 'y', 'x', 'y', 'x', 'y'],
    D=[1, 3, 2, 5, 4, 1],
)

df = pd.DataFrame(data)

In [72]:
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


- <u>NOTE:

    - We have repeating values in columns 'A', 'B' and 'C'.
    - Basically what we're going to do is just create a multi index out of this DataFrame.

In [74]:
df.pivot_table(values = 'D', index = ['A', 'B'], columns = ['C'])

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,


- <u>NOTE:

    - We created a multi-index by doing index = ['A', 'B'].
    - And created columns of x and y by doing columns = ['C']
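    - And we are aggregating on column 'D' (using the default aggregation, the mean, which is why the results are floats and missing combinations show as NaN).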

---