### Data Wrangling with Python Video

In [1]:
import pandas as pd

### Concatenation

In [8]:
# Raw data for the first frame, written one record per line (id, month, sales)
# and transposed into a {column name: list of values} dict.
# Every value is a string, including the ids and the sales amounts.
df1_dummy = {
    column: list(values)
    for column, values in zip(
        ("serial_id", "sale_month", "sales"),
        zip(
            ("1", "Jan", "12300"),
            ("2", "Feb", "25100"),
            ("3", "Mar", "17800"),
            ("4", "Apr", "20100"),
            ("5", "May", "21000"),
        ),
    )
}

In [9]:
# Turn df1_dummy into a DataFrame; the explicit column list pins the column order.
df1 = pd.DataFrame(
    data=df1_dummy,
    columns=["serial_id", "sale_month", "sales"],
)
df1

Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000


In [5]:
# Raw data for the second frame, written one record per line (id, month, sales)
# and transposed into a {column name: list of values} dict.
# Same three columns as df1_dummy; every value is a string.
df2_dummy = {
    column: list(values)
    for column, values in zip(
        ("serial_id", "sale_month", "sales"),
        zip(
            ("6", "Jun", "25000"),
            ("7", "Jul", "23700"),
            ("8", "Aug", "24600"),
            ("9", "Sep", "24000"),
            ("10", "Oct", "23950"),
        ),
    )
}

In [10]:
# Turn df2_dummy into a DataFrame with the same column order as df1.
df2 = pd.DataFrame(
    data=df2_dummy,
    columns=["serial_id", "sale_month", "sales"],
)
df2

Unnamed: 0,serial_id,sale_month,sales
0,6,Jun,25000
1,7,Jul,23700
2,8,Aug,24600
3,9,Sep,24000
4,10,Oct,23950


In [11]:
# Raw data for the third frame: two "Yes"/"No" flag columns with ten entries each.
# Written one record per line (sales_threshold, bonus_threshold) and transposed
# into a {column name: list of values} dict.
df3_dummy = {
    column: list(values)
    for column, values in zip(
        ("sales_threshold", "bonus_threshold"),
        zip(
            ("No", "No"),
            ("Yes", "Yes"),
            ("No", "No"),
            ("Yes", "No"),
            ("Yes", "No"),
            ("Yes", "Yes"),
            ("Yes", "No"),
            ("Yes", "Yes"),
            ("Yes", "Yes"),
            ("Yes", "No"),
        ),
    )
}

In [16]:
# Turn df3_dummy into a DataFrame holding the two threshold flag columns.
df3 = pd.DataFrame(
    data=df3_dummy,
    columns=["sales_threshold", "bonus_threshold"],
)
df3

Unnamed: 0,sales_threshold,bonus_threshold
0,No,No
1,Yes,Yes
2,No,No
3,Yes,No
4,Yes,No
5,Yes,Yes
6,Yes,No
7,Yes,Yes
8,Yes,Yes
9,Yes,No


In [19]:
# Stack df1 on top of df2 (row-wise concatenation).
# ignore_index=True drops the two original 0-4 indices and renumbers the result 0-9.
# Without it the labels 0-4 would appear twice, and the later column-wise
# concatenation of df_row with df3 fails on the duplicate index values.
df_row = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
df_row

Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000
5,6,Jun,25000
6,7,Jul,23700
7,8,Aug,24600
8,9,Sep,24000
9,10,Oct,23950


In [20]:
# Put df3's columns next to df_row's columns (axis=1 concatenates column-wise).
# df_row needs a unique index here: if it still carried duplicate index values,
# this call would throw an error.
df_full = pd.concat(objs=[df_row, df3], axis=1)
df_full

Unnamed: 0,serial_id,sale_month,sales,sales_threshold,bonus_threshold
0,1,Jan,12300,No,No
1,2,Feb,25100,Yes,Yes
2,3,Mar,17800,No,No
3,4,Apr,20100,Yes,No
4,5,May,21000,Yes,No
5,6,Jun,25000,Yes,Yes
6,7,Jul,23700,Yes,No
7,8,Aug,24600,Yes,Yes
8,9,Sep,24000,Yes,Yes
9,10,Oct,23950,Yes,No


### Append

#### `DataFrame.append` is deprecated (and removed in pandas 2.0); use `pd.concat` instead
- https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
- https://www.geeksforgeeks.org/how-to-concatenate-two-or-more-pandas-dataframes/

In [24]:
# The old spelling was df1.append(df2); append is deprecated, so pd.concat is used
# to accomplish the same thing.
# ignore_index is not set, so each frame keeps its own 0-4 labels and the
# resulting index contains every label twice.
df_append = pd.concat(objs=[df1, df2], axis=0)
df_append


Unnamed: 0,serial_id,sale_month,sales
0,1,Jan,12300
1,2,Feb,25100
2,3,Mar,17800
3,4,Apr,20100
4,5,May,21000
0,6,Jun,25000
1,7,Jul,23700
2,8,Aug,24600
3,9,Sep,24000
4,10,Oct,23950


In [27]:
# Row-wise concatenation of two frames that share no columns: the result has all
# five columns, and the rows coming from df_append show NaN in the two threshold
# columns (see the output below).
# NOTE(review): if the goal was to place df3's columns beside the sales rows,
# that needs axis=1 and a unique index, as done for df_full - confirm intent.
df_append_full = pd.concat(objs=[df_append, df3], axis=0)
df_append_full

Unnamed: 0,serial_id,sale_month,sales,sales_threshold,bonus_threshold
0,1.0,Jan,12300.0,,
1,2.0,Feb,25100.0,,
2,3.0,Mar,17800.0,,
3,4.0,Apr,20100.0,,
4,5.0,May,21000.0,,
0,6.0,Jun,25000.0,,
1,7.0,Jul,23700.0,,
2,8.0,Aug,24600.0,,
3,9.0,Sep,24000.0,,
4,10.0,Oct,23950.0,,
