# Demo - Data Structuring Issues and Fixes

In [1]:
#Import pandas as pd
import pandas as pd

In [2]:
# Load every worksheet in one call: sheet_name=None makes read_excel return
# a dict that maps each sheet name to its own DataFrame.
example_data = pd.read_excel(
    'data_structuring_demo.xlsx',
    sheet_name=None,
)

## Unpivoting/Melting

In [3]:
# Wide-format sheet: one row per Name, one column per score bin
example1_data = example_data['Unpivot Example']
# Work on a copy so the frame held in example_data is left untouched
cleaned_example1 = example1_data.copy()
cleaned_example1

Unnamed: 0,Name,<50,50-70,70-90,90-100
0,Amy Linn,1,4,0,0
1,Marc Fletcher,2,3,0,0
2,Naima Berry,0,0,2,3
3,John Carter,1,2,2,0


In [4]:
# Unpivot (melt) the wide table into long format: 'Name' stays as the
# identifier and every other column becomes a (Binned Score, Frequency) pair.
# The melt starts from example1_data rather than from cleaned_example1 itself
# so the cell is idempotent: re-running it no longer operates on a frame
# that has already been melted.
cleaned_example1 = example1_data.melt(
    id_vars=['Name'],
    var_name='Binned Score',
    value_name='Frequency',
)
cleaned_example1

Unnamed: 0,Name,Binned Score,Frequency
0,Amy Linn,<50,1
1,Marc Fletcher,<50,2
2,Naima Berry,<50,0
3,John Carter,<50,1
4,Amy Linn,50-70,4
5,Marc Fletcher,50-70,3
6,Naima Berry,50-70,0
7,John Carter,50-70,2
8,Amy Linn,70-90,0
9,Marc Fletcher,70-90,0


## Pivoting

In [5]:
# Long-format sheet: one Revenue value per (Product Classification, Product, Year) row
example2_data = example_data['Pivot Example']
example2_data

Unnamed: 0,Product Classification,Product,Year,Revenue
0,Early Prototype,C,2021,0
1,Early Prototype,A,2021,0
2,Pilot,B,2021,3885
3,Pilot,A,2022,2193
4,Pilot,B,2022,4224
5,Product,A,2023,3918
6,Product,B,2023,5093


In [6]:
# Reshape long -> wide: one row per (Product Classification, Product) pair
# and one column per Year, filled with Revenue. Pairs that have no entry
# for a given year come out as NaN.
example2_data.pivot(
    index=["Product Classification", "Product"],
    columns="Year",
    values="Revenue",
)

Unnamed: 0_level_0,Year,2021,2022,2023
Product Classification,Product,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Early Prototype,A,0.0,,
Early Prototype,C,0.0,,
Pilot,A,,2193.0,
Pilot,B,3885.0,4224.0,
Product,A,,,3918.0
Product,B,,,5093.0


## Transpose

In [7]:
# NOTE: the key is spelled 'Tranpose' (sic). It has to match the sheet name
# in the workbook exactly, so do not "correct" the spelling here alone.
transpose_ex = example_data['Tranpose Example']
transpose_ex

Unnamed: 0,ID,1,2,3,4
0,Students,Amy Linn,Marc Fletcher,Naima Barry,John Smith
1,Test Score,95,50,100,73


In [8]:
# A plain transpose keeps 'ID' as an ordinary data row and labels the
# resulting columns 0 and 1.
transpose_ex.transpose()

Unnamed: 0,0,1
ID,Students,Test Score
1,Amy Linn,95
2,Marc Fletcher,50
3,Naima Barry,100
4,John Smith,73


In [9]:
# Promote 'ID' to the index before transposing so that its values become
# the column headers instead of ending up as a data row.
transposed_df = (
    transpose_ex
    .set_index('ID')
    .transpose()
)
transposed_df

ID,Students,Test Score
1,Amy Linn,95
2,Marc Fletcher,50
3,Naima Barry,100
4,John Smith,73


## Merging

In [10]:
# First table to merge: Movie and Viewer columns, keyed by 'ID'
merging_ex_1 = example_data['Merge Example 1']
merging_ex_1

Unnamed: 0,ID,Movie,Viewer
0,0,The Wizard of Oz (1939),"Mark,Mary"
1,1,Get Out (2017),"Tariq,Candice"
2,2,The Wizard of Oz (1939),Olga
3,3,Dunkirk (2017),"Candice,Tariq"
4,4,The Jungle Book (2016),Olga
5,5,High Noon (1952),Aaron
6,6,Get Out (2017),Olga
7,7,The Wizard of Oz (1939),Aaron


In [11]:
# Second table to merge: Review and Rating columns, keyed by 'ID'.
# Rating mixes numbers with the text 'Not Collected'.
merging_ex_2 = example_data['Merge Example 2']
merging_ex_2

Unnamed: 0,ID,Review,Rating
0,0,"Great movie, excellent plot!",5
1,1,Could have had better character development.,3
2,2,Ok.,Not Collected
3,3,"I loved it, recommended it to all my friends!",5
4,4,"A great movie, but I felt the plot was rushed.",4
5,5,Will not watch again.,1
6,6,Loved it!,Not Collected
7,7,Timeless!,Not Collected


In [12]:
# Join each viewing to its review on the shared 'ID' column. The key is named
# explicitly (instead of letting merge infer it from every column the two
# frames have in common) so the join cannot silently change if another
# shared column is ever added to either sheet. how='inner' is the default
# and is spelled out here only for clarity.
merged = pd.merge(merging_ex_1, merging_ex_2, on='ID', how='inner')
merged

Unnamed: 0,ID,Movie,Viewer,Review,Rating
0,0,The Wizard of Oz (1939),"Mark,Mary","Great movie, excellent plot!",5
1,1,Get Out (2017),"Tariq,Candice",Could have had better character development.,3
2,2,The Wizard of Oz (1939),Olga,Ok.,Not Collected
3,3,Dunkirk (2017),"Candice,Tariq","I loved it, recommended it to all my friends!",5
4,4,The Jungle Book (2016),Olga,"A great movie, but I felt the plot was rushed.",4
5,5,High Noon (1952),Aaron,Will not watch again.,1
6,6,Get Out (2017),Olga,Loved it!,Not Collected
7,7,The Wizard of Oz (1939),Aaron,Timeless!,Not Collected


## Appending

In [13]:
# First frame to append: Name, Age, Test Score (one Age value is missing)
append_ex_1 = example_data['Appending Example 1']
append_ex_1

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95
1,Marc Fletcher,15.0,50
2,Naima Berry,,100


In [14]:
# Second frame to append: same columns (one Test Score value is missing)
append_ex_2 = example_data['Appending Example 2']
append_ex_2

Unnamed: 0,Name,Age,Test Score
0,John Carter,14,
1,Dewey Cobb,14,100.0
2,Amy Linn,14,85.0


In [15]:
# Stack the two frames vertically; ignore_index=True discards the original
# row labels and renumbers the combined rows from 0.
appended_df = pd.concat(
    [append_ex_1, append_ex_2],
    ignore_index=True,
)
appended_df

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95.0
1,Marc Fletcher,15.0,50.0
2,Naima Berry,,100.0
3,John Carter,14.0,
4,Dewey Cobb,14.0,100.0
5,Amy Linn,14.0,85.0


In [16]:
# Without ignore_index=True each frame keeps its own row labels, so the
# index of the result repeats 0, 1, 2 instead of running from 0 to 5.
pd.concat([append_ex_1, append_ex_2])

Unnamed: 0,Name,Age,Test Score
0,Amy Linn,14.0,95.0
1,Marc Fletcher,15.0,50.0
2,Naima Berry,,100.0
0,John Carter,14.0,
1,Dewey Cobb,14.0,100.0
2,Amy Linn,14.0,85.0


## Group-by and Aggregation

In [17]:
# Sheet with one 'score' per row, labelled by month in the 'date' column
groupby_ex = example_data['Groupby-Agg Example']
groupby_ex

Unnamed: 0,date,score
0,March,9
1,March,1
2,March,3
3,April,5
4,April,6
5,April,4


In [18]:
# Total and average score per month. sort=False keeps the groups in the
# order they first appear (March before April) instead of sorting the keys.
(
    groupby_ex
    .groupby('date', sort=False)['score']
    .agg(['sum', 'mean'])
)

Unnamed: 0_level_0,sum,mean
date,Unnamed: 1_level_1,Unnamed: 2_level_1
March,13,4.333333
April,15,5.0


## Bonus: Advanced merging

In [2]:
# NOTE(review): pandas is already imported at the top of the notebook, and the
# execution counts restart at this cell, so this section looks like it was run
# in a separate kernel session. Confirm, then Restart & Run All to renumber.
import pandas as pd

# Read all sheets in the Excel file (sheet_name=None returns a dict of
# DataFrames keyed by sheet name)
advanced_merge = pd.read_excel('advanced_merge_example.xlsx', sheet_name=None)
adv_merge_1 = advanced_merge['Sheet1']
adv_merge_2 = advanced_merge['Sheet2']

In [3]:
# Left-hand table for the merges below: audience and rating per 'Movie ID'
adv_merge_1

Unnamed: 0,Movie Audience,Movie ID,Movie Rating
0,Kids,0,G
1,Adults,1,R
2,Teens,2,PG-13
3,Kids,3,PG
4,Kids,4,PG


In [4]:
# Right-hand table for the merges below: price, title and score per 'Movie ID'.
# Several 'Movie Title' values end with a tab character ('\t' in the output).
adv_merge_2

Unnamed: 0,Price (dollars),Movie ID,Movie Title,Score
0,14,0,The Wizard of Oz (1939)\t,5
1,26,1,Get Out (2017),3
2,12,3,Dunkirk (2017)\t,2
3,5,4,The Jungle Book (2016)\t,5
4,15,5,High Noon (1952)\t,4


In [5]:
# Right join: keep every 'Movie ID' found in adv_merge_2. IDs with no match
# in adv_merge_1 get missing values in the left-hand columns.
advm_ex = pd.merge(
    adv_merge_1,
    adv_merge_2,
    how='right',
    on=['Movie ID'],
)
advm_ex.head()

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14,The Wizard of Oz (1939)\t,5
1,Adults,1,R,26,Get Out (2017),3
2,Kids,3,PG,12,Dunkirk (2017)\t,2
3,Kids,4,PG,5,The Jungle Book (2016)\t,5
4,,5,,15,High Noon (1952)\t,4


In [12]:
# Left join: keep every 'Movie ID' found in adv_merge_1. IDs with no match
# in adv_merge_2 get missing values in the right-hand columns.
advm_ex2 = pd.merge(
    adv_merge_1,
    adv_merge_2,
    how='left',
    on=['Movie ID'],
)
advm_ex2.head()

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14.0,The Wizard of Oz (1939)\t,5.0
1,Adults,1,R,26.0,Get Out (2017),3.0
2,Teens,2,PG-13,,,
3,Kids,3,PG,12.0,Dunkirk (2017)\t,2.0
4,Kids,4,PG,5.0,The Jungle Book (2016)\t,5.0


In [6]:
# Inner join: keep only the 'Movie ID' values that appear in both frames.
advm_ex3 = pd.merge(
    adv_merge_1,
    adv_merge_2,
    how='inner',
    on=['Movie ID'],
)
advm_ex3.head()

Unnamed: 0,Movie Audience,Movie ID,Movie Rating,Price (dollars),Movie Title,Score
0,Kids,0,G,14,The Wizard of Oz (1939)\t,5
1,Adults,1,R,26,Get Out (2017),3
2,Kids,3,PG,12,Dunkirk (2017)\t,2
3,Kids,4,PG,5,The Jungle Book (2016)\t,5
