In [0]:
import pandas as pd



In [0]:
%fs 
cp dbfs:/FileStore/shared_uploads/zaderohish5@gmail.com/orders.txt file:/databricks/driver/

In [0]:
# Load the orders file into a DataFrame. The path is relative, so this
# presumably relies on the working directory being /databricks/driver, where
# the previous cell copied orders.txt — confirm if run elsewhere.
df = pd.read_csv('orders.txt')
# A bare last expression renders the (truncated) frame as the cell output.
df

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.970,109.6113
1,CA-2018-100090,2018-07-08,Ed Braxton,San Francisco,Furniture,FUR-TA-10003715,502.488,-87.9354
2,CA-2018-100293,2018-03-14,Neil Französisch,Jacksonville,Office Supplies,OFF-PA-10000176,91.056,31.8696
3,CA-2018-100328,2018-01-28,Jasper Cacioppo,New York City,Office Supplies,OFF-BI-10000343,3.928,1.3257
4,CA-2018-100363,2018-04-08,Jim Mitchum,Glendale,Office Supplies,OFF-FA-10000611,2.368,0.8288
...,...,...,...,...,...,...,...,...
5004,US-2021-168802,2021-11-03,Jack O'Briant,Seattle,Office Supplies,OFF-BI-10002393,18.368,5.9696
5005,US-2021-169320,2021-07-23,Lena Hernandez,Elkhart,Office Supplies,OFF-AR-10003602,11.680,5.4896
5006,US-2021-169488,2021-09-07,Allen Armold,Providence,Office Supplies,OFF-PA-10000157,39.960,18.7812
5007,US-2021-169502,2021-08-28,Matthew Grinstein,Milwaukee,Office Supplies,OFF-AP-10001947,91.600,26.5640


### Pandas data aggregations
Aggregations in Python using Pandas allow you to compute summary statistics like mean, sum, count, etc., across rows or columns in a DataFrame.

#### Basic Aggregations

In [0]:
# Sum: adds up every value in the 'sales' column and returns a single number.
df['sales'].sum()

Out[4]: 1107592.9048

In [0]:
# Selecting a single column with [] gives a pandas Series (see the output);
# the aggregation methods in this section are called on that Series.
type(df['sales'])

Out[5]: pandas.core.series.Series

In [0]:
# Mean: the arithmetic average of the 'sales' column.
df['sales'].mean()

Out[7]: 221.12056394489917

In [0]:
# Median: the middle value of the 'sales' column once the values are sorted.
# Here it is far below the mean, which suggests a few very large orders pull
# the average up.
df['sales'].median()

Out[8]: 56.52

In [0]:
# Min: the smallest value in the 'sales' column.
df['sales'].min()

Out[9]: 0.444

In [0]:
# Max: the largest value in the 'sales' column.
df['sales'].max()

Out[10]: 10499.97

In [0]:
# Count: the number of non-null entries in 'sales'. Missing values are not
# counted, so this can be smaller than len(df).
df['sales'].count()

Out[12]: 5009

In [0]:
# Filtered aggregation: total sales for a single category.
# `fc` is a boolean mask, True for the rows whose category is 'Technology'.
fc = (df['category'] == 'Technology')
# .loc[mask, column] keeps only the masked rows of 'sales', which are then summed.
df.loc[fc ,'sales'].sum()

Out[13]: 180636.84800000003

#### Grouped Aggregations
You can also perform these operations grouped by a specific column.

In [0]:
# Split the rows by 'category' and sum 'sales' inside each group.
# The group labels become the index of the result (see the output).
df_category_sales = df.groupby('category')['sales'].sum()
df_category_sales

Out[16]: category
Furniture          628359.3048
Office Supplies    298596.7520
Technology         180636.8480
Name: sales, dtype: float64

In [0]:
# Aggregating a single selected column of a groupby returns a Series,
# not a DataFrame (see the output).
type(df_category_sales)

Out[17]: pandas.core.series.Series

In [0]:
# Series to DataFrame: reset_index() moves the 'category' index labels into a
# regular column and keeps the values under the Series name ('sales'). This
# yields the same two-column frame with a default integer index, without
# rebuilding it by hand from .index/.values, and it keeps working when the
# index has more than one level.
df_category_sales.reset_index()

Unnamed: 0,category,sales
0,Furniture,628359.3048
1,Office Supplies,298596.752
2,Technology,180636.848


In [0]:
# Grouping by two columns gives one total per (category, city) pair.
# Note: despite the `_df` suffix, the result is a Series — the output ends
# with `Name: sales, ... dtype: float64`.
cat_sales_df = df.groupby(['category', 'city'])['sales'].sum()
cat_sales_df

Out[22]: category    city      
Furniture   Akron          433.5960
            Alexandria     204.6400
            Allen          244.0060
            Allentown       11.6480
            Amarillo      3048.5828
                            ...    
Technology  Wichita        224.7500
            Wilmington     694.3560
            Woodland       239.9840
            Yonkers        216.4000
            Yuma           785.5130
Name: sales, Length: 987, dtype: float64

In [0]:
# Grouping by two keys produces a MultiIndex: every label is a
# (category, city) tuple, and the levels are named after the groupby columns.
cat_sales_df.index

Out[23]: MultiIndex([( 'Furniture',           'Akron'),
            ( 'Furniture',      'Alexandria'),
            ( 'Furniture',           'Allen'),
            ( 'Furniture',       'Allentown'),
            ( 'Furniture',        'Amarillo'),
            ( 'Furniture',         'Anaheim'),
            ( 'Furniture',         'Andover'),
            ( 'Furniture',          'Apopka'),
            ( 'Furniture',    'Apple Valley'),
            ( 'Furniture',       'Arlington'),
            ...
            ('Technology',         'Visalia'),
            ('Technology',         'Warwick'),
            ('Technology', 'West Palm Beach'),
            ('Technology',     'Westminster'),
            ('Technology',        'Whittier'),
            ('Technology',         'Wichita'),
            ('Technology',      'Wilmington'),
            ('Technology',        'Woodland'),
            ('Technology',         'Yonkers'),
            ('Technology',            'Yuma')],
           names=['category', 'city'], length=987)

#### Multiple Aggregations
You can apply multiple aggregations at once using the `agg` function.

In [0]:
# Passing a list to agg applies each named aggregation to the column.
# The result is a Series indexed by the aggregation names.
df['sales'].agg(['sum', 'min', 'max', 'mean'])

Out[25]: sum     1.107593e+06
min     4.440000e-01
max     1.049997e+04
mean    2.211206e+02
Name: sales, dtype: float64

In [0]:
# For grouped data the same list form gives a DataFrame:
# one row per category and one column per aggregation.
df.groupby('category')['sales'].agg(['sum', 'min', 'max'])

Unnamed: 0_level_0,sum,min,max
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Furniture,628359.3048,1.892,4416.174
Office Supplies,298596.752,0.444,9892.74
Technology,180636.848,4.95,10499.97


In [0]:
# Same aggregations per (category, city) pair: the rows are indexed by both
# grouping columns, the columns are the aggregation names.
df.groupby(['category', 'city'])['sales'].agg(['sum', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,min,max
category,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Furniture,Akron,433.5960,149.232,284.364
Furniture,Alexandria,204.6400,12.420,192.220
Furniture,Allen,244.0060,244.006,244.006
Furniture,Allentown,11.6480,11.648,11.648
Furniture,Amarillo,3048.5828,23.076,2453.430
...,...,...,...,...
Technology,Wichita,224.7500,224.750,224.750
Technology,Wilmington,694.3560,302.376,391.980
Technology,Woodland,239.9840,239.984,239.984
Technology,Yonkers,216.4000,52.440,163.960


#### Aggregating Multiple Columns
You can also aggregate multiple columns with different functions.

In [0]:
# Syntax: pass a dict that maps each column name to one aggregation
# (a string) or to several aggregations (a list of strings).
# df.agg({
#     'column_1': 'sum',
#     'column_2': ['mean', 'min'],
#     'column_3': 'count'
# })

In [0]:
# Each column gets only the aggregations listed for it. The result has one
# row per aggregation name; combinations that were not requested
# (e.g. the mean of 'sales') are left empty (NaN) in the output.
df.agg({
    'sales': ['sum','count'],
    'profit': ['mean', 'min']
})

Unnamed: 0,sales,profit
sum,1107593.0,
count,5009.0,
mean,,21.032394
min,,-2929.4845


In [0]:
# By default the groupby column ('category') becomes the index of the result.
# The dict gives each column its own aggregation: total sales, maximum profit.
df.groupby('category').agg({'sales':'sum', 'profit':'max'})

Unnamed: 0_level_0,sales,profit
category,Unnamed: 1_level_1,Unnamed: 2_level_1
Furniture,628359.3048,1013.127
Office Supplies,298596.752,4946.37
Technology,180636.848,5039.9856


In [0]:
# With as_index=False the groupby column stays a regular column and the
# result gets a default integer index (0, 1, 2, ...) instead.
df.groupby('category', as_index=False).agg({'sales':'sum', 'profit':'max'})

Unnamed: 0,category,sales,profit
0,Furniture,628359.3048,1013.127
1,Office Supplies,298596.752,4946.37
2,Technology,180636.848,5039.9856


In [0]:
# as_index=False with two grouping keys: 'category' and 'city' both remain
# ordinary columns, so the result is a flat table rather than a MultiIndex.
df.groupby(['category','city'], as_index=False).agg({'sales':'sum'})

Unnamed: 0,category,city,sales
0,Furniture,Akron,433.5960
1,Furniture,Alexandria,204.6400
2,Furniture,Allen,244.0060
3,Furniture,Allentown,11.6480
4,Furniture,Amarillo,3048.5828
...,...,...,...
982,Technology,Wichita,224.7500
983,Technology,Wilmington,694.3560
984,Technology,Woodland,239.9840
985,Technology,Yonkers,216.4000


### Merge/Join Operation in Pandas
In Pandas, you can combine data from different DataFrames using merges or joins, which are similar to SQL joins.

**Types of Joins in Pandas:**
- `Inner Join`: Returns only the rows with matching keys in both DataFrames.
- `Left Join`: Returns all rows from the left DataFrame and the matched rows from the right DataFrame. Where a left row has no match, the columns that come from the right DataFrame are filled with NaN.
- `Right Join`: Returns all rows from the right DataFrame and the matched rows from the left DataFrame. Where a right row has no match, the columns that come from the left DataFrame are filled with NaN.
- `Outer Join`: Returns all rows from both DataFrames, combining them where the keys match. Where a row has no match on the other side, the other DataFrame's columns are filled with NaN.

**Basic Syntax**
- The `merge()` function is used to perform these joins.

**`pd.merge(left, right, how='type_of_join', on='column_name')`**


In [0]:
# Let's create the two DataFrames used in the join examples.

# df1: the orders data (the same file that was loaded into `df` earlier).
# It is passed as the left table in the merges below.
df1 = pd.read_csv('orders.txt')
df1

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.970,109.6113
1,CA-2018-100090,2018-07-08,Ed Braxton,San Francisco,Furniture,FUR-TA-10003715,502.488,-87.9354
2,CA-2018-100293,2018-03-14,Neil Französisch,Jacksonville,Office Supplies,OFF-PA-10000176,91.056,31.8696
3,CA-2018-100328,2018-01-28,Jasper Cacioppo,New York City,Office Supplies,OFF-BI-10000343,3.928,1.3257
4,CA-2018-100363,2018-04-08,Jim Mitchum,Glendale,Office Supplies,OFF-FA-10000611,2.368,0.8288
...,...,...,...,...,...,...,...,...
5004,US-2021-168802,2021-11-03,Jack O'Briant,Seattle,Office Supplies,OFF-BI-10002393,18.368,5.9696
5005,US-2021-169320,2021-07-23,Lena Hernandez,Elkhart,Office Supplies,OFF-AR-10003602,11.680,5.4896
5006,US-2021-169488,2021-09-07,Allen Armold,Providence,Office Supplies,OFF-PA-10000157,39.960,18.7812
5007,US-2021-169502,2021-08-28,Matthew Grinstein,Milwaukee,Office Supplies,OFF-AP-10001947,91.600,26.5640


In [0]:
# df2: the returns data (order_id plus return_reason).
# It is passed as the right table in the merges below.
# NOTE(review): unlike orders.txt, no visible cell copies returns.txt to the
# driver — confirm the file is present before running this notebook end to end.
df2 = pd.read_csv('returns.txt')
df2

Unnamed: 0,order_id,return_reason
0,CA-2020-104689,Wrong Items
1,CA-2020-105081,Wrong Items
2,CA-2020-105291,Wrong Items
3,CA-2020-105585,Wrong Items
4,CA-2020-106950,Wrong Items
...,...,...
291,US-2021-136679,Others
292,US-2021-147886,Others
293,US-2021-147998,Wrong Items
294,US-2021-151127,Wrong Items


In [0]:
# When `how` is omitted, pandas performs an inner join: only the orders whose
# order_id also appears in df2 are kept.
pd.merge(df1, df2, on='order_id')

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100762,2018-11-24,Nat Gilpin,Jackson,Office Supplies,OFF-AR-10000380,151.920,45.5760,Bad Quality
1,CA-2018-100867,2018-10-19,Eugene Hildebrand,Lakewood,Technology,TEC-PH-10004922,321.552,20.0970,Bad Quality
2,CA-2018-102652,2018-04-06,Andy Yotov,Los Angeles,Furniture,FUR-FU-10000747,91.960,15.6332,Bad Quality
3,CA-2018-103373,2018-05-18,Bruce Stewart,Cleveland,Technology,TEC-PH-10002885,779.796,-168.9558,Bad Quality
4,CA-2018-103744,2018-02-23,Michael Grace,El Paso,Office Supplies,OFF-BI-10000320,4.428,-6.8634,Bad Quality
...,...,...,...,...,...,...,...,...,...
291,US-2021-136679,2021-11-14,Xylona Preis,Pasadena,Office Supplies,OFF-AR-10003582,45.040,4.5040,Others
292,US-2021-147886,2021-03-28,Dave Hallsten,Fairfield,Furniture,FUR-FU-10001095,26.480,10.0624,Others
293,US-2021-147998,2021-05-19,Sue Ann Reed,San Jose,Office Supplies,OFF-BI-10002082,133.120,49.9200,Wrong Items
294,US-2021-151127,2021-05-22,Rob Lucas,Los Angeles,Office Supplies,OFF-AR-10002445,49.560,18.8328,Wrong Items


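**If the columns to join on have different names in the two DataFrames, use `left_on` and `right_on`.** (In the example below both key columns happen to be named `order_id`, so the result is the same as with `on='order_id'`.)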

In [0]:
# left_on names the key column in df1 and right_on the key column in df2.
# Both frames happen to call the key 'order_id', so this produces the same
# result as on='order_id'; the two parameters only differ in effect when the
# column names differ.
pd.merge(df1, df2, how='inner', left_on='order_id', right_on='order_id')

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100762,2018-11-24,Nat Gilpin,Jackson,Office Supplies,OFF-AR-10000380,151.920,45.5760,Bad Quality
1,CA-2018-100867,2018-10-19,Eugene Hildebrand,Lakewood,Technology,TEC-PH-10004922,321.552,20.0970,Bad Quality
2,CA-2018-102652,2018-04-06,Andy Yotov,Los Angeles,Furniture,FUR-FU-10000747,91.960,15.6332,Bad Quality
3,CA-2018-103373,2018-05-18,Bruce Stewart,Cleveland,Technology,TEC-PH-10002885,779.796,-168.9558,Bad Quality
4,CA-2018-103744,2018-02-23,Michael Grace,El Paso,Office Supplies,OFF-BI-10000320,4.428,-6.8634,Bad Quality
...,...,...,...,...,...,...,...,...,...
291,US-2021-136679,2021-11-14,Xylona Preis,Pasadena,Office Supplies,OFF-AR-10003582,45.040,4.5040,Others
292,US-2021-147886,2021-03-28,Dave Hallsten,Fairfield,Furniture,FUR-FU-10001095,26.480,10.0624,Others
293,US-2021-147998,2021-05-19,Sue Ann Reed,San Jose,Office Supplies,OFF-BI-10002082,133.120,49.9200,Wrong Items
294,US-2021-151127,2021-05-22,Rob Lucas,Los Angeles,Office Supplies,OFF-AR-10002445,49.560,18.8328,Wrong Items


In [0]:
# Another syntax: `left` and `right` can be passed as keyword arguments, in
# any order. The commented-out calls are equivalent ways to write the inner join:
# pd.merge(left=df1, right=df2, how='inner', left_on='order_id', right_on='order_id')
# pd.merge(left=df1, right=df2, left_on='order_id', right_on='order_id')
# pd.merge(left=df1, right=df2, on='order_id')
# pd.merge(left=df1, right=df2, how='inner', on='order_id')
# The call that actually runs is a LEFT join (how='left'): df1 is still the
# left table even though `right=` is written first.
pd.merge(right=df2, left=df1, how='left', on='order_id')

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.970,109.6113,
1,CA-2018-100090,2018-07-08,Ed Braxton,San Francisco,Furniture,FUR-TA-10003715,502.488,-87.9354,
2,CA-2018-100293,2018-03-14,Neil Französisch,Jacksonville,Office Supplies,OFF-PA-10000176,91.056,31.8696,
3,CA-2018-100328,2018-01-28,Jasper Cacioppo,New York City,Office Supplies,OFF-BI-10000343,3.928,1.3257,
4,CA-2018-100363,2018-04-08,Jim Mitchum,Glendale,Office Supplies,OFF-FA-10000611,2.368,0.8288,
...,...,...,...,...,...,...,...,...,...
5004,US-2021-168802,2021-11-03,Jack O'Briant,Seattle,Office Supplies,OFF-BI-10002393,18.368,5.9696,
5005,US-2021-169320,2021-07-23,Lena Hernandez,Elkhart,Office Supplies,OFF-AR-10003602,11.680,5.4896,
5006,US-2021-169488,2021-09-07,Allen Armold,Providence,Office Supplies,OFF-PA-10000157,39.960,18.7812,
5007,US-2021-169502,2021-08-28,Matthew Grinstein,Milwaukee,Office Supplies,OFF-AP-10001947,91.600,26.5640,


##### Inner Join
This will only return rows where the key column matches in both DataFrames.

In [0]:
# Inner join: keeps only the orders whose order_id is present in both frames,
# so every row of the result has a return_reason.
inner_df = pd.merge(df1, df2, how='inner', on='order_id')
inner_df

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100762,2018-11-24,Nat Gilpin,Jackson,Office Supplies,OFF-AR-10000380,151.920,45.5760,Bad Quality
1,CA-2018-100867,2018-10-19,Eugene Hildebrand,Lakewood,Technology,TEC-PH-10004922,321.552,20.0970,Bad Quality
2,CA-2018-102652,2018-04-06,Andy Yotov,Los Angeles,Furniture,FUR-FU-10000747,91.960,15.6332,Bad Quality
3,CA-2018-103373,2018-05-18,Bruce Stewart,Cleveland,Technology,TEC-PH-10002885,779.796,-168.9558,Bad Quality
4,CA-2018-103744,2018-02-23,Michael Grace,El Paso,Office Supplies,OFF-BI-10000320,4.428,-6.8634,Bad Quality
...,...,...,...,...,...,...,...,...,...
291,US-2021-136679,2021-11-14,Xylona Preis,Pasadena,Office Supplies,OFF-AR-10003582,45.040,4.5040,Others
292,US-2021-147886,2021-03-28,Dave Hallsten,Fairfield,Furniture,FUR-FU-10001095,26.480,10.0624,Others
293,US-2021-147998,2021-05-19,Sue Ann Reed,San Jose,Office Supplies,OFF-BI-10002082,133.120,49.9200,Wrong Items
294,US-2021-151127,2021-05-22,Rob Lucas,Los Angeles,Office Supplies,OFF-AR-10002445,49.560,18.8328,Wrong Items


##### Left join
This returns all rows from df1 and the matched rows from df2. Rows of df1 with no match in df2 get NaN in the columns that come from df2 (here `return_reason`).

In [0]:
# Left join: keeps every order from df1. Orders that have no row in df2
# get NaN in return_reason (shown as empty cells in the output).
left_df = pd.merge(df1, df2, how='left', on='order_id')
left_df

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.970,109.6113,
1,CA-2018-100090,2018-07-08,Ed Braxton,San Francisco,Furniture,FUR-TA-10003715,502.488,-87.9354,
2,CA-2018-100293,2018-03-14,Neil Französisch,Jacksonville,Office Supplies,OFF-PA-10000176,91.056,31.8696,
3,CA-2018-100328,2018-01-28,Jasper Cacioppo,New York City,Office Supplies,OFF-BI-10000343,3.928,1.3257,
4,CA-2018-100363,2018-04-08,Jim Mitchum,Glendale,Office Supplies,OFF-FA-10000611,2.368,0.8288,
...,...,...,...,...,...,...,...,...,...
5004,US-2021-168802,2021-11-03,Jack O'Briant,Seattle,Office Supplies,OFF-BI-10002393,18.368,5.9696,
5005,US-2021-169320,2021-07-23,Lena Hernandez,Elkhart,Office Supplies,OFF-AR-10003602,11.680,5.4896,
5006,US-2021-169488,2021-09-07,Allen Armold,Providence,Office Supplies,OFF-PA-10000157,39.960,18.7812,
5007,US-2021-169502,2021-08-28,Matthew Grinstein,Milwaukee,Office Supplies,OFF-AP-10001947,91.600,26.5640,


##### Right Join
This returns all rows from df2 and the matched rows from df1. Rows of df2 with no match in df1 get NaN in the columns that come from df1.

In [0]:
# Right join: keeps every row from df2 and attaches the matching order
# details from df1. The output starts with df2's first order_id, i.e. the
# rows follow the right table.
right_df = pd.merge(df1, df2, how='right', on='order_id')
right_df

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2020-104689,2020-12-01,Fred Hopkins,Los Angeles,Office Supplies,OFF-AR-10001149,23.040,6.9120,Wrong Items
1,CA-2020-105081,2020-12-25,Joe Elijah,Seattle,Furniture,FUR-CH-10000847,698.352,52.3764,Wrong Items
2,CA-2020-105291,2020-10-30,Susan Pistek,San Luis Obispo,Office Supplies,OFF-FA-10003059,3.620,1.1946,Wrong Items
3,CA-2020-105585,2020-08-26,Roland Fjeld,San Jose,Office Supplies,OFF-FA-10002983,10.230,4.9104,Wrong Items
4,CA-2020-106950,2020-09-02,Joe Elijah,Charlotte,Furniture,FUR-TA-10001768,472.518,-149.6307,Wrong Items
...,...,...,...,...,...,...,...,...,...
291,US-2021-136679,2021-11-14,Xylona Preis,Pasadena,Office Supplies,OFF-AR-10003582,45.040,4.5040,Others
292,US-2021-147886,2021-03-28,Dave Hallsten,Fairfield,Furniture,FUR-FU-10001095,26.480,10.0624,Others
293,US-2021-147998,2021-05-19,Sue Ann Reed,San Jose,Office Supplies,OFF-BI-10002082,133.120,49.9200,Wrong Items
294,US-2021-151127,2021-05-22,Rob Lucas,Los Angeles,Office Supplies,OFF-AR-10002445,49.560,18.8328,Wrong Items


##### Outer Join
This returns all rows from both DataFrames. Rows without a match in the other DataFrame get NaN in that DataFrame's columns.

In [0]:
# Outer join: keeps every order_id found in either frame. Columns that come
# from the frame without a matching row are NaN (here: return_reason for
# orders that were not returned).
outer_df = pd.merge(df1, df2, how='outer', on='order_id')
outer_df

Unnamed: 0,order_id,order_date,customer_name,city,category,product_id,sales,profit,return_reason
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.970,109.6113,
1,CA-2018-100090,2018-07-08,Ed Braxton,San Francisco,Furniture,FUR-TA-10003715,502.488,-87.9354,
2,CA-2018-100293,2018-03-14,Neil Französisch,Jacksonville,Office Supplies,OFF-PA-10000176,91.056,31.8696,
3,CA-2018-100328,2018-01-28,Jasper Cacioppo,New York City,Office Supplies,OFF-BI-10000343,3.928,1.3257,
4,CA-2018-100363,2018-04-08,Jim Mitchum,Glendale,Office Supplies,OFF-FA-10000611,2.368,0.8288,
...,...,...,...,...,...,...,...,...,...
5004,US-2021-168802,2021-11-03,Jack O'Briant,Seattle,Office Supplies,OFF-BI-10002393,18.368,5.9696,
5005,US-2021-169320,2021-07-23,Lena Hernandez,Elkhart,Office Supplies,OFF-AR-10003602,11.680,5.4896,
5006,US-2021-169488,2021-09-07,Allen Armold,Providence,Office Supplies,OFF-PA-10000157,39.960,18.7812,
5007,US-2021-169502,2021-08-28,Matthew Grinstein,Milwaukee,Office Supplies,OFF-AP-10001947,91.600,26.5640,


##### Cross join
Every record in the first DataFrame is paired with every record in the second DataFrame, so the result has `len(df1) * len(df2)` rows.

In [0]:
# Cross join: no key is used; every row of df1 is paired with every row of
# df2, so the result has len(df1) * len(df2) rows (about 1.48 million here).
# Because 'order_id' exists in both frames, pandas keeps both copies and
# suffixes them: order_id_x (from df1) and order_id_y (from df2).
cross_df = pd.merge(df1, df2, how='cross')
cross_df

Unnamed: 0,order_id_x,order_date,customer_name,city,category,product_id,sales,profit,order_id_y,return_reason
0,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.97,109.6113,CA-2020-104689,Wrong Items
1,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.97,109.6113,CA-2020-105081,Wrong Items
2,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.97,109.6113,CA-2020-105291,Wrong Items
3,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.97,109.6113,CA-2020-105585,Wrong Items
4,CA-2018-100006,2018-09-07,Dennis Kane,New York City,Technology,TEC-PH-10002075,377.97,109.6113,CA-2020-106950,Wrong Items
...,...,...,...,...,...,...,...,...,...,...
1482659,US-2021-169551,2021-07-07,Rob Lucas,Philadelphia,Furniture,FUR-BO-10001519,87.21,-45.3492,US-2021-136679,Others
1482660,US-2021-169551,2021-07-07,Rob Lucas,Philadelphia,Furniture,FUR-BO-10001519,87.21,-45.3492,US-2021-147886,Others
1482661,US-2021-169551,2021-07-07,Rob Lucas,Philadelphia,Furniture,FUR-BO-10001519,87.21,-45.3492,US-2021-147998,Wrong Items
1482662,US-2021-169551,2021-07-07,Rob Lucas,Philadelphia,Furniture,FUR-BO-10001519,87.21,-45.3492,US-2021-151127,Wrong Items


##### Joining on Multiple Columns
If you want to join on multiple columns, you can pass a list to the on parameter.

In [0]:
# Syntax: pass a list of column names to `on`; rows match only when every
# listed column is equal in both frames.
# result = pd.merge(df1, df2, how='inner', on=['column1', 'column2'])

In [0]:
# df1 and df2 share no column other than order_id, so a multi-column join
# cannot be demonstrated on this data. With differently named key columns the
# call would look like this:
# pd.merge(left=df1, right=df2, how='inner', left_on=['column1', 'column2'], right_on=['column1', 'column2'])