In [1]:
import pandas as pd
import numpy as np

# Fix the legacy global NumPy seed so the synthetic data is identical on every run.
np.random.seed(42)

# Build the columns one at a time. The order of the draws matters: every call
# consumes the same global random stream.
data = {}
data['Region'] = np.random.choice(['North', 'South', 'East', 'West'], size=50)
data['Category'] = np.random.choice(['Electronics', 'Furniture', 'Clothing', 'Toys'], size=50)
data['SubCategory'] = np.random.choice(['A', 'B', 'C'], size=50)
data['Sales'] = np.random.randint(100, 1000, size=50)
data['Profit'] = np.random.randint(-200, 500, size=50)
data['Discount'] = np.random.uniform(0, 0.5, size=50)
data['CustomerID'] = np.random.randint(1000, 1100, size=50)

# Assemble the 50-row sales table.
df = pd.DataFrame(data)

# Preview the first five rows (head() defaults to 5).
df.head()


Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
0,East,Clothing,C,827,377,0.49887,1098
1,West,Furniture,B,904,481,0.133391,1018
2,North,Clothing,B,198,356,0.488307,1047
3,East,Toys,B,783,373,0.205519,1079
4,East,Clothing,B,971,445,0.016525,1002


# .idxmax()   .tolist()   .nlargest()

In [2]:
# Customer with the single highest sale overall:
# idxmax() gives the row label of the largest 'Sales' value, and .loc then
# pulls that row's customer id and sales amount.
row = df.loc[:, 'Sales'].idxmax()
df.loc[row, ['CustomerID', 'Sales']]

CustomerID    1020
Sales          999
Name: 33, dtype: object

In [3]:
# Customer with the highest sale in each region.
# groupby(...).idxmax() returns one row label per region. Converting it to a
# list is optional here: .loc selects the same rows either way.
best_row_per_region = df.groupby('Region')['Sales'].idxmax()
rows = best_row_per_region.to_list()
df.loc[rows, 'CustomerID']

4     1002
15    1098
33    1020
24    1034
Name: CustomerID, dtype: int32

In [4]:
# Highest profit per customer in each category and sub-category

In [5]:
# For every (Category, SubCategory) pair, find the row with the largest profit
# and show those rows in full.
grouping = ['Category', 'SubCategory']
rows = df.groupby(grouping)['Profit'].idxmax().to_list()
df.loc[rows]

Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
28,West,Clothing,A,963,476,0.404751,1099
4,East,Clothing,B,971,445,0.016525,1002
34,South,Clothing,C,833,471,0.418855,1047
13,East,Electronics,A,317,272,0.184827,1037
49,South,Electronics,B,895,424,0.094354,1053
32,West,Electronics,C,195,463,0.198786,1013
20,South,Furniture,A,306,432,0.399173,1017
1,West,Furniture,B,904,481,0.133391,1018
48,South,Furniture,C,185,412,0.441318,1050
38,North,Toys,A,848,491,0.270724,1066


In [6]:
# nlargest: the three rows with the highest 'Profit', largest first.
df.nlargest(n=3, columns='Profit')

Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
38,North,Toys,A,848,491,0.270724,1066
31,North,Toys,C,663,483,0.470262,1032
1,West,Furniture,B,904,481,0.133391,1018


In [7]:
# For each region find top 3 customers by sales
def largest(ds, n=3, by='Sales'):
    """Return the top ``n`` rows of ``ds`` ranked by column ``by``, largest first.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame (for example one groupby group) that contains a 'CustomerID'
        column and the ``by`` column.
    n : int, default 3
        Number of rows to keep.
    by : str, default 'Sales'
        Column used for ranking.

    Returns
    -------
    pandas.DataFrame
        Columns ['CustomerID', by], with the original row labels preserved.
    """
    # Avoid a duplicated column when ranking by the id column itself.
    columns = ['CustomerID'] if by == 'CustomerID' else ['CustomerID', by]
    return ds.nlargest(n, by)[columns]
# Select the needed columns before .apply so the function never operates on the
# grouping column (recent pandas versions warn about that). Then drop the inner
# index level by position instead of relying on the auto-generated 'level_1'
# column name, which only appears when the original index is unnamed.
(
    df.groupby('Region')[['CustomerID', 'Sales']]
    .apply(largest)
    .reset_index(level=1, drop=True)   # discard the original row labels
    .reset_index()                     # turn 'Region' back into a column
)

Unnamed: 0,Region,CustomerID,Sales
0,East,1002,971
1,East,1071,868
2,East,1098,827
3,North,1098,866
4,North,1066,848
5,North,1053,838
6,South,1020,999
7,South,1065,957
8,South,1053,895
9,West,1034,991


# String functions pandas

In [8]:
import pandas as pd
import numpy as np

# Synthetic customer-order table (50 rows) with text columns.
# Every column is drawn independently from its own pool, so City, State and
# PhoneNumber on a given row are not guaranteed to correspond to one another.
np.random.seed(42)

# Keyword arguments are evaluated left to right, so the random draws happen
# in exactly this column order.
data = dict(
    CustomerName=np.random.choice(['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'David Lee', 'Emma Wilson'], size=50),
    Email=np.random.choice(['alice@gmail.com', 'bob@yahoo.com', 'charlie@outlook.com', 'david@company.com', 'emma@business.net'], size=50),
    Product=np.random.choice(['Laptop', 'Smartphone', 'Tablet', 'Smartwatch'], size=50),
    OrderID=np.random.randint(10000, 99999, size=50).astype(str),
    City=np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'San Francisco'], size=50),
    State=np.random.choice(['NY', 'CA', 'IL', 'TX', 'CA'], size=50),
    PhoneNumber=np.random.choice(['+1-212-555-0192', '+1-310-555-2048', '+1-312-555-3412', '+1-713-555-7289', '+1-415-555-6192'], size=50),
)

df = pd.DataFrame(data)

# Preview the first six rows.
df.head(n=6)


Unnamed: 0,CustomerName,Email,Product,OrderID,City,State,PhoneNumber
0,David Lee,david@company.com,Smartwatch,90356,Los Angeles,NY,+1-312-555-3412
1,Emma Wilson,david@company.com,Tablet,81910,Chicago,CA,+1-415-555-6192
2,Charlie Brown,david@company.com,Laptop,66044,New York,CA,+1-312-555-3412
3,Emma Wilson,emma@business.net,Smartwatch,77214,San Francisco,IL,+1-312-555-3412
4,Emma Wilson,charlie@outlook.com,Smartwatch,43827,New York,TX,+1-312-555-3412
5,Bob Smith,alice@gmail.com,Smartphone,65820,New York,CA,+1-310-555-2048


In [9]:
# Question:
# extract the user name (the text before '@') from every e-mail address.
email_parts = df['Email'].str.split('@')
email_parts.str.get(0)

0       david
1       david
2       david
3        emma
4     charlie
5       alice
6       david
7         bob
8       david
9         bob
10        bob
11      david
12       emma
13        bob
14        bob
15      david
16        bob
17        bob
18      david
19      david
20      alice
21       emma
22       emma
23        bob
24       emma
25        bob
26      alice
27      david
28      david
29      david
30       emma
31      alice
32       emma
33       emma
34      alice
35      alice
36      alice
37      alice
38      david
39    charlie
40    charlie
41      alice
42    charlie
43    charlie
44      alice
45    charlie
46       emma
47        bob
48        bob
49      alice
Name: Email, dtype: object

In [10]:
# Single-string illustration: the text before the first '@' is the user name.
x = 'alice@gmail.com'
x.partition('@')[0]

'alice'

In [11]:
# Find the company name (first label of the domain) from an e-mail id.
# Single-string walk-through; its value is not displayed because it is not the
# last expression of the cell.
email = 'prajwalwadhai55@gmail.com'
domain = email.split('@')[1]
domain.split('.')[0]

# The same two splits applied to the whole column through the .str accessor.
domains = df['Email'].str.split('@').str.get(1)
domains.str.split('.').str.get(0)

0      company
1      company
2      company
3     business
4      outlook
5        gmail
6      company
7        yahoo
8      company
9        yahoo
10       yahoo
11     company
12    business
13       yahoo
14       yahoo
15     company
16       yahoo
17       yahoo
18     company
19     company
20       gmail
21    business
22    business
23       yahoo
24    business
25       yahoo
26       gmail
27     company
28     company
29     company
30    business
31       gmail
32    business
33    business
34       gmail
35       gmail
36       gmail
37       gmail
38     company
39     outlook
40     outlook
41       gmail
42     outlook
43     outlook
44       gmail
45     outlook
46    business
47       yahoo
48       yahoo
49       gmail
Name: Email, dtype: object

In [12]:
# User name (text before '@') of every e-mail address, converted to upper case.
user_names = df['Email'].str.split('@').str.get(0)
user_names.str.upper()

0       DAVID
1       DAVID
2       DAVID
3        EMMA
4     CHARLIE
5       ALICE
6       DAVID
7         BOB
8       DAVID
9         BOB
10        BOB
11      DAVID
12       EMMA
13        BOB
14        BOB
15      DAVID
16        BOB
17        BOB
18      DAVID
19      DAVID
20      ALICE
21       EMMA
22       EMMA
23        BOB
24       EMMA
25        BOB
26      ALICE
27      DAVID
28      DAVID
29      DAVID
30       EMMA
31      ALICE
32       EMMA
33       EMMA
34      ALICE
35      ALICE
36      ALICE
37      ALICE
38      DAVID
39    CHARLIE
40    CHARLIE
41      ALICE
42    CHARLIE
43    CHARLIE
44      ALICE
45    CHARLIE
46       EMMA
47        BOB
48        BOB
49      ALICE
Name: Email, dtype: object

In [13]:
# Check whether the first name in CustomerName equals the user name in Email.
# Both sides are lower-cased so the comparison is case-insensitive, and the name
# is split on any whitespace so leading or repeated spaces cannot yield an empty
# first token.
x = df['CustomerName'].str.split().str[0].str.lower()
y = df['Email'].str.split('@').str[0].str.lower()
(x == y).sum()      # True counts as 1, so the sum is the number of matching rows

10

# Pivot :
## pd.pivot_table(df, index = 'Region' , columns = 'Category' , values = 'Sales' , aggfunc = 'sum')

In [14]:
import pandas as pd
import numpy as np

# Sample sales dataset: 50 rows of Region, Category, Sales, Profit and Discount,
# drawn from the seeded global NumPy random state so every run is identical.
np.random.seed(42)

data = {
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=50),
    'Category': np.random.choice(['Electronics', 'Furniture', 'Clothing', 'Toys'], size=50),
    'Sales': np.random.randint(100, 1000, size=50),
    'Profit': np.random.randint(-200, 500, size=50),
    'Discount': np.random.uniform(0, 0.5, size=50)
}

df = pd.DataFrame(data)

# Preview the first ten rows instead of rendering the whole 50-row frame.
df.head(10)


Unnamed: 0,Region,Category,Sales,Profit,Discount
0,East,Clothing,802,289,0.093285
1,West,Furniture,501,30,0.446279
2,North,Clothing,829,-160,0.269671
3,East,Toys,655,-173,0.40372
4,East,Clothing,261,-66,0.448046
5,West,Toys,301,0,0.159002
6,North,Toys,369,-168,0.055026
7,North,Electronics,962,-153,0.113968
8,East,Clothing,915,302,0.213554
9,South,Electronics,370,206,0.409007


In [15]:
# Total sales per (Region, Category) with a plain groupby -- the long-format
# way of getting these totals without a pivot table.
df.groupby(by=['Region', 'Category'])['Sales'].agg('sum')

Region  Category   
East    Clothing       3094
        Electronics    1177
        Furniture      1008
        Toys           1443
North   Clothing       1563
        Electronics    1190
        Furniture       547
        Toys           2028
South   Clothing        938
        Electronics    2133
        Furniture      2341
        Toys           2027
West    Clothing        941
        Electronics    1846
        Furniture      3735
        Toys           2229
Name: Sales, dtype: int32

In [16]:
# Total sales laid out as a grid: one row per Region, one column per Category.
df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,3094,1177,1008,1443
North,1563,1190,547,2028
South,938,2133,2341,2027
West,941,1846,3735,2229


In [17]:
# Grand total of the 'Sales' column.
df.loc[:, 'Sales'].sum()

28240

In [18]:
# % of grand total: every Region x Category cell divided by the overall sales sum.
pivot_sales = df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')
grand_total = df['Sales'].sum()
pivot_sales.div(grand_total)   # a scalar divisor is applied to every cell

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,0.109561,0.041678,0.035694,0.051098
North,0.055347,0.042139,0.01937,0.071813
South,0.033215,0.075531,0.082897,0.071778
West,0.033322,0.065368,0.132259,0.078931


In [19]:
# % of row total
pivot_sales = df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')
row_totals = pivot_sales.sum(axis=1)        # one total per Region
# Row shares: each row divided by its own total. NOTE: this result is not shown,
# because only the last expression of a cell is displayed.
pivot_sales.div(row_totals, axis=0)
# Displayed output: the per-Region totals used as the divisor.
row_totals

Region
East     6722
North    5328
South    7439
West     8751
dtype: int64

In [20]:
# % of column total: each cell as a share of its Category column's total.
pivot_sales = df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')
column_totals = pivot_sales.sum(axis='index')       # one total per Category
pivot_sales.div(column_totals, axis='columns')

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,0.473378,0.185471,0.132093,0.186748
North,0.239137,0.18752,0.071681,0.262456
South,0.143513,0.336117,0.306775,0.262327
West,0.143972,0.290892,0.489451,0.288469


In [22]:
# Small example frame: four rows, two integer columns 'A' and 'B'.
dff = pd.DataFrame(
    [[10, 30], [20, 60], [30, 20], [40, 50]],
    columns=['A', 'B'],
)
dff

Unnamed: 0,A,B
0,10,30
1,20,60
2,30,20
3,40,50


In [23]:
# Row-wise division by a Series: with axis='index' the divisor's labels (0..3)
# are matched against dff's row labels, so row i is divided by ds[i].
ds = pd.Series([2, 3, 4, 5])
dff.div(ds, axis='index')

Unnamed: 0,A,B
0,5.0,15.0
1,6.666667,20.0
2,7.5,5.0
3,8.0,10.0


In [24]:
# Row-wise division by a shorter Series: rows are matched by index label, and
# row 3 has no counterpart in ds, so that row comes out as NaN.
ds = pd.Series([2, 3, 4])
dff.div(ds, axis='index')

Unnamed: 0,A,B
0,5.0,15.0
1,6.666667,20.0
2,7.5,5.0
3,,


In [25]:
# Row-wise division by a plain list: a list carries no index to align on, so its
# length must equal the number of rows. Three values for four rows raises a
# ValueError; it is caught and printed so the notebook still runs top to bottom.
ds = [2, 3, 4]
try:
    dff.div(ds, axis=0)
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Unable to coerce to Series, length must be 4: given 3

In [26]:
# Column-wise division by a Series whose labels (0, 1) do not match dff's column
# names ('A', 'B'): nothing lines up, so the result holds the union of the labels
# and every value is NaN.
ds = pd.Series([2, 4])
dff.div(ds, axis='columns')

Unnamed: 0,A,B,0,1
0,,,,
1,,,,
2,,,,
3,,,,


In [27]:
# Column-wise division by a Series labelled with dff's column names:
# column 'A' is divided by 2 and column 'B' by 4.
ds = pd.Series({'A': 2, 'B': 4})
dff.div(ds, axis='columns')


Unnamed: 0,A,B
0,5.0,7.5
1,10.0,15.0
2,15.0,5.0
3,20.0,12.5


In [28]:
# Column-wise division by a plain list: the values are applied by position,
# which works because the list length equals the number of columns (2).
ds = [2, 4]
dff.div(ds, axis='columns')


Unnamed: 0,A,B
0,5.0,7.5
1,10.0,15.0
2,15.0,5.0
3,20.0,12.5


In [29]:
import pandas as pd
import numpy as np

# Recreate the 50-row sales dataset that includes SubCategory and CustomerID
# (df was reassigned to other example frames in earlier cells).
# Seeding the global NumPy random state makes the data identical on every run.
np.random.seed(42)

# Creating the dataset -- the column order below is also the order of the draws.
data = {
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=50),
    'Category': np.random.choice(['Electronics', 'Furniture', 'Clothing', 'Toys'], size=50),
    'SubCategory': np.random.choice(['A', 'B', 'C'], size=50),
    'Sales': np.random.randint(100, 1000, size=50),
    'Profit': np.random.randint(-200, 500, size=50),
    'Discount': np.random.uniform(0, 0.5, size=50),
    'CustomerID': np.random.randint(1000, 1100, size=50)
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Preview the first ten rows instead of rendering the whole 50-row frame.
df.head(10)


Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
0,East,Clothing,C,827,377,0.49887,1098
1,West,Furniture,B,904,481,0.133391,1018
2,North,Clothing,B,198,356,0.488307,1047
3,East,Toys,B,783,373,0.205519,1079
4,East,Clothing,B,971,445,0.016525,1002
5,West,Toys,B,825,-173,0.172536,1019
6,North,Toys,B,646,419,0.317176,1023
7,North,Electronics,C,838,355,0.340353,1053
8,East,Clothing,C,712,139,0.265467,1032
9,South,Electronics,B,561,130,0.223892,1023


In [30]:
# Total sales for every Region (rows) x Category (columns) combination.
df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,4120,421,1246,2035
North,1122,1168,980,2157
South,833,2413,1186,2463
West,1565,2156,4682,3058


In [35]:
# column wise %: each cell as a share of its Category's column total
# (sum(axis=0) adds down the rows, giving one total per column).
pivot_table = df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')
column_totals = pivot_table.sum(axis=0)          # one total per Category
pivot_table.div(column_totals, axis=1)           # divisor labels are matched to the column labels

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,0.539267,0.068366,0.153941,0.209513
North,0.146859,0.189672,0.121077,0.222074
South,0.109031,0.391848,0.146528,0.253578
West,0.204843,0.350114,0.578453,0.314836


In [41]:
# row wise %: each cell as a share of its Region's row total
# (sum(axis = 1) adds across the columns, giving one total per row).
pivot_table = df.pivot_table(values='Sales', index='Region', columns='Category', aggfunc='sum')
row_wise_sum = pivot_table.sum(axis = 1)
# axis = 0 matches the divisor's labels to the row (Region) labels.
pivot_table.div(row_wise_sum, axis = 0)

Category,Clothing,Electronics,Furniture,Toys
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
East,0.52672,0.053823,0.159294,0.260164
North,0.206744,0.21522,0.180579,0.397457
South,0.120812,0.349964,0.172009,0.357215
West,0.13655,0.188116,0.408516,0.266818


In [46]:
# Number of rows for each CustomerID.
S = df.loc[:, 'CustomerID'].value_counts()

In [48]:
# Keep only the customer ids that occur more than once.
S.loc[S.gt(1)]

CustomerID
1032    4
1098    3
1047    3
1053    3
1079    2
1019    2
1023    2
1037    2
Name: count, dtype: int64

In [52]:
# Manually overwrite the CustomerID stored at row label 23 (modifies df in place).
df.at[23, 'CustomerID'] = 1006

In [53]:
# Customer ids that appear on more than one row, with their occurrence counts.
S = df.loc[:, 'CustomerID'].value_counts()
S.loc[S.gt(1)]

CustomerID
1032    4
1098    3
1047    3
1079    2
1006    2
1019    2
1023    2
1053    2
1037    2
Name: count, dtype: int64

In [54]:
# Boolean mask that is True for every repeat of a customer id except its first
# occurrence; the mask then selects those repeated entries.
lolo = df.loc[:, 'CustomerID'].duplicated(keep='first')
df['CustomerID'].loc[lolo]


9     1023
15    1098
17    1098
25    1079
29    1032
31    1032
34    1047
35    1019
37    1006
40    1032
41    1047
47    1037
49    1053
Name: CustomerID, dtype: int32

In [55]:
# There are many functions in pandas.
# You can't memorize all of them.
# But if you're good with the fundamentals - indexing, apply, groupby, groupby + apply

# Eventually you'll be doing one of the following things:
    # Using the output of a function as an index
    # Indexing on top of the output of a function
    # Applying a function using .apply
    # Applying a function within a group using agg or apply
    # Applying another function to the output of that function
    

    
    
# Indexing - Indexing with Series, Boolean Indexing with Series
            # - Indexing with DataFrame, Boolean Indexing with DataFrame
# Arithmetic operations - between Series and Series, Series and lists, DataFrames and Series, DataFrames and lists
# Overwriting - how to write to a Series or a subset of a Series




# apply functions - 
 # Apply with Data Series, Apply with Dataframes

# Groupby 
    # Groupby with agg, groupby with apply, transform

