In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global random stream so the synthetic data is the same on every run.
np.random.seed(42)

# Build the columns one at a time.
# NOTE: every call below consumes the shared random stream, so the columns must be
# generated in exactly this order to reproduce the same 50-row table.
data = {}
data['Region'] = np.random.choice(['North', 'South', 'East', 'West'], size=50)
data['Category'] = np.random.choice(['Electronics', 'Furniture', 'Clothing', 'Toys'], size=50)
data['SubCategory'] = np.random.choice(['A', 'B', 'C'], size=50)
data['Sales'] = np.random.randint(100, 1000, size=50)        # upper bound is exclusive
data['Profit'] = np.random.randint(-200, 500, size=50)       # negative values are losses
data['Discount'] = np.random.uniform(0, 0.5, size=50)
data['CustomerID'] = np.random.randint(1000, 1100, size=50)  # IDs may repeat across rows

# Wrap the column dict in a DataFrame.
df = pd.DataFrame(data)

# Preview the first five rows (head() defaults to 5).
df.head()


# Row lookups with .idxmax(), .tolist() and .nlargest()

In [27]:
# Find the customer ID with the highest sales overall.
# idxmax() returns the index label of the row holding the largest Sales value.
row = df.Sales.idxmax()
# Pull that row, then keep only the two fields of interest.
df.loc[row].loc[['CustomerID', 'Sales']]

CustomerID    1020
Sales          999
Name: 33, dtype: object

In [36]:
# Customer with the highest sales in each region.
# Per-group idxmax() yields the row label of each region's top sale.
# .tolist() is optional here: .loc returns the same rows with or without it.
rows = df.groupby('Region').Sales.idxmax().tolist()
df['CustomerID'].loc[rows]

4     1002
15    1098
33    1020
24    1034
Name: CustomerID, dtype: int32

In [None]:
# Find the row with the highest profit in each Category / SubCategory combination

In [48]:
# Row label of the maximum Profit within every (Category, SubCategory) group.
rows = df.groupby(['Category', 'SubCategory']).Profit.idxmax().to_list()
# Show the full rows for those labels.
df.loc[rows, :]

Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
28,West,Clothing,A,963,476,0.404751,1099
4,East,Clothing,B,971,445,0.016525,1002
34,South,Clothing,C,833,471,0.418855,1047
13,East,Electronics,A,317,272,0.184827,1037
49,South,Electronics,B,895,424,0.094354,1053
32,West,Electronics,C,195,463,0.198786,1013
20,South,Furniture,A,306,432,0.399173,1017
1,West,Furniture,B,904,481,0.133391,1018
48,South,Furniture,C,185,412,0.441318,1050
38,North,Toys,A,848,491,0.270724,1066


In [49]:
# nlargest: return the 3 rows with the highest Profit, largest first.
df.nlargest(n=3, columns='Profit')

Unnamed: 0,Region,Category,SubCategory,Sales,Profit,Discount,CustomerID
38,North,Toys,A,848,491,0.270724,1066
31,North,Toys,C,663,483,0.470262,1032
1,West,Furniture,B,904,481,0.133391,1018


In [60]:
# For each region, find the top 3 customers by sales
def largest(ds, n=3):
    """Return the ``n`` rows of ``ds`` with the highest ``Sales``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Frame (or groupby sub-frame) containing 'CustomerID' and 'Sales' columns.
    n : int, default 3
        How many top rows to keep. If ``ds`` has fewer rows, all of them are returned.

    Returns
    -------
    pandas.DataFrame
        The 'CustomerID' and 'Sales' columns of the selected rows, ordered from
        highest to lowest Sales and keeping their original index labels.
    """
    return ds.nlargest(n, 'Sales')[['CustomerID', 'Sales']]
# Apply `largest` to each region. Selecting the two needed columns *before* apply keeps the
# grouping column out of the function (pandas >= 2.2 deprecates passing it in).
# The result is indexed by (Region, original row label): droplevel(1) discards the row label
# by position instead of relying on the auto-generated 'level_1' column name, and
# reset_index() then turns Region back into an ordinary column.
df.groupby('Region')[['CustomerID', 'Sales']].apply(largest).droplevel(1).reset_index()

Unnamed: 0,Region,CustomerID,Sales
0,East,1002,971
1,East,1071,868
2,East,1098,827
3,North,1098,866
4,North,1066,848
5,North,1053,838
6,South,1020,999
7,South,1065,957
8,South,1053,895
9,West,1034,991


# String functions in pandas

In [62]:
import pandas as pd
import numpy as np

# Sample dataset with text columns (seeded so the rows are reproducible).
np.random.seed(42)

# Columns are drawn one after another from the same random stream, so this order
# must be kept to regenerate the same table. Every column is sampled independently:
# City and State are not linked, and 'CA' is listed twice in the State choices.
data = {}
data['CustomerName'] = np.random.choice(
    ['Alice Johnson', 'Bob Smith', 'Charlie Brown', 'David Lee', 'Emma Wilson'], size=50
)
data['Email'] = np.random.choice(
    ['alice@gmail.com', 'bob@yahoo.com', 'charlie@outlook.com', 'david@company.com', 'emma@business.net'], size=50
)
data['Product'] = np.random.choice(['Laptop', 'Smartphone', 'Tablet', 'Smartwatch'], size=50)
# Order IDs are stored as strings, not integers.
data['OrderID'] = np.random.randint(10000, 99999, size=50).astype(str)
data['City'] = np.random.choice(['New York', 'Los Angeles', 'Chicago', 'Houston', 'San Francisco'], size=50)
data['State'] = np.random.choice(['NY', 'CA', 'IL', 'TX', 'CA'], size=50)
data['PhoneNumber'] = np.random.choice(
    ['+1-212-555-0192', '+1-310-555-2048', '+1-312-555-3412', '+1-713-555-7289', '+1-415-555-6192'], size=50
)

df = pd.DataFrame(data)

# Preview the first six rows.
df.head(n=6)


Unnamed: 0,CustomerName,Email,Product,OrderID,City,State,PhoneNumber
0,David Lee,david@company.com,Smartwatch,90356,Los Angeles,NY,+1-312-555-3412
1,Emma Wilson,david@company.com,Tablet,81910,Chicago,CA,+1-415-555-6192
2,Charlie Brown,david@company.com,Laptop,66044,New York,CA,+1-312-555-3412
3,Emma Wilson,emma@business.net,Smartwatch,77214,San Francisco,IL,+1-312-555-3412
4,Emma Wilson,charlie@outlook.com,Smartwatch,43827,New York,TX,+1-312-555-3412
5,Bob Smith,alice@gmail.com,Smartphone,65820,New York,CA,+1-310-555-2048


In [69]:
# Question:
# Extract the user name (the part before the '@') from each email address.
# Split once on '@' and keep the first piece.
df['Email'].str.split('@', n=1).str.get(0)

0       david
1       david
2       david
3        emma
4     charlie
5       alice
6       david
7         bob
8       david
9         bob
10        bob
11      david
12       emma
13        bob
14        bob
15      david
16        bob
17        bob
18      david
19      david
20      alice
21       emma
22       emma
23        bob
24       emma
25        bob
26      alice
27      david
28      david
29      david
30       emma
31      alice
32       emma
33       emma
34      alice
35      alice
36      alice
37      alice
38      david
39    charlie
40    charlie
41      alice
42    charlie
43    charlie
44      alice
45    charlie
46       emma
47        bob
48        bob
49      alice
Name: Email, dtype: object

In [70]:
# The same idea on a plain Python string: take the text before the '@'.
x = 'alice@gmail.com'
x.partition('@')[0]

'alice'

In [85]:
# Find the company (domain) name from the email id.
# Walk-through on a single placeholder address (not a real person's address):
# the text after '@' is the domain, and its first dot-separated label is the company.
email = 'alice@gmail.com'
# Only the last expression of a cell is displayed, so check the scalar result explicitly
# instead of leaving it as a bare expression whose value is discarded.
assert email.split('@')[1].split('.')[0] == 'gmail'

# The same steps, vectorised over the whole Email column with the .str accessor.
df['Email'].str.split('@').str[1].str.split('.').str[0]

0      company
1      company
2      company
3     business
4      outlook
5        gmail
6      company
7        yahoo
8      company
9        yahoo
10       yahoo
11     company
12    business
13       yahoo
14       yahoo
15     company
16       yahoo
17       yahoo
18     company
19     company
20       gmail
21    business
22    business
23       yahoo
24    business
25       yahoo
26       gmail
27     company
28     company
29     company
30    business
31       gmail
32    business
33    business
34       gmail
35       gmail
36       gmail
37       gmail
38     company
39     outlook
40     outlook
41       gmail
42     outlook
43     outlook
44       gmail
45     outlook
46    business
47       yahoo
48       yahoo
49       gmail
Name: Email, dtype: object

In [93]:
# User name from each email address, in upper case.
# Upper-casing does not affect the '@' separator, so it can be done before the split.
df['Email'].str.upper().str.split('@').str.get(0)

0       DAVID
1       DAVID
2       DAVID
3        EMMA
4     CHARLIE
5       ALICE
6       DAVID
7         BOB
8       DAVID
9         BOB
10        BOB
11      DAVID
12       EMMA
13        BOB
14        BOB
15      DAVID
16        BOB
17        BOB
18      DAVID
19      DAVID
20      ALICE
21       EMMA
22       EMMA
23        BOB
24       EMMA
25        BOB
26      ALICE
27      DAVID
28      DAVID
29      DAVID
30       EMMA
31      ALICE
32       EMMA
33       EMMA
34      ALICE
35      ALICE
36      ALICE
37      ALICE
38      DAVID
39    CHARLIE
40    CHARLIE
41      ALICE
42    CHARLIE
43    CHARLIE
44      ALICE
45    CHARLIE
46       EMMA
47        BOB
48        BOB
49      ALICE
Name: Email, dtype: object

In [104]:
# Check whether the first name in CustomerName matches the user name in Email.
# Both sides are lower-cased so the comparison is case-insensitive; previously only
# the name side was normalised, so a mixed-case email could never match.
x = df['CustomerName'].str.split(' ').str[0].str.lower()
y = df['Email'].str.split('@').str[0].str.lower()
# True counts as 1, so .sum() gives the number of rows where the two names agree.
(x == y).sum()

10

In [105]:
import pandas as pd
import numpy as np

# Creating a sample dataset (seeded so the rows are reproducible).
np.random.seed(42)

# Each column is one draw of 50 values from the shared random stream; keep this
# order, because reordering the calls would change the generated data.
data = {
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=50),
    'Category': np.random.choice(['Electronics', 'Furniture', 'Clothing', 'Toys'], size=50),
    'Sales': np.random.randint(100, 1000, size=50),      # upper bound is exclusive
    'Profit': np.random.randint(-200, 500, size=50),     # negative values are losses
    'Discount': np.random.uniform(0, 0.5, size=50)
}

df = pd.DataFrame(data)

# Preview the first ten rows instead of dumping all 50, like the other dataset cells.
df.head(10)


Unnamed: 0,Region,Category,Sales,Profit,Discount
0,East,Clothing,802,289,0.093285
1,West,Furniture,501,30,0.446279
2,North,Clothing,829,-160,0.269671
3,East,Toys,655,-173,0.40372
4,East,Clothing,261,-66,0.448046
5,West,Toys,301,0,0.159002
6,North,Toys,369,-168,0.055026
7,North,Electronics,962,-153,0.113968
8,East,Clothing,915,302,0.213554
9,South,Electronics,370,206,0.409007
