## Pandas

### data manipulation library, data analysis and cleanig using Series and DataFrame


In [54]:
import pandas as pd

## Creating Series

### 1D array like object


#### From List


In [55]:
data = [1, 2, 3, 4, 5]
series = pd.Series(data)
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


#### From Dict


In [56]:
data = {"a": 1, "b": 2, "c": 3}
series = pd.Series(data)
print(series)

a    1
b    2
c    3
dtype: int64


### Custom index in series


In [57]:
data = [10, 20, 30]
index = ["a", "b", "b"]
series = pd.Series(data, index=index)
print(series)

a    10
b    20
b    30
dtype: int64


## Creating DataFrame

### 2D array like object


### From Dict of Lists


In [58]:
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "city": ["New York", "Los Angeles", "Chicago"],
}

df = pd.DataFrame(data)
print(df)

      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


### From List of Dicts


In [59]:
data = [
    {"name": "Alice", "age": 25, "city": "New York"},
    {"name": "Bob", "age": 30, "city": "Los Angeles"},
    {"name": "Charlie", "age": 35, "city": "Chicago"},
]

df = pd.DataFrame(data)
print(df)

      name  age         city
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


### From CSV


In [60]:
df = pd.read_csv("data.csv")
print(df)

        name  age        city
0  Sudhanshu   24   Hyderabad
1       Ansh   24        Pune
2       Jack   26  Coimbatore


## Indexing DataFrame


In [61]:
df = pd.read_csv("data.csv")

In [62]:
# Accessing columns
print(df["name"])

# Accessing rows
print(df[0:1])

0    Sudhanshu
1         Ansh
2         Jack
Name: name, dtype: object
        name  age       city
0  Sudhanshu   24  Hyderabad


In [63]:
# Accessing rows by index
print(df.iloc[0])
print(df.iloc[:, 0:2])

name    Sudhanshu
age            24
city    Hyderabad
Name: 0, dtype: object
        name  age
0  Sudhanshu   24
1       Ansh   24
2       Jack   26


In [64]:
# Accessing rows by label
print(df.loc[0])
print(df.loc[0:1, "name":"age"])

name    Sudhanshu
age            24
city    Hyderabad
Name: 0, dtype: object
        name  age
0  Sudhanshu   24
1       Ansh   24


In [65]:
# Accessing single value by index
print(df.iat[0, 2])

Hyderabad


In [66]:
# Accessing single value by label
print(df.at[0, "name"])

Sudhanshu


## Data Manipulation

### add new columns, remove columns, update values


In [67]:
df["salary"] = [50000, 60000, 70000]
print(df)

        name  age        city  salary
0  Sudhanshu   24   Hyderabad   50000
1       Ansh   24        Pune   60000
2       Jack   26  Coimbatore   70000


In [68]:
temp_df = df.copy()

In [69]:
temp_df.drop("city", axis=1, inplace=True)
print(temp_df)

        name  age  salary
0  Sudhanshu   24   50000
1       Ansh   24   60000
2       Jack   26   70000


In [70]:
temp_df["age"] = temp_df["age"] + 1
print(temp_df)

        name  age  salary
0  Sudhanshu   25   50000
1       Ansh   25   60000
2       Jack   27   70000


In [71]:
temp_df.drop(0, axis=0, inplace=True)
print(temp_df)

   name  age  salary
1  Ansh   25   60000
2  Jack   27   70000


## DataFrame Attributes


In [72]:
df.head()

Unnamed: 0,name,age,city,salary
0,Sudhanshu,24,Hyderabad,50000
1,Ansh,24,Pune,60000
2,Jack,26,Coimbatore,70000


In [73]:
df.dtypes

name      object
age        int64
city      object
salary     int64
dtype: object

In [74]:
df.describe()

Unnamed: 0,age,salary
count,3.0,3.0
mean,24.666667,60000.0
std,1.154701,10000.0
min,24.0,50000.0
25%,24.0,55000.0
50%,24.0,60000.0
75%,25.0,65000.0
max,26.0,70000.0


## Data Manipulation


In [75]:
df = pd.read_csv("data-2.csv")

In [76]:
df.head(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [77]:
df.tail(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [78]:
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [79]:
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

### Handling Missing Values


In [80]:
df.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [81]:
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

## Filling Missing Values


In [82]:
df["Sales_new"] = df["Sales"].fillna(df["Sales"].mean())
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0


## Renaming columns


In [83]:
df = df.rename(columns={"Sales Daet": "Sales Date"})

In [84]:
df.dtypes

Date          object
Category      object
Value        float64
Product       object
Sales        float64
Region        object
Sales_new    float64
dtype: object

## Changing data types


In [85]:
df["Value_new"] = df["Value"].fillna(df["Value"].mean()).astype(int)
df.dtypes

Date          object
Category      object
Value        float64
Product       object
Sales        float64
Region        object
Sales_new    float64
Value_new      int64
dtype: object

In [86]:
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_new,Value_new
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26


In [87]:
df["New value"] = df["Value_new"].apply(lambda x: x * 2)
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_new,Value_new,New value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52


## Data Aggregating and Grouping


In [88]:
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region,Sales_new,Value_new,New value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,56
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,78
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,64
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,16
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,52


In [89]:
# Sales mean by Product
grouped_mean = df.groupby("Product")["Sales"].mean()
print(grouped_mean)

Product
Product1    574.866667
Product2    567.230769
Product3    535.055556
Name: Sales, dtype: float64


In [90]:
# Sales sum by Product and Region
grouped_mean_region = df.groupby(["Product", "Region"])["Sales"].sum()
print(grouped_mean_region)

Product   Region
Product1  East      4205.0
          North     1737.0
          South     1346.0
          West      1335.0
Product2  East       856.0
          North      843.0
          South     2240.0
          West      3435.0
Product3  East      1956.0
          North     3428.0
          South     1572.0
          West      2675.0
Name: Sales, dtype: float64


In [91]:
# Aggregate multiple function
grouped_agg = df.groupby(["Product", "Region"])["Sales"].aggregate(["sum", "mean"])
print(grouped_agg)

                    sum        mean
Product  Region                    
Product1 East    4205.0  600.714286
         North   1737.0  868.500000
         South   1346.0  673.000000
         West    1335.0  333.750000
Product2 East     856.0  428.000000
         North    843.0  421.500000
         South   2240.0  746.666667
         West    3435.0  572.500000
Product3 East    1956.0  652.000000
         North   3428.0  571.333333
         South   1572.0  524.000000
         West    2675.0  445.833333


## Mergind and Joining DataFrame


In [92]:
df1_data = { "Product": ["A", "B", "C"], "Price": [10, 20, 30] }
df1 = pd.DataFrame(df1_data);
df2_data = { "Product": ["A", "B", "D"], "Sales": [100, 200, 300] }
df2 = pd.DataFrame(df2_data);

In [93]:
df1


Unnamed: 0,Product,Price
0,A,10
1,B,20
2,C,30


In [94]:
df2


Unnamed: 0,Product,Sales
0,A,100
1,B,200
2,D,300


In [95]:
# Merge on Product Column (similar to SQL JOINS)
# how - inner, left, right, outer
pd.merge(df1, df2, on="Product", how="outer")

Unnamed: 0,Product,Price,Sales
0,A,10.0,100.0
1,B,20.0,200.0
2,C,30.0,
3,D,,300.0


## Reading data from different sources

### From JSON String

In [96]:
from io import StringIO

json_data = '''
[
  {"employee_name": "James", "email": "james@gmail.com", "job_profile": "Team Lead"},
  {"employee_name": "Michael", "email": "michael@gmail.com", "job_profile": "Senior Developer"}
]
'''
df = pd.read_json(StringIO(json_data))
print(df)

  employee_name              email       job_profile
0         James    james@gmail.com         Team Lead
1       Michael  michael@gmail.com  Senior Developer


In [97]:
# JSON of DataFrame
json = df.to_json()
json

'{"employee_name":{"0":"James","1":"Michael"},"email":{"0":"james@gmail.com","1":"michael@gmail.com"},"job_profile":{"0":"Team Lead","1":"Senior Developer"}}'

In [98]:
# String as list of records
json_records = df.to_json(orient="records")
json_records

'[{"employee_name":"James","email":"james@gmail.com","job_profile":"Team Lead"},{"employee_name":"Michael","email":"michael@gmail.com","job_profile":"Senior Developer"}]'

### From CSV - URL

In [99]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df_csv = pd.read_csv(url, header=None)

In [100]:
df_csv.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [101]:
# Save DataFrame to CSV
df_csv.to_csv("wine_data.csv", index=False)

### From HTML

In [102]:
url_html = 'https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list/'
df_list = pd.read_html(url_html)
# Number of tables found
print(len(df_list))

# Display first table
df_list[0].head()

1


Unnamed: 0,Bank Name,City,State,Cert,Acquiring Institution,Closing Date,Fund Sort ascending
0,The Santa Anna National Bank,Santa Anna,Texas,5520,Coleman County State Bank,"June 27, 2025",10549
1,Pulaski Savings Bank,Chicago,Illinois,28611,Millennium Bank,"January 17, 2025",10548
2,The First National Bank of Lindsay,Lindsay,Oklahoma,4134,First Bank & Trust Co.,"October 18, 2024",10547
3,Republic First Bank dba Republic Bank,Philadelphia,Pennsylvania,27332,"Fulton Bank, National Association","April 26, 2024",10546
4,Citizens Bank,Sac City,Iowa,8758,Iowa Trust & Savings Bank,"November 3, 2023",10545


In [103]:
url_wiki = 'https://en.wikipedia.org/wiki/Mobile_country_code'

# filter tables containing a specific keyword
df_country = pd.read_html(url_wiki, match='Country', header=0)
df_country[0].head()

Unnamed: 0,Mobile country code,Country,ISO 3166,Mobile network codes,National MNC authority,Remarks
0,289,A Abkhazia,GE-AB,List of mobile network codes in Abkhazia,,MCC is not listed by ITU
1,412,Afghanistan,AF,List of mobile network codes in Afghanistan,,
2,276,Albania,AL,List of mobile network codes in Albania,,
3,603,Algeria,DZ,List of mobile network codes in Algeria,,
4,544,American Samoa (United States of America),AS,List of mobile network codes in American Samoa,,


### From Excel

In [104]:
df_excel = pd.read_excel('data.xlsx', sheet_name=0)
df_excel.head()

Unnamed: 0,Name,Age
0,Krish,32
1,Jack,34
2,John,31


### From Pickle

In [105]:
df_excel.to_pickle("data.pkl")

In [106]:
df_pickle = pd.read_pickle("data.pkl")
df_pickle

Unnamed: 0,Name,Age
0,Krish,32
1,Jack,34
2,John,31


## Many other function
#### read_sql
#### read_sql_query
#### read_table
#### read_xml