# Pandas examples

In [None]:
!pip install pandas

## 1. Data Structures in Pandas

In [1]:
import pandas as pd

# A Series is a one-dimensional labelled array. No index is passed here, so
# the values are labelled 0, 1, 2, 3.
series_values = [10, 20, 30, 40]
data_series = pd.Series(series_values)
print("Pandas Series:\n", data_series)

# A DataFrame is a two-dimensional table. Each key of the mapping becomes a
# column and each list supplies that column's values, one per row.
frame_columns = dict(
    Product=['A', 'B', 'C', 'D'],
    Sales=[1500, 2000, 2500, 3000],
    Quantity=[10, 15, 20, 25],
)
data_frame = pd.DataFrame(frame_columns)
print("\nPandas DataFrame:\n", data_frame)


Pandas Series:
 0    10
1    20
2    30
3    40
dtype: int64

Pandas DataFrame:
   Product  Sales  Quantity
0       A   1500        10
1       B   2000        15
2       C   2500        20
3       D   3000        25


**Creating a Pandas Series**

In [3]:
import numpy as np
import pandas as pd

# From a plain Python list: values keep their order and receive the default
# 0..n-1 index.
data_list = [10, 20, 30, 40]
series_from_list = pd.Series(data_list)
print("Series from list:\n", series_from_list)

# From a one-dimensional NumPy array.
data_array = np.array([1, 2, 3, 4])
series_from_array = pd.Series(data_array)
print("\nSeries from NumPy array:\n", series_from_array)

# From a dictionary: the keys become the index labels.
data_dict = dict(a=1, b=2, c=3)
series_from_dict = pd.Series(data_dict)
print("\nSeries from dictionary:\n", series_from_dict)

# Same list values as above, but labelled explicitly instead of 0..n-1.
custom_labels = ['A', 'B', 'C', 'D']
custom_index_series = pd.Series(data_list, index=custom_labels)
print("\nSeries with custom index:\n", custom_index_series)


Series from list:
 0    10
1    20
2    30
3    40
dtype: int64

Series from NumPy array:
 0    1
1    2
2    3
3    4
dtype: int64

Series from dictionary:
 a    1
b    2
c    3
dtype: int64

Series with custom index:
 A    10
B    20
C    30
D    40
dtype: int64


**Creating a Pandas DataFrame**

In [5]:
# Column-oriented input: a dictionary mapping each column name to its values.
data = dict(
    Product=['A', 'B', 'C', 'D'],
    Sales=[1500, 2000, 2500, 3000],
    Quantity=[10, 15, 20, 25],
)
df_from_dict = pd.DataFrame(data)
print("\nDataFrame from dictionary:\n", df_from_dict)

# Row-oriented input: one dictionary per row, keyed by column name.
data_list_of_dicts = [
    dict(Product='A', Sales=1500, Quantity=10),
    dict(Product='B', Sales=2000, Quantity=15),
    dict(Product='C', Sales=2500, Quantity=20),
    dict(Product='D', Sales=3000, Quantity=25),
]
df_from_list_of_dicts = pd.DataFrame(data_list_of_dicts)
print("\nDataFrame from list of dictionaries:\n", df_from_list_of_dicts)

# Two-dimensional NumPy array: the array carries no labels of its own, so
# both the column names and the row labels are supplied explicitly.
data_array_2d = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
array_columns = ['A', 'B', 'C']
array_rows = ['Row1', 'Row2', 'Row3']
df_from_array = pd.DataFrame(data_array_2d, columns=array_columns, index=array_rows)
print("\nDataFrame from NumPy array:\n", df_from_array)

# Same dictionary as the first example, with custom row labels. Passing
# `columns` keeps only the listed keys, so 'Quantity' is left out here.
row_labels = ['Row1', 'Row2', 'Row3', 'Row4']
selected_columns = ['Product', 'Sales']
df_custom_index_columns = pd.DataFrame(data, index=row_labels, columns=selected_columns)
print("\nDataFrame with custom index and column order:\n", df_custom_index_columns)




DataFrame from dictionary:
   Product  Sales  Quantity
0       A   1500        10
1       B   2000        15
2       C   2500        20
3       D   3000        25

DataFrame from list of dictionaries:
   Product  Sales  Quantity
0       A   1500        10
1       B   2000        15
2       C   2500        20
3       D   3000        25

DataFrame from NumPy array:
       A  B  C
Row1  1  2  3
Row2  4  5  6
Row3  7  8  9

DataFrame with custom index and column order:
      Product  Sales
Row1       A   1500
Row2       B   2000
Row3       C   2500
Row4       D   3000


## 2. Importing and Exporting Data

In [None]:
!pip install openpyxl

In [8]:
# Load the example sales table from a CSV file in the working directory.
csv_path = 'sales_data.csv'  # Example file
df_csv = pd.read_csv(csv_path)
print("\nDataFrame from CSV:\n", df_csv)

# Export the same table to an Excel workbook; index=False leaves the row
# labels out of the written sheet.
excel_path = 'sales_data.xlsx'
df_csv.to_excel(excel_path, index=False)

# Load the example sales table from a JSON file in the working directory.
json_path = 'sales_data.json'  # Example file
df_json = pd.read_json(json_path)
print("\nDataFrame from JSON:\n", df_json)



DataFrame from CSV:
   Product  Sales  Quantity
0       A   1500        10
1       B   2000        15
2       C   2500        20
3       D   3000        25
4       E   3500        30



DataFrame from JSON:
   Product  Sales  Quantity
0       A   1500        10
1       B   2000        15
2       C   2500        20
3       D   3000        25
4       E   3500        30


## 3. Data Manipulation

**Selection and Filtering**

Selecting Columns


In [10]:
# Indexing with one column label returns that column as a Series.
sales_label = 'Sales'
sales_column = df_csv[sales_label]
print("Sales Column:\n", sales_column)

# Indexing with a list of labels returns a DataFrame holding just those
# columns, in the order listed.
wanted_columns = ['Product', 'Sales']
subset_df = df_csv[wanted_columns]
print("\nSubset DataFrame:\n", subset_df)


Sales Column:
 0    1500
1    2000
2    2500
3    3000
4    3500
Name: Sales, dtype: int64

Subset DataFrame:
   Product  Sales
0       A   1500
1       B   2000
2       C   2500
3       D   3000
4       E   3500


Filtering Rows

In [None]:
# Build a boolean mask (True where Sales exceeds 2000), then keep only the
# rows where the mask is True.
sales_above_2000 = df_csv['Sales'] > 2000
high_sales = df_csv[sales_above_2000]
print("\nProducts with Sales greater than 2000:\n", high_sales)


**Data Cleaning**

Handling Missing Values

In [11]:
# Adding a missing value for demonstration.
# 'Sales' is loaded as an integer column, and an integer column cannot hold
# NaN. Cast it to float explicitly before inserting the gap: assigning a
# missing value straight into an integer column relies on silent upcasting,
# which pandas >= 2.1 reports as deprecated.
# NOTE: this changes df_csv itself, so every later cell that reads df_csv
# sees the missing value in row 2.
df_csv['Sales'] = df_csv['Sales'].astype('float64')
df_csv.loc[2, 'Sales'] = float('nan')
print("\nDataFrame with Missing Value:\n", df_csv)

# Dropping rows with missing values (returns a new frame; df_csv is unchanged)
df_cleaned = df_csv.dropna()
print("\nDataFrame after Dropping Missing Values:\n", df_cleaned)

# Filling missing values with a specific value (returns a new frame; df_csv
# is unchanged)
df_filled = df_csv.fillna(0)
print("\nDataFrame after Filling Missing Values:\n", df_filled)



DataFrame with Missing Value:
   Product   Sales  Quantity
0       A  1500.0        10
1       B  2000.0        15
2       C     NaN        20
3       D  3000.0        25
4       E  3500.0        30

DataFrame after Dropping Missing Values:
   Product   Sales  Quantity
0       A  1500.0        10
1       B  2000.0        15
3       D  3000.0        25
4       E  3500.0        30

DataFrame after Filling Missing Values:
   Product   Sales  Quantity
0       A  1500.0        10
1       B  2000.0        15
2       C     0.0        20
3       D  3000.0        25
4       E  3500.0        30


**Creating New Columns**


Adding a New Column Based on Calculations

In [12]:
# Derive Total_Sales as the element-wise product of Sales and Quantity.
# A row whose Sales value is missing gets a missing Total_Sales as well.
sales_values = df_csv['Sales']
quantity_values = df_csv['Quantity']
df_csv['Total_Sales'] = sales_values * quantity_values
print("\nDataFrame with Total Sales Column:\n", df_csv)



DataFrame with Total Sales Column:
   Product   Sales  Quantity  Total_Sales
0       A  1500.0        10      15000.0
1       B  2000.0        15      30000.0
2       C     NaN        20          NaN
3       D  3000.0        25      75000.0
4       E  3500.0        30     105000.0


**Aggregation and Grouping**

Summary Statistics

In [13]:
# describe() reports count, mean, std, min, quartiles and max for each
# numeric column; missing values are not included in `count`.
summary_stats = df_csv.describe()
summary_heading = "\nSummary Statistics:\n"
print(summary_heading, summary_stats)



Summary Statistics:
              Sales   Quantity    Total_Sales
count     4.000000   5.000000       4.000000
mean   2500.000000  20.000000   56250.000000
std     912.870929   7.905694   41306.779105
min    1500.000000  10.000000   15000.000000
25%    1875.000000  15.000000   26250.000000
50%    2500.000000  20.000000   52500.000000
75%    3125.000000  25.000000   82500.000000
max    3500.000000  30.000000  105000.000000


Grouping Data

In [14]:
# Total the Sales values within each Product group, then turn the group
# labels back into an ordinary 'Product' column.
# Note: sum() skips missing values, so a group whose only Sales value is
# missing is reported as 0.0 rather than NaN.
sales_by_product = df_csv.groupby('Product')['Sales'].sum()
grouped_sales = sales_by_product.reset_index()
print("\nGrouped Sales by Product:\n", grouped_sales)



Grouped Sales by Product:
   Product   Sales
0       A  1500.0
1       B  2000.0
2       C     0.0
3       D  3000.0
4       E  3500.0


**Sorting Data**

Sorting a DataFrame

In [15]:
# Order the rows from highest to lowest Sales. sort_values returns a new
# frame that keeps the original index labels; rows with missing Sales are
# placed at the end.
sort_column = 'Sales'
sorted_df = df_csv.sort_values(by=sort_column, ascending=False)
print("\nSorted DataFrame by Sales (Descending):\n", sorted_df)



Sorted DataFrame by Sales (Descending):
   Product   Sales  Quantity  Total_Sales
4       E  3500.0        30     105000.0
3       D  3000.0        25      75000.0
1       B  2000.0        15      30000.0
0       A  1500.0        10      15000.0
2       C     NaN        20          NaN


**Merging DataFrames**

Merging Two DataFrames

In [16]:
# Lookup table for the demonstration: one category per product.
product_names = ['A', 'B', 'C', 'D', 'E']
product_categories = ['Electronics', 'Electronics', 'Furniture', 'Furniture', 'Grocery']
additional_data = pd.DataFrame({'Product': product_names, 'Category': product_categories})

# Left join on 'Product': every row of df_csv is kept and gains the matching
# 'Category' value from the lookup table.
merged_df = pd.merge(
    df_csv,
    additional_data,
    on='Product',
    how='left',
)
print("\nMerged DataFrame:\n", merged_df)



Merged DataFrame:
   Product   Sales  Quantity  Total_Sales     Category
0       A  1500.0        10      15000.0  Electronics
1       B  2000.0        15      30000.0  Electronics
2       C     NaN        20          NaN    Furniture
3       D  3000.0        25      75000.0    Furniture
4       E  3500.0        30     105000.0      Grocery


**Resetting the Index**

Resetting the Index of a DataFrame

In [17]:
# Filtering keeps the original row labels, which leaves gaps in the index.
sales_over_2000 = df_csv['Sales'] > 2000
filtered_df = df_csv[sales_over_2000]

# reset_index(drop=True) renumbers the rows from 0 and discards the old
# labels instead of keeping them as a new column.
reset_df = filtered_df.reset_index(drop=True)
print("\nFiltered DataFrame with Reset Index:\n", reset_df)



Filtered DataFrame with Reset Index:
   Product   Sales  Quantity  Total_Sales
0       D  3000.0        25      75000.0
1       E  3500.0        30     105000.0


## 4. Aggregation Operations

In [18]:
# NOTE(review): the saved output of this cell shows a 'Total Value' column and
# Sales figures that differ from data_frame as it is built in section 1, so it
# appears to come from kernel state that is no longer in this notebook.
# Restart the kernel and run all cells to refresh the output.

# Descriptive statistics for the numeric columns of data_frame.
frame_summary = data_frame.describe()
print("\nStatistical summary of DataFrame:\n", frame_summary)

# Total Sales per Product; the result is a Series indexed by Product.
sales_groups = data_frame.groupby('Product')['Sales']
grouped_data = sales_groups.sum()
print("\nGrouped Data (Total Sales by Product):\n", grouped_data)



Statistical summary of DataFrame:
             Sales   Quantity   Total Value
count     4.00000   4.000000      4.000000
mean   2500.00000  17.500000  45000.000000
std     408.24829   6.454972  22730.302828
min    2000.00000  10.000000  25000.000000
25%    2375.00000  13.750000  28750.000000
50%    2500.00000  17.500000  40000.000000
75%    2625.00000  21.250000  56250.000000
max    3000.00000  25.000000  75000.000000

Grouped Data (Total Sales by Product):
 Product
A    2500.0
B    2000.0
C    2500.0
D    3000.0
Name: Sales, dtype: float64
