## Introduction to Pandas


`Concept`: Pandas is a Python library for data manipulation and analysis.

## Step 1: Installation and Importing

In [41]:
# Install pandas into the environment used by the running kernel (IPython %pip magic)
%pip install pandas




In [1]:
# Importing the Pandas library
import pandas as pd #naming convention

In [4]:
# Creating a simple DataFrame to represent sales data.
# Dict keys become the column names; each list holds that column's values.
data = {
    'Product': ['Apples', 'Oranges', 'Bananas'],
    'Quantity': [30, 20, 15],
    'Price': [0.50, 0.80, 0.30]
}

# A plain list also works; it produces a single column labelled 0:
# _list = [0.50, 0.80, 0.30]
# df1 = pd.DataFrame(_list)
# print(df1)

df = pd.DataFrame(data)
print(df)

   Product  Quantity  Price
0   Apples        30    0.5
1  Oranges        20    0.8
2  Bananas        15    0.3


## Step 2: Reading and Writing Data

`Scenario`: Loading and Saving Sales Data

`Concept`: Reading from CSV files, writing to CSV files.

In [5]:
# Reading data from a CSV file; the path is relative to the notebook's working directory
sales_df = pd.read_csv('sales_data.csv')
print(sales_df)

# Writing the Step 1 DataFrame to a CSV file; index=False leaves the row index out of the file
df.to_csv('new_sales_data.csv', index=False)

# Kept as the cell's last expression: a notebook only echoes the value of the
# final expression, so a bare `type(...)` earlier in the cell displays nothing.
type(sales_df)

          Date     Product  Quantity  Price
0   2024-07-13  Pineapples         7   0.97
1   2024-07-16  Pineapples        22   1.10
2   2024-07-22      Grapes        32   0.81
3   2024-07-01  Pineapples        14   0.78
4   2024-07-04  Pineapples         8   1.92
..         ...         ...       ...    ...
95  2024-07-26      Apples        40   1.02
96  2024-07-11      Apples        36   0.72
97  2024-07-21  Pineapples        31   1.97
98  2024-07-12      Grapes        30   1.22
99  2024-07-05      Grapes        34   1.25

[100 rows x 4 columns]


pandas.core.frame.DataFrame

## Step 3: Data Inspection

`Scenario`: Inspecting the Sales Data for Errors

`Concept`: Understanding data using head, tail, info, and describe methods.

In [22]:
# Display the first few rows of the DataFrame (head() shows 5 rows by default)
print(sales_df.head())

# Display the last 15 rows of the DataFrame
# print(sales_df.tail(15))

# Get a concise summary of the DataFrame.
# info() prints the summary itself and returns None, so it needs no print().
# sales_df.info()

# Get descriptive statistics
# print(sales_df.describe())


         Date     Product  Quantity  Price
0  2024-07-13  Pineapples         7   0.97
1  2024-07-16  Pineapples        22   1.10
2  2024-07-22      Grapes        32   0.81
3  2024-07-01  Pineapples        14   0.78
4  2024-07-04  Pineapples         8   1.92


## Step 4: Data Selection and Filtering
`Scenario`: Extracting Specific Sales Information

`Concept`: Selecting rows and columns, filtering data based on conditions

In [18]:
# Selecting a single column with one pair of brackets gives a Series
product_column = sales_df['Product']
# print(product_column)
print(type(product_column))

# Selecting multiple columns with a list of names gives a DataFrame
selected_columns = sales_df[['Product', 'Quantity']]
print(type(selected_columns))


# Filtering rows based on a condition: the comparison builds a boolean mask,
# and indexing with it keeps every column but only the rows where Quantity > 20.
filtered_df = sales_df[sales_df['Quantity'] > 20 ] # all columns, filtered rows only
print(filtered_df)



<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [25]:
# Column labels of sales_df as a plain Python list
list(sales_df.columns)

['Date', 'Product', 'Quantity', 'Price']

## Slicing the DataFrame

In [23]:
# sales_df has 100 rows and 4 columns.
# iloc slices by integer position and excludes the stop value:
# rows at positions 5 through 24, and every column from position 1 onward (skips 'Date').

sales_df.iloc[5:25,1:]

Unnamed: 0,Product,Quantity,Price
5,Pineapples,25,1.61
6,Apples,16,1.24
7,Pineapples,42,0.84
8,Grapes,19,0.88
9,Bananas,41,0.59
10,Apples,16,1.15
11,Oranges,12,0.97
12,Oranges,39,1.54
13,Grapes,48,1.07
14,Apples,30,0.77


## Step 5: Data Cleaning
`Scenario`: Cleaning Up Inconsistent Sales Data

`Concept`: Handling missing values, removing duplicates

In [59]:
# Sample sales records in which the 'Oranges' row appears twice,
# so the duplicate-removal step has something to remove.
data = dict(
    Product=['Apples', 'Oranges', 'Oranges'],
    Quantity=[30, 20, 20],
    Price=[0.50, 0.80, 0.80],
)

# Keys become the column names; each list supplies that column's values.
df = pd.DataFrame(data)
print(df)

   Product  Quantity  Price
0   Apples        30    0.5
1  Oranges        20    0.8
2  Oranges        20    0.8


In [61]:
# Checking for missing values (one True/False per cell)
# print(df.isnull())

# Dropping rows with missing values
# cleaned_df = df.dropna()
# print(cleaned_df)

# Filling missing values with a per-column replacement
# filled_df = df.fillna({'Product': 'AnyRandomFruit', 'Quantity': 1 ,'Price': df['Price'].mean()})
# print(filled_df)

# Removing duplicate rows: rows 1 and 2 of df are identical, so only row 1 survives
unique_df = df.drop_duplicates()
print(unique_df)

   Product  Quantity  Price
0   Apples        30    0.5
1  Oranges        20    0.8


## Step 6: Data Transformation
`Scenario`: Adding New Columns to the Sales Data

`Concept`: Adding new columns, modifying existing columns, applying functions.

In [70]:
# Adding a new column for total sales (element-wise product of two columns)
sales_df['Total Sales'] = sales_df['Quantity'] * sales_df['Price']
print(sales_df)

# Alternative method: the same products computed one row at a time in a Python loop.
# This only prints the values; nothing is written back to sales_df.
for x,y in zip(sales_df['Quantity'],sales_df['Price']):
    print(x*y)


# Broadcasting a scalar over a column. The result goes into a NEW column,
# so this statement leaves 'Price' itself unchanged.
# NOTE(review): the saved output shows 'Price' values about 1.21x those in the CSV,
# which suggests an earlier version of this cell overwrote 'Price' in place and was
# run twice; re-run from a fresh kernel to refresh the output.
sales_df['IncreasedPrice'] = sales_df['Price'] * 1.10  # Applying a 10% price increase
print(sales_df)

# Loop equivalent of the broadcast above (prints only)
for x in sales_df['Price']:
    print(x*1.10)


# Applying a function to a column: prices above 0.5 get a 10% discount,
# all other prices pass through unchanged
sales_df['Discounted Price'] = sales_df['Price'].apply(lambda x: x * 0.9 if x > 0.5 else x)
print(sales_df)


          Date     Product  Quantity   Price  Total Sales  IncreasedPrice  \
0   2024-07-13  Pineapples         7  1.1737         6.79         1.29107   
1   2024-07-16  Pineapples        22  1.3310        24.20         1.46410   
2   2024-07-22      Grapes        32  0.9801        25.92         1.07811   
3   2024-07-01  Pineapples        14  0.9438        10.92         1.03818   
4   2024-07-04  Pineapples         8  2.3232        15.36         2.55552   
..         ...         ...       ...     ...          ...             ...   
95  2024-07-26      Apples        40  1.2342        40.80         1.35762   
96  2024-07-11      Apples        36  0.8712        25.92         0.95832   
97  2024-07-21  Pineapples        31  2.3837        61.07         2.62207   
98  2024-07-12      Grapes        30  1.4762        36.60         1.62382   
99  2024-07-05      Grapes        34  1.5125        42.50         1.66375   

    Discounted Price  
0            1.05633  
1            1.19790  
2     

In [73]:
def makeLowerCase(_str):
    """Return the lower-cased form of ``_str``, obtained from its ``lower()`` method."""
    lowered = _str.lower()
    return lowered


# apply() calls the given function once per value, so the function object can be
# passed directly; no lambda wrapper is needed.
sales_df['NewProductColumn'] = sales_df['Product'].apply(makeLowerCase)

# if / else in a single expression ---> ternary (conditional) operation
#
# C / JavaScript form:   some_condition ? 'yes' : 'no'
#     beverage = age >= 21 ? "Beer" : "Juice";
#
# Python form:           'yes' if some_condition else 'no'
#     beverage = "Beer" if age >= 21 else "Juice"

In [74]:
# Bare last expression: the notebook displays the frame, including the columns added above
sales_df

Unnamed: 0,Date,Product,Quantity,Price,Total Sales,IncreasedPrice,Discounted Price,NewProductColumn
0,2024-07-13,Pineapples,7,1.1737,6.79,1.29107,1.05633,pineapples
1,2024-07-16,Pineapples,22,1.3310,24.20,1.46410,1.19790,pineapples
2,2024-07-22,Grapes,32,0.9801,25.92,1.07811,0.88209,grapes
3,2024-07-01,Pineapples,14,0.9438,10.92,1.03818,0.84942,pineapples
4,2024-07-04,Pineapples,8,2.3232,15.36,2.55552,2.09088,pineapples
...,...,...,...,...,...,...,...,...
95,2024-07-26,Apples,40,1.2342,40.80,1.35762,1.11078,apples
96,2024-07-11,Apples,36,0.8712,25.92,0.95832,0.78408,apples
97,2024-07-21,Pineapples,31,2.3837,61.07,2.62207,2.14533,pineapples
98,2024-07-12,Grapes,30,1.4762,36.60,1.62382,1.32858,grapes


## Step 7: Grouping and Aggregation
`Scenario`: Summarizing Sales Data by Product

`Concept`: Grouping data and performing aggregation operations.

In [80]:
# Grouping data by 'Product' and aggregating the 'Quantity' column for each product

# Common aggregation functions:
#   Average (also called arithmetic mean)
#   Count
#   Maximum
#   Minimum
#   Range
#   NaN-mean (the mean computed while ignoring NaN, i.e. missing, values)
#   Median
#   Mode
#   Sum


# Single aggregation: total quantity sold for each product
# grouped_df = sales_df.groupby('Product').agg({'Quantity': 'sum'})
# print(grouped_df)

# Grouping by 'Product' and applying several aggregation functions at once;
# the result has two-level column labels ('Quantity' -> 'sum' / 'mean').

grouped_multi_df = sales_df.groupby(['Product']).agg({
    'Quantity': ['sum', 'mean']
})
print(grouped_multi_df)


           Quantity           
                sum       mean
Product                       
Apples          703  27.038462
Bananas         612  30.600000
Grapes          663  25.500000
Oranges         283  23.583333
Pineapples      382  23.875000


## Step 8: Merging and Joining
`Scenario`: Combining Sales Data from Multiple Sources

`Concept`: Merging and joining DataFrames

In [87]:
# A second sales table, this time carrying a ProductId column that acts as the primary key.
data = dict(
    ProductId=['a001', 'o001', 'b001', 'a002'],
    Product=['Apples', 'Oranges', 'Bananas', 'Apple'],
    Quantity=[30, 20, 15, 23],
    Price=[0.50, 0.80, 0.30, 0.87],
)

df = pd.DataFrame(data)
print(df)

# --- Merge / join examples (disabled) ---
#
# more_sales_data = {
#     'Product': ['Apples', 'Oranges', 'Grapes'],
#     'Quantity': [10, 5, 7],
#     'Price': [0.50, 0.80, 1.20]
# }
# more_sales_df = pd.DataFrame(more_sales_data)
#
# Merging the two DataFrames on the shared 'Product' column.
# how='outer' keeps the products found in either table; 'inner', 'left' and
# 'right' are the other join types.
# merged_df = pd.merge(df, more_sales_df, on='Product', how='outer', suffixes=('_old', '_new'))
# print(merged_df)
#
# grouped_df = merged_df.groupby('Product').agg({'Quantity_old': 'sum','Quantity_new': 'sum'})
# print(grouped_df)
#
# Joining DataFrames using their indices instead of a column
# joined_df = sales_df.set_index('Product').join(more_sales_df.set_index('Product'), lsuffix='_old', rsuffix='_new')
# print(joined_df)


  ProductId  Product  Quantity  Price
0      a001   Apples        30   0.50
1      o001  Oranges        20   0.80
2      b001  Bananas        15   0.30
3      a002    Apple        23   0.87


## Step 9: Advanced Data Operations
`Scenario`: Analyzing Sales Trends Over Time

`Concept`: Pivot tables, date-time operations.

In [88]:
# Work on an independent copy. Plain assignment would only give sales_df a second
# name, and the datetime conversion below would then rewrite sales_df's 'Date'
# column as well.
date_sales_df = sales_df.copy()

# Converting 'Date' column to datetime
date_sales_df['Date'] = pd.to_datetime(date_sales_df['Date'])

# Pivot table to summarize sales by date and product:
#   values='Quantity'  -> the values to be summarized (quantities sold)
#   index='Date'       -> the rows of the pivot table (dates)
#   columns='Product'  -> the columns of the pivot table (products)
#   aggfunc='sum'      -> the aggregation function to apply (sum of quantities)
# Date/product combinations with no rows in the data show up as NaN.
pivot_df = date_sales_df.pivot_table(values='Quantity', index='Date', columns='Product', aggfunc='sum')
print(pivot_df)


Product     Apples  Bananas  Grapes  Oranges  Pineapples
Date                                                    
2024-07-01    51.0     49.0    40.0      NaN        14.0
2024-07-02     5.0     25.0     NaN      NaN        37.0
2024-07-03    32.0      NaN     NaN      NaN         NaN
2024-07-04    16.0      NaN     4.0     49.0        73.0
2024-07-05     NaN     35.0    62.0     39.0         NaN
2024-07-06    39.0      NaN    29.0      NaN         NaN
2024-07-07    74.0      NaN     NaN      9.0         NaN
2024-07-08     NaN     21.0    19.0      NaN        54.0
2024-07-09    22.0      NaN     6.0      NaN         NaN
2024-07-10     NaN     49.0    90.0      NaN         NaN
2024-07-11    80.0      NaN     NaN      NaN         NaN
2024-07-12    18.0     47.0    30.0      NaN         NaN
2024-07-13     NaN     45.0     NaN      4.0         7.0
2024-07-14     NaN     23.0     NaN      NaN         NaN
2024-07-15    49.0      4.0     NaN      NaN        84.0
2024-07-16    35.0      NaN    

## Calculate the Correlation Between Two Columns



In [90]:
import pandas as pd
import numpy as np



# The Titanic CSV is read straight from this URL, so the cell needs network access
url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"
titanic_df = pd.read_csv(url)

# Display the first few rows of the dataset
# print(titanic_df.head())


def correlation_coefficient(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``.

    Computed as ``sum(dx * dy) / sqrt(sum(dx**2) * sum(dy**2))`` where
    ``dx`` and ``dy`` are the deviations of ``x`` and ``y`` from their means.

    Parameters
    ----------
    x, y : array-like of numbers
        Paired observations. Lists, tuples and NumPy arrays are accepted;
        both are converted to float arrays and must have the same shape.

    Returns
    -------
    numpy.float64
        The correlation coefficient. The result is NaN when an input contains
        NaN or has zero variance (the denominator is then 0).

    Raises
    ------
    ValueError
        If ``x`` and ``y`` do not have the same shape.
    """
    # Convert up front so plain Python sequences support the arithmetic below.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        # Without this check a length-1 input would be silently broadcast.
        raise ValueError(
            f"x and y must have the same shape, got {x.shape} and {y.shape}"
        )

    dx = x - np.mean(x)
    dy = y - np.mean(y)
    numerator = np.sum(dx * dy)
    denominator = np.sqrt(np.sum(dx**2) * np.sum(dy**2))
    return numerator / denominator



# Pull the two columns to compare out of the DataFrame
age = titanic_df['Age'].values
fare = titanic_df['Fare'].values

# Calculate correlation coefficient
r = correlation_coefficient(age, fare)

# The recorded value (about 0.11) is close to 0, i.e. only a weak positive linear
# relationship between age and fare.
print(r)

   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   
3         1       1        Mrs. Jacques Heath (Lily May Peel) Futrelle   
4         0       3                            Mr. William Henry Allen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  
3  female  35.0                        1                        0  53.1000  
4    male  35.0                        0                        0   8.0500  
0.11232863699941616
