In [None]:
# Pandas is a fast, powerful Python library for data manipulation and analysis.
# It provides two main data structures:

# Series → 1-D labeled array

# DataFrame → 2-D labeled table

# Used for:

# data cleaning

# filtering

# merging/joining

# aggregation

# data transformation


In [None]:
# How to read different file formats in Pandas?

# Answer:

# pd.read_csv("file.csv")
# pd.read_excel("file.xlsx")
# pd.read_json("file.json")
# pd.read_sql(query, connection)
# pd.read_html("url")

In [None]:
# shape = df.shape
# print(shape)  # Output: (3, 2) -- 3 rows and 2 columns

# df.info()   --> The info() method in Pandas provides a concise summary of a DataFrame.



In [None]:
import pandas as pd
# Column-oriented input: every key becomes a column, every list holds that column's values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    City=['New York', 'Los Angeles', 'Chicago'],
)

# Explicit string labels replace the default 0..n-1 integer index.
row_labels = ["r1", "r2", "r3"]
df = pd.DataFrame(data, index=row_labels)

# Disabled experiments, kept for reference. Bracket assignment targets COLUMNS,
# so each of these would add a column named "r2"/"r4" rather than a row:
# df["r2"] = ["rohit",25,"India"]
# df["r4"] = ["rohit1",25,"India1"]

# Indexing with a list of column names returns a DataFrame holding just those columns.
selected_columns = ["Age", "City"]
print(df[selected_columns])

    Age         City
r1   25     New York
r2   30  Los Angeles
r3   35      Chicago


In [None]:
import pandas as pd
# Small demo frame used to compare two equivalent ways of filtering rows.
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35]
})

# Single-condition filter, kept for reference:
# filtered_df = df[df['Age'] > 30]

# Show the unfiltered frame first so the filtered result below can be compared with it.
print(df)

# query() returns a NEW DataFrame and leaves df untouched, so its result must be
# bound to a name; calling it as a bare statement silently throws the filter away.
query_df = df.query("Age > 25 and Name == 'Bob'")

# The same filter written as a boolean mask. Each comparison needs its own
# parentheses because & binds more tightly than > and ==.
filtered_df = df[(df['Age'] > 25) & (df['Name'] == 'Bob')]

# Both spellings must select exactly the same rows.
assert query_df.equals(filtered_df)
print(filtered_df)


      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
  Name  Age
1  Bob   30


In [None]:
# What is NaN in Pandas?
# NaN (Not a Number) is a special floating-point value used in Pandas (and NumPy) to represent missing or undefined data.

# It stands for:
# ✔ Not a Number
# ✔ Represents missing, null, or invalid numerical values
# ✔ Comes from the IEEE 754 floating-point standard


# | None                                    | NaN                             |
# | --------------------------------------- | ------------------------------- |
# | Python object                           | Float (NumPy)                   |
# | Used for missing values in object dtype | Used for missing numeric values |




# 16. How do you handle missing data in Pandas?

# A. Detect Missing Values --> df.isnull()          # Boolean mask of missing values
#                              df.isnull().sum()    # Count missing values per column

# B.Drop Missing Values -->   df.dropna()                   # Drop rows with any NaN
                            # df.dropna(how="all")          # Drop rows where all values are NaN
                            # df.dropna(subset=["age"])     # Drop rows based on specific column
                            # df.dropna(axis=1)             # Drop entire columns with NaN

# Note: By default, the dropna() method returns a new DataFrame, and will not change the original.
# If you want to change the original DataFrame, use the inplace = True argument:

# Note: Now, the dropna(inplace = True) will NOT return a new DataFrame, but it will remove all rows containing NULL values from the original DataFrame.
# import pandas as pd
# df = pd.read_csv('data.csv')
# df.dropna(inplace = True)

# ✅ What does inplace=True mean in Pandas?
# inplace=True means that the operation will modify the original DataFrame directly instead of creating and returning a new one.


# C. Fill Missing Values -->  df.fillna(0)                             # Replace NaN with a fixed value
                            # df.fillna(df.mean(numeric_only=True))    # Replace with column mean (numeric columns only)
                            # df.fillna(df.median(numeric_only=True))  # Replace with column median (numeric columns only)
                            # df.fillna(df.mode().iloc[0])             # Replace with mode (categorical)





In [None]:
# df.dropna(axis=1) in Pandas

# df.dropna(axis=1) removes (drops) entire columns that contain ANY missing values (NaN).

# ⭐ Detailed Explanation

# axis=0 → drop rows

# axis=1 → drop columns

# So this command deletes all columns where at least one value is NaN.

In [None]:
# We sort a DataFrame using df.sort_values(). It supports sorting by one or multiple columns, specifying ascending or descending order, controlling where NaN values are placed (na_position), and performing in-place updates. For example: df.sort_values(['col1','col2'], ascending=[True, False]).

In [None]:
# ✅ What do iloc[] and loc[] do in Pandas?

# Pandas provides two powerful indexers for selecting data:
# ✔ loc[] → label-based selection
# ✔ iloc[] → integer/position-based selection



In [None]:
# NumPy is faster than Python lists because it is built for numerical computation at the machine level, unlike Python lists which are high-level and slow.

# NumPy is faster than Python lists because it stores data in contiguous memory, uses optimized C/Fortran backend, avoids Python loops through vectorization, and works with homogeneous data. This allows CPU-level optimizations and much faster numerical computation.

In [None]:
import pandas as pd
import numpy as np
# Bob's age is deliberately missing; np.nan makes the Age column floating point.
names = ['Alice', 'Bob', 'Charlie']
ages = [25, np.nan, 35]
df = pd.DataFrame({'Name': names, 'Age': ages})

# Element-wise mask: True wherever a value is missing, False elsewhere.
missing_mask = pd.isna(df)
print(missing_mask)


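# ✔ Why does a bare NaN give an error?
# Because:
# NaN is not a built-in Python name, so writing NaN on its own raises a NameError.
# NumPy defines it as np.nan (float("nan") is the pure-Python equivalent).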

In [None]:
# 🎯 INTERVIEW 20-sec Answer

# In merge(), column order doesn't matter, but the join column name must match in both frames (otherwise pass left_on/right_on).
# In concat(), column order doesn't matter, but if column names differ, Pandas aligns them by name and fills missing values with NaN.