In [5]:
import pandas as pd
import numpy as np

In [6]:
# Create sample dataset for exploration
np.random.seed(42)
df = pd.DataFrame({
    'A': np.random.randn(100),
    'B': np.random.randint(1, 10, 100),
    'C': np.random.choice(['X', 'Y', 'Z'], 100),
    'D': pd.date_range('2023-01-01', periods=100, freq='D'),
    'E': np.random.randn(100) * 100
})

In [7]:
df

Unnamed: 0,A,B,C,D,E
0,0.496714,9,Y,2023-01-01,23.229616
1,-0.138264,5,Y,2023-01-02,17.618092
2,0.647689,1,Y,2023-01-03,-115.256537
3,1.523030,3,Y,2023-01-04,-150.076839
4,-0.234153,8,Z,2023-01-05,16.502280
...,...,...,...,...,...
95,-1.463515,3,Z,2023-04-06,94.914299
96,0.296120,9,X,2023-04-07,-23.922480
97,0.261055,3,X,2023-04-08,85.397471
98,0.005113,9,X,2023-04-09,-27.104046


In [8]:
# Basic information about DataFrame
print(df.head())        # First 5 rows
print(df.tail())        # Last 5 rows
print(df.shape)         # Dimensions (rows, columns)
print(df.info())        # Data types and non-null counts
print(df.columns)       # Column names
print(df.index)         # Index information
print(df.dtypes)        # Data types of each column

          A  B  C          D           E
0  0.496714  9  Y 2023-01-01   23.229616
1 -0.138264  5  Y 2023-01-02   17.618092
2  0.647689  1  Y 2023-01-03 -115.256537
3  1.523030  3  Y 2023-01-04 -150.076839
4 -0.234153  8  Z 2023-01-05   16.502280
           A  B  C          D          E
95 -1.463515  3  Z 2023-04-06  94.914299
96  0.296120  9  X 2023-04-07 -23.922480
97  0.261055  3  X 2023-04-08  85.397471
98  0.005113  9  X 2023-04-09 -27.104046
99 -0.234587  2  Y 2023-04-10 -61.309468
(100, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   A       100 non-null    float64       
 1   B       100 non-null    int32         
 2   C       100 non-null    object        
 3   D       100 non-null    datetime64[ns]
 4   E       100 non-null    float64       
dtypes: datetime64[ns](1), float64(2), int32(1), object(1)
memory usage: 3.6+ KB
None
In

In [9]:
# Sample random rows
print(df.sample(5))     # 5 random rows
print(df.sample(frac=0.1))  # 10% of rows

           A  B  C          D           E
87  0.328751  8  Y 2023-03-29  116.193732
67  1.003533  3  Y 2023-03-09    7.097242
43 -0.301104  6  Y 2023-02-13  187.200568
61 -0.185659  8  Z 2023-03-03   12.609483
88 -0.529760  4  Z 2023-03-30  155.285978
           A  B  C          D           E
83 -0.518270  8  Z 2023-03-25  -71.012796
91  0.968645  4  Y 2023-04-02   26.162306
68  0.361636  7  Z 2023-03-10   43.932714
94 -0.392108  4  Y 2023-04-05  -52.318809
6   1.579213  8  X 2023-01-07   -3.963614
88 -0.529760  4  Z 2023-03-30  155.285978
58  0.331263  8  X 2023-02-28  -20.245753
19 -1.412304  1  Y 2023-01-20  184.341376
79 -1.987569  3  Y 2023-03-21   58.131413
47  1.057122  3  X 2023-02-17  -10.511850


In [10]:
# Descriptive statistics
print(df.describe())              # Summary statistics for numeric columns
print(df.describe(include='all')) # All columns including categorical

                A           B                    D           E
count  100.000000  100.000000                  100  100.000000
mean    -0.103847    4.730000  2023-02-19 12:00:00    8.073564
min     -2.619745    1.000000  2023-01-01 00:00:00 -199.496157
25%     -0.600906    2.000000  2023-01-25 18:00:00  -61.360443
50%     -0.126956    4.000000  2023-02-19 12:00:00    3.682609
75%      0.405952    7.000000  2023-03-16 06:00:00   85.966970
max      1.852278    9.000000  2023-04-10 00:00:00  238.903596
std      0.908168    2.733463                  NaN  101.447471
                 A           B    C                    D           E
count   100.000000  100.000000  100                  100  100.000000
unique         NaN         NaN    3                  NaN         NaN
top            NaN         NaN    X                  NaN         NaN
freq           NaN         NaN   36                  NaN         NaN
mean     -0.103847    4.730000  NaN  2023-02-19 12:00:00    8.073564
min      -2.619745 

In [11]:
# Individual statistics
print(df['A'].mean())       # Mean
print(df['A'].median())     # Median
print(df['A'].mode())       # Mode
print(df['A'].std())        # Standard deviation
print(df['A'].var())        # Variance
print(df['A'].min())        # Minimum
print(df['A'].max())        # Maximum


-0.10384651739409384
-0.1269562917797126
0    -2.619745
1    -1.987569
2    -1.959670
3    -1.913280
4    -1.763040
        ...   
95    1.523030
96    1.538037
97    1.564644
98    1.579213
99    1.852278
Name: A, Length: 100, dtype: float64
0.9081684280078008
0.8247698936301602
-2.6197451040897444
1.8522781845089378


In [12]:
df["A"]

0     0.496714
1    -0.138264
2     0.647689
3     1.523030
4    -0.234153
        ...   
95   -1.463515
96    0.296120
97    0.261055
98    0.005113
99   -0.234587
Name: A, Length: 100, dtype: float64

In [21]:
print(df['A'].quantile(0.75))  # 25th percentile

0.40595205201206214


In [22]:
# Count statistics
print(df['C'].value_counts())    # Count of unique values

C
X    36
Y    35
Z    29
Name: count, dtype: int64


In [23]:
print(df['C'].nunique())         # Number of unique values

3


In [24]:
print(df.isnull().sum())         # Count of missing value

A    0
B    0
C    0
D    0
E    0
dtype: int64


In [25]:
print(df.notnull().sum())        # Count of non-missing values

A    100
B    100
C    100
D    100
E    100
dtype: int64


In [26]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Age': [25, 30, 35, 28, 32],
    'Department': ['HR', 'IT', 'Finance', 'HR', 'IT'],
    'Salary': [70000, 80000, 90000, 75000, 85000]
}
df = pd.DataFrame(data, index=['emp1', 'emp2', 'emp3', 'emp4', 'emp5'])

In [27]:
df

Unnamed: 0,Name,Age,Department,Salary
emp1,Alice,25,HR,70000
emp2,Bob,30,IT,80000
emp3,Charlie,35,Finance,90000
emp4,Diana,28,HR,75000
emp5,Eve,32,IT,85000


In [29]:
df[df['Age'] > 30]

Unnamed: 0,Name,Age,Department,Salary
emp3,Charlie,35,Finance,90000
emp5,Eve,32,IT,85000


In [31]:
df[(df['Age'] > 25) & (df['Department'] == 'IT')]

Unnamed: 0,Name,Age,Department,Salary
emp2,Bob,30,IT,80000
emp5,Eve,32,IT,85000


In [32]:
df[(df['Age'] > 25) | (df['Department'] == 'IT')]

Unnamed: 0,Name,Age,Department,Salary
emp2,Bob,30,IT,80000
emp3,Charlie,35,Finance,90000
emp4,Diana,28,HR,75000
emp5,Eve,32,IT,85000


In [33]:
# Using isin() method
departments = ['HR', 'Finance']
df[df['Department'].isin(departments)]

Unnamed: 0,Name,Age,Department,Salary
emp1,Alice,25,HR,70000
emp3,Charlie,35,Finance,90000
emp4,Diana,28,HR,75000


In [37]:
# String operations in boolean indexing
df[df['Name'].str.contains('a')]   # Names containing 'A'

Unnamed: 0,Name,Age,Department,Salary
emp3,Charlie,35,Finance,90000
emp4,Diana,28,HR,75000


In [36]:
df[df['Name'].str.startswith('A')]  # Names starting with 'A'

Unnamed: 0,Name,Age,Department,Salary
emp1,Alice,25,HR,70000


In [40]:
print(df.query('Age > 30'))

         Name  Age Department  Salary
emp3  Charlie   35    Finance   90000
emp5      Eve   32         IT   85000


In [41]:
print(df.query('Age > 30 and Department == "IT"'))

     Name  Age Department  Salary
emp5  Eve   32         IT   85000


In [42]:
data_with_na = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, np.nan],
    'C': ['x', 'y', None, 'w', 'z']
}
df_na = pd.DataFrame(data_with_na)

In [43]:
df_na

Unnamed: 0,A,B,C
0,1.0,,x
1,2.0,2.0,y
2,,3.0,
3,4.0,4.0,w
4,5.0,,z


In [44]:
df_na.isnull().sum()

A    1
B    2
C    1
dtype: int64

In [45]:
# Check for missing values
print(df_na.isnull())        # Boolean mask of missing values
print(df_na.notnull())       # Boolean mask of non-missing values
print(df_na.isnull().sum())  # Count of missing values per column


       A      B      C
0  False   True  False
1  False  False  False
2   True  False   True
3  False  False  False
4  False   True  False
       A      B      C
0   True  False   True
1   True   True   True
2  False   True  False
3   True   True   True
4   True  False   True
A    1
B    2
C    1
dtype: int64


In [46]:
# Drop missing values
print(df_na.dropna())                    # Drop rows with any NaN

     A    B  C
1  2.0  2.0  y
3  4.0  4.0  w


In [47]:
print(df_na.dropna(axis=1))              # Drop columns with any NaN

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [48]:
print(df_na.dropna(how='all'))           # Drop rows where all values are NaN

     A    B     C
0  1.0  NaN     x
1  2.0  2.0     y
2  NaN  3.0  None
3  4.0  4.0     w
4  5.0  NaN     z


In [49]:
print(df_na.dropna(thresh=2))            # Drop rows with less than 2 non-NaN values

     A    B  C
0  1.0  NaN  x
1  2.0  2.0  y
3  4.0  4.0  w
4  5.0  NaN  z


In [50]:
# Fill missing values
print(df_na.fillna(0))                   # Fill with constant value

     A    B  C
0  1.0  0.0  x
1  2.0  2.0  y
2  0.0  3.0  0
3  4.0  4.0  w
4  5.0  0.0  z


In [51]:
print(df_na.fillna(method='ffill'))      # Forward fill

     A    B  C
0  1.0  NaN  x
1  2.0  2.0  y
2  2.0  3.0  y
3  4.0  4.0  w
4  5.0  4.0  z


  print(df_na.fillna(method='ffill'))      # Forward fill


In [52]:
print(df_na.fillna(method='bfill'))      # Backward fill

     A    B  C
0  1.0  2.0  x
1  2.0  2.0  y
2  4.0  3.0  w
3  4.0  4.0  w
4  5.0  NaN  z


  print(df_na.fillna(method='bfill'))      # Backward fill


In [54]:
print(df_na["A"].fillna(df_na["A"].mean()))        # Fill with mean (numeric columns only

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: A, dtype: float64


In [55]:
df_types = pd.DataFrame({
    'integers': ['1', '2', '3', '4'],
    'floats': ['1.1', '2.2', '3.3', '4.4'],
    'dates': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
    'categories': ['A', 'B', 'A', 'C']
})

In [56]:
df_types

Unnamed: 0,integers,floats,dates,categories
0,1,1.1,2023-01-01,A
1,2,2.2,2023-01-02,B
2,3,3.3,2023-01-03,A
3,4,4.4,2023-01-04,C


In [57]:
df_types.dtypes

integers      object
floats        object
dates         object
categories    object
dtype: object

In [58]:
# Convert data types
df_types['integers'] = df_types['integers'].astype(int)

In [59]:
df_types['integers'].dtype

dtype('int32')

In [62]:
df_types['floats']= df_types['floats'].astype(float)

In [67]:
df_types.dtypes

integers               int32
floats               float64
dates         datetime64[ns]
categories          category
dtype: object

In [64]:
df_types['dates'] = pd.to_datetime(df_types['dates'])

In [66]:
df_types['categories'] = df_types['categories'].astype('category')

In [68]:
df_mixed = pd.DataFrame({'col': ['1', '2', 'three', '4']})


In [69]:
df_mixed

Unnamed: 0,col
0,1
1,2
2,three
3,4


In [70]:
df_mixed["col"].astype(int)

ValueError: invalid literal for int() with base 10: 'three'

In [71]:
df_mixed['col_numeric'] = pd.to_numeric(df_mixed['col'], errors='coerce')  # NaN for non-numeric
df_mixed

Unnamed: 0,col,col_numeric
0,1,1.0
1,2,2.0
2,three,
3,4,4.0


In [72]:
# Create DataFrame with string data
df_str = pd.DataFrame({
    'names': ['  Alice  ', 'BOB', 'charlie', 'DIANA'],
    'emails': ['alice@email.com', 'bob@COMPANY.COM', 'charlie@email.com', 'diana@company.com']
})



In [73]:
df_str

Unnamed: 0,names,emails
0,Alice,alice@email.com
1,BOB,bob@COMPANY.COM
2,charlie,charlie@email.com
3,DIANA,diana@company.com


In [74]:
# String methods (vectorized operations)
print(df_str['names'].str.lower())       # Convert to lowercase
print(df_str['names'].str.upper())       # Convert to uppercase
print(df_str['names'].str.title())       # Title case
print(df_str['names'].str.strip())       # Remove leading/trailing whitespace
print(df_str['names'].str.len())         # Length of strings

# String manipulation

print(df_str['emails'].str.split('@'))          # Split strings
print(df_str['emails'].str.split('@', expand=True))  # Split into columns

# String filtering
print(df_str[df_str['emails'].str.contains('company')])
print(df_str[df_str['names'].str.startswith('A')])
print(df_str[df_str['emails'].str.endswith('.com')])

0      alice  
1          bob
2      charlie
3        diana
Name: names, dtype: object
0      ALICE  
1          BOB
2      CHARLIE
3        DIANA
Name: names, dtype: object
0      Alice  
1          Bob
2      Charlie
3        Diana
Name: names, dtype: object
0      Alice
1        BOB
2    charlie
3      DIANA
Name: names, dtype: object
0    9
1    3
2    7
3    5
Name: names, dtype: int64
0      [alice, email.com]
1      [bob, COMPANY.COM]
2    [charlie, email.com]
3    [diana, company.com]
Name: emails, dtype: object
         0            1
0    alice    email.com
1      bob  COMPANY.COM
2  charlie    email.com
3    diana  company.com
   names             emails
3  DIANA  diana@company.com
Empty DataFrame
Columns: [names, emails]
Index: []
       names             emails
0    Alice      alice@email.com
2    charlie  charlie@email.com
3      DIANA  diana@company.com


In [75]:
df_str['names'].str.replace(' ', '_')   # Replace characters

0    __Alice__
1          BOB
2      charlie
3        DIANA
Name: names, dtype: object

In [77]:
df_mixed["col"].str.replace("three", "3")

0    1
1    2
2    3
3    4
Name: col, dtype: object

In [78]:
df_dup = pd.DataFrame({
    'A': [1, 2, 2, 3, 3, 3],
    'B': ['x', 'y', 'y', 'z', 'z', 'w'],
    'C': [10, 20, 20, 30, 30, 40]
})

In [79]:
df_dup

Unnamed: 0,A,B,C
0,1,x,10
1,2,y,20
2,2,y,20
3,3,z,30
4,3,z,30
5,3,w,40


In [80]:
# Check for duplicates
print(df_dup.duplicated())                    # Boolean mask


0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool


In [81]:
print(df_dup.duplicated(subset=['A', 'B']))   # Check specific columns


0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool


In [82]:
print(df_dup.duplicated(keep='last'))         # Mark first occurrence as duplicate

0    False
1     True
2    False
3     True
4    False
5    False
dtype: bool


In [83]:
# Remove duplicates
print(df_dup.drop_duplicates())               # Remove duplicate rows

   A  B   C
0  1  x  10
1  2  y  20
3  3  z  30
5  3  w  40


In [84]:
print(df_dup.drop_duplicates(subset=['A']))   # Based on specific columns

   A  B   C
0  1  x  10
1  2  y  20
3  3  z  30


In [85]:
print(df_dup.drop_duplicates(keep='last'))    # Keep last occurrence

   A  B   C
0  1  x  10
2  2  y  20
4  3  z  30
5  3  w  40


In [86]:
# Create base DataFrame
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'Salary': [70000, 80000, 90000]
})

In [87]:
df

Unnamed: 0,Name,Age,Salary
0,Alice,25,70000
1,Bob,30,80000
2,Charlie,35,90000


In [88]:
df['Bonus'] = df['Salary'] * 0.1           # Calculated column

In [89]:
df

Unnamed: 0,Name,Age,Salary,Bonus
0,Alice,25,70000,7000.0
1,Bob,30,80000,8000.0
2,Charlie,35,90000,9000.0


In [90]:
df['Department'] = 'IT'                     # Constant value

In [91]:
df

Unnamed: 0,Name,Age,Salary,Bonus,Department
0,Alice,25,70000,7000.0,IT
1,Bob,30,80000,8000.0,IT
2,Charlie,35,90000,9000.0,IT


In [92]:
df['Full_Info'] = df['Name'] + ' (' + df['Age'].astype(str) + ')'  # String concatenation

In [93]:
df

Unnamed: 0,Name,Age,Salary,Bonus,Department,Full_Info
0,Alice,25,70000,7000.0,IT,Alice (25)
1,Bob,30,80000,8000.0,IT,Bob (30)
2,Charlie,35,90000,9000.0,IT,Charlie (35)


In [94]:
# Insert column at specific position
df.insert(1, 'Employee_ID', ['E001', 'E002', 'E003'])

In [95]:
df

Unnamed: 0,Name,Employee_ID,Age,Salary,Bonus,Department,Full_Info
0,Alice,E001,25,70000,7000.0,IT,Alice (25)
1,Bob,E002,30,80000,8000.0,IT,Bob (30)
2,Charlie,E003,35,90000,9000.0,IT,Charlie (35)


In [97]:
df.drop("Department", axis = 1)

Unnamed: 0,Name,Employee_ID,Age,Salary,Bonus,Full_Info
0,Alice,E001,25,70000,7000.0,Alice (25)
1,Bob,E002,30,80000,8000.0,Bob (30)
2,Charlie,E003,35,90000,9000.0,Charlie (35)


In [98]:
df

Unnamed: 0,Name,Employee_ID,Age,Salary,Bonus,Department,Full_Info
0,Alice,E001,25,70000,7000.0,IT,Alice (25)
1,Bob,E002,30,80000,8000.0,IT,Bob (30)
2,Charlie,E003,35,90000,9000.0,IT,Charlie (35)


In [99]:
df.drop("Department", axis = 1, inplace= True)

In [100]:
df

Unnamed: 0,Name,Employee_ID,Age,Salary,Bonus,Full_Info
0,Alice,E001,25,70000,7000.0,Alice (25)
1,Bob,E002,30,80000,8000.0,Bob (30)
2,Charlie,E003,35,90000,9000.0,Charlie (35)


In [101]:
df.drop(0)

Unnamed: 0,Name,Employee_ID,Age,Salary,Bonus,Full_Info
1,Bob,E002,30,80000,8000.0,Bob (30)
2,Charlie,E003,35,90000,9000.0,Charlie (35)


In [102]:
# Rename specific columns
df_renamed = df.rename(columns={'Name': 'Employee_Name', 'Age': 'Employee_Age'})

In [103]:
df_renamed

Unnamed: 0,Employee_Name,Employee_ID,Employee_Age,Salary,Bonus,Full_Info
0,Alice,E001,25,70000,7000.0,Alice (25)
1,Bob,E002,30,80000,8000.0,Bob (30)
2,Charlie,E003,35,90000,9000.0,Charlie (35)


In [104]:
# Create sample data for sorting
df_sort = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 35, 30, 28],
    'Salary': [70000, 90000, 80000, 75000],
    'Department': ['HR', 'IT', 'Finance', 'HR']
})

In [105]:
df_sort

Unnamed: 0,Name,Age,Salary,Department
0,Alice,25,70000,HR
1,Bob,35,90000,IT
2,Charlie,30,80000,Finance
3,Diana,28,75000,HR


In [None]:
# Sort by single column
print(df_sort.sort_values('Age'))                    # Ascending
print(df_sort.sort_values('Age', ascending=False))   # Descending