**1. Pandas Basics**

In [1]:
# importing pandas library
import pandas as pd

# Sample DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, 35, 40, 29],
    'Gender': ['F', 'M', 'M', 'M', 'F'],
    'Score': [85, 90, None, 88, 76]
}

df = pd.DataFrame(data)

In [2]:
# read_csv (simulated with df.to_csv and read it again)

df.to_csv("sample.csv")
df_read = pd.read_csv("sample.csv")
df_read.head()

Unnamed: 0.1,Unnamed: 0,Name,Age,Gender,Score
0,0,Alice,25,F,85.0
1,1,Bob,30,M,90.0
2,2,Charlie,35,M,
3,3,David,40,M,88.0
4,4,Eve,29,F,76.0


In [3]:
# Using a new Larger dataset (csv file)
df = pd.read_csv('complex_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'complex_data.csv'

In [None]:
# head and tail

print(df.head())
print(df.tail())

In [None]:
# info()

df.info()

In [None]:
# describe()

df.describe()

In [None]:
# rename()

df_renamed = df.rename(columns={"Score": "Final_Score", 
                                "Age": "Fake_Age"})
df_renamed.head()

In [None]:
# loc (label-based selection)

df.loc[0:2, ['Score', 'Name']]

In [4]:
# iloc (index-based selection)

df.iloc[0:3, 0:2]

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [5]:
# unique and nunique

print(df['Gender'].unique())
print(df['Gender'].nunique())

['F' 'M']
2


**2. Data Preprocessing & Cleaning**

In [8]:
# isna / isnull
# df.isna()
df.isnull().sum()

Name      0
Age       0
Gender    0
Score     1
dtype: int64

In [9]:
# dropna
df_dropped = df.dropna()
df_dropped

Unnamed: 0,Name,Age,Gender,Score
0,Alice,25,F,85.0
1,Bob,30,M,90.0
3,David,40,M,88.0
4,Eve,29,F,76.0


In [56]:
# fillna
df_filled = df.fillna(df['Score'].mean())
df_filled

Unnamed: 0,Name,Age,Gender,Score,City,Occupation,EnrollmentDate,CompletedCourses,IsStudent,GPA,AnnualIncome_USD
0,Alice,25,F,85.0,New York,Engineer,2020-01-15,3,False,3.5,70000
1,Bob,30,M,90.0,London,Designer,2018-05-20,5,False,3.8,85000
2,Charlie,35,M,84.625,Paris,Doctor,2021-03-10,4,False,3.2,120000
3,David,40,M,88.0,New York,Manager,2017-11-01,7,False,3.9,100000
4,Eve,29,F,76.0,Berlin,Analyst,2019-07-22,2,False,3.0,65000
5,Frank,22,M,92.0,London,Student,2023-09-05,1,True,3.7,15000
6,Grace,31,F,81.0,Paris,Artist,2022-02-18,3,False,3.1,50000
7,Heidi,28,F,84.625,Tokyo,84.625,2016-04-30,6,False,84.625,90000
8,Ivan,45,M,95.0,Berlin,Engineer,2020-10-12,8,False,4.0,110000
9,Julia,27,F,70.0,New York,Developer,2021-06-25,2,False,2.9,75000


**3. IQR**

In [59]:
import numpy as np

csv_file_name = 'complex_data.csv'
df = pd.read_csv(csv_file_name)

column_to_analyze = 'AnnualIncome_USD'

Q1 = df[column_to_analyze].quantile(0.25)
Q3 = df[column_to_analyze].quantile(0.75)

iqr = Q3 - Q1

upper_limit = Q3 + 1.5 * iqr
lower_limit = Q1 - 1.5 * iqr

outliers_df = df[(df[column_to_analyze] < lower_limit) | (df[column_to_analyze] > upper_limit)]

print(f"Outliers in '{column_to_analyze}':")
if not outliers_df.empty:
    print(outliers_df)
else:
    print("No outliers found in this column based on the IQR method.")

Outliers in 'AnnualIncome_USD':
    Name  Age Gender  Score    City Occupation EnrollmentDate  \
5  Frank   22      M   92.0  London    Student     2023-09-05   

   CompletedCourses  IsStudent  GPA  AnnualIncome_USD  
5                 1       True  3.7             15000  
