In [1]:
import pandas as pd

In [2]:
#creating series
s = pd.Series([1,3,5,7,9])
print(s)
# to give own indexing
s = pd.Series(s, index=['a','b','c','d','e'])

0    1
1    3
2    5
3    7
4    9
dtype: int64


In [3]:
# dataframe 
data = {
    "Name" : ["Alice", "Bob", "Charlie", "David"],
    "Age" : [24, 27, 22, 32],
    "City" : ["New York", "Los Angeles", "Chicago", "Houston"],
    "Maths" : [85, 90, 78, 92],
    "Science" : [88, 92, 80, 95],
    "English" : [90, 85, 88, 91]
}

df = pd.DataFrame(data)
print(df)
# to give own indexing
df = pd.DataFrame(data, index=['a', 'b', 'c', 'd'])
print(df)

      Name  Age         City  Maths  Science  English
0    Alice   24     New York     85       88       90
1      Bob   27  Los Angeles     90       92       85
2  Charlie   22      Chicago     78       80       88
3    David   32      Houston     92       95       91
      Name  Age         City  Maths  Science  English
a    Alice   24     New York     85       88       90
b      Bob   27  Los Angeles     90       92       85
c  Charlie   22      Chicago     78       80       88
d    David   32      Houston     92       95       91


In [4]:
# to know basic info about dataframe
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     4 non-null      object
 1   Age      4 non-null      int64 
 2   City     4 non-null      object
 3   Maths    4 non-null      int64 
 4   Science  4 non-null      int64 
 5   English  4 non-null      int64 
dtypes: int64(4), object(2)
memory usage: 224.0+ bytes
None
             Age      Maths  Science    English
count   4.000000   4.000000     4.00   4.000000
mean   26.250000  86.250000    88.75  88.500000
std     4.349329   6.238322     6.50   2.645751
min    22.000000  78.000000    80.00  85.000000
25%    23.500000  83.250000    86.00  87.250000
50%    25.500000  87.500000    90.00  89.000000
75%    28.250000  90.500000    92.75  90.250000
max    32.000000  92.000000    95.00  91.000000


In [5]:
df[["Age", "Maths"]].head(2)

Unnamed: 0,Age,Maths
a,24,85
b,27,90


In [6]:
# Feature Engineering 
# Creating a new column 'Total' which is the sum of 'Maths', 'Science', and 'English'
df['Total'] = df['Maths'] + df['Science'] + df['English']
print(df)
# Adding a percentage column
df["Percentage"] = ((df['Maths'] + df['Science'] + df['English']) / 300) * 100
print(df)

      Name  Age         City  Maths  Science  English  Total
a    Alice   24     New York     85       88       90    263
b      Bob   27  Los Angeles     90       92       85    267
c  Charlie   22      Chicago     78       80       88    246
d    David   32      Houston     92       95       91    278
      Name  Age         City  Maths  Science  English  Total  Percentage
a    Alice   24     New York     85       88       90    263   87.666667
b      Bob   27  Los Angeles     90       92       85    267   89.000000
c  Charlie   22      Chicago     78       80       88    246   82.000000
d    David   32      Houston     92       95       91    278   92.666667


In [7]:

employees = pd.DataFrame({
    "Name": ["John", "Sarah", "Michael", "Emily", "David", "Jessica", "Daniel", "Lisa", "Matthew", "Amanda"],
    "Age": [28, 35, 42, 29, 38, 31, 45, 27, 33, 40],
    "Department": ["IT", "HR", "Finance", "IT", "Marketing", "HR", "Finance", "Marketing", "IT", "Finance"],
    "Salary": [65000, 72000, 85000, 68000, 75000, 70000, 90000, 62000, 78000, 88000],
    "Experience": [3, 8, 15, 4, 10, 6, 18, 2, 7, 12]
})
print(employees)

      Name  Age Department  Salary  Experience
0     John   28         IT   65000           3
1    Sarah   35         HR   72000           8
2  Michael   42    Finance   85000          15
3    Emily   29         IT   68000           4
4    David   38  Marketing   75000          10
5  Jessica   31         HR   70000           6
6   Daniel   45    Finance   90000          18
7     Lisa   27  Marketing   62000           2
8  Matthew   33         IT   78000           7
9   Amanda   40    Finance   88000          12


In [8]:
# loc[]: label based indexing
print(employees.loc[2:6, ["Name", "Age", "Department"] ])

      Name  Age Department
2  Michael   42    Finance
3    Emily   29         IT
4    David   38  Marketing
5  Jessica   31         HR
6   Daniel   45    Finance


In [9]:
# iloc[] : integer based indexing
print(employees.iloc[:2, :4])

    Name  Age Department  Salary
0   John   28         IT   65000
1  Sarah   35         HR   72000


In [10]:
# Boolean based indexing
condition = employees["Age"] > 30
print(employees[condition])

      Name  Age Department  Salary  Experience
1    Sarah   35         HR   72000           8
2  Michael   42    Finance   85000          15
4    David   38  Marketing   75000          10
5  Jessica   31         HR   70000           6
6   Daniel   45    Finance   90000          18
8  Matthew   33         IT   78000           7
9   Amanda   40    Finance   88000          12


In [11]:
# query based indexing
print(employees.query("Age > 40 and Salary>70000"))

      Name  Age Department  Salary  Experience
2  Michael   42    Finance   85000          15
6   Daniel   45    Finance   90000          18


In [12]:
# Creating a dataframe with missing data
missing_data_df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
    "Age": [25, None, 30, 28, None, 35],
    "Salary": [50000, 60000, None, 55000, 62000, None],
    "Department": ["IT", "HR", None, "Finance", "IT", "Marketing"],
    "Experience": [2, 5, 7, None, 4, 10]
})
print("\nMissing values count:")
print(missing_data_df.isnull().sum())


Missing values count:
Name          0
Age           2
Salary        2
Department    1
Experience    1
dtype: int64


In [13]:
# deleting missing values
# this is to drop columns with any missing values
cleaned_df = missing_data_df.dropna(axis=1) 
 # this is to drop rows with any missing values
cleaned_df = missing_data_df.dropna()
# this is to change actual dataframe
missing_data_df.dropna(inplace=True)
print("\nDataFrame after dropping rows with missing values:")
print(cleaned_df)


DataFrame after dropping rows with missing values:
    Name   Age   Salary Department  Experience
0  Alice  25.0  50000.0         IT         2.0


In [14]:
# filling the value
# filled_value =missing_data_df.fillna(0)
value = missing_data_df["Age"].mean() # we can use mode(), median() also
print(value)
missing_data_df["Age1"] = missing_data_df["Age"].fillna(value)
print(missing_data_df)
# print(filled_value)


25.0
    Name   Age   Salary Department  Experience  Age1
0  Alice  25.0  50000.0         IT         2.0  25.0


In [15]:
# forward fill and backward fill. we generally use both forward and backward fill together to fill missing values as some may not be filled by only one method.
forward_filled_df = missing_data_df.ffill() # this fills the missing value with previous value
backward_filled_df = missing_data_df.bfill() # this fills the missing value with next value

In [16]:
# to remove duplicate values
data_with_duplicates = pd.DataFrame({
    "Name": ["Alice", "Bob", "Charlie", "Alice", "Eve", "Bob"],
    "Age": [25, 30, 30, 25, 28, 30],
    "Salary": [50000, 60000, 60000, 50000, 62000, 60000]
})
print("\nDataFrame with duplicates:")
print(data_with_duplicates)
df_no_duplicates = data_with_duplicates.drop_duplicates()
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)


DataFrame with duplicates:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   30   60000
3    Alice   25   50000
4      Eve   28   62000
5      Bob   30   60000

DataFrame after removing duplicates:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   30   60000
4      Eve   28   62000


In [17]:

# Create a DataFrame with duplicated values for practice
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David'],
    'Age': [25, 30, 35, 25, 30, 40],
    'City': ['NYC', 'LA', 'Chicago', 'NYC', 'LA', 'Houston']
}

duplicated_data = pd.DataFrame(data)
print(duplicated_data)

      Name  Age     City
0    Alice   25      NYC
1      Bob   30       LA
2  Charlie   35  Chicago
3    Alice   25      NYC
4      Bob   30       LA
5    David   40  Houston


In [18]:
# to check if there are any duplicates
print(duplicated_data.duplicated().sum())

# to check duplicated data in column 
print(duplicated_data[["Name", "City"]].duplicated())

2
0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool


In [19]:
# to delete duplicated data
new_duplicated_data =duplicated_data.drop_duplicates()
new_duplicated_data.duplicated()

0    False
1    False
2    False
5    False
dtype: bool

In [20]:
# to remove outliner we use IQR method
# Create a DataFrame with outliers
outlier_df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Age': [25, 30, 35, 40, 45, 50, 200],  # 200 is an outlier
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 5000000]  # 5000000 is an outlier
})

# Calculate Q1, Q3, and IQR for Age and Salary
Q1_age = outlier_df['Age'].quantile(0.25)
Q3_age = outlier_df['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age

Q1_salary = outlier_df['Salary'].quantile(0.25)
Q3_salary = outlier_df['Salary'].quantile(0.75)
IQR_salary = Q3_salary - Q1_salary

# Define bounds
lower_bound_age = Q1_age - 1.5 * IQR_age
upper_bound_age = Q3_age + 1.5 * IQR_age

lower_bound_salary = Q1_salary - 1.5 * IQR_salary
upper_bound_salary = Q3_salary + 1.5 * IQR_salary

# Filter out outliers
outlier_df_cleaned = outlier_df[
    (outlier_df['Age'] >= lower_bound_age) & (outlier_df['Age'] <= upper_bound_age) &
    (outlier_df['Salary'] >= lower_bound_salary) & (outlier_df['Salary'] <= upper_bound_salary)
]

print(outlier_df_cleaned)

      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000
3    David   40   80000
4      Eve   45   90000
5    Frank   50  100000


In [21]:
# Create a DataFrame for testing data aggregation and grouping
agg_df = pd.DataFrame({
    'Department': ['HR', 'IT', 'HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
    'Employee': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry'],
    'Salary': [50000, 60000, 55000, 70000, 65000, 75000, 52000, 68000],
    'Years': [2, 3, 4, 5, 2, 6, 3, 4]
})
print(agg_df)

  Department Employee  Salary  Years
0         HR    Alice   50000      2
1         IT      Bob   60000      3
2         HR  Charlie   55000      4
3    Finance    David   70000      5
4         IT      Eve   65000      2
5    Finance    Frank   75000      6
6         HR    Grace   52000      3
7         IT    Henry   68000      4


In [22]:
# groupby
department = agg_df.groupby("Department")["Salary"].sum().reset_index()
department
# groupby multiple columns
dept_years = agg_df.groupby(["Department", "Years"])["Salary"].mean().reset_index()
dept_years


Unnamed: 0,Department,Years,Salary
0,Finance,5,70000.0
1,Finance,6,75000.0
2,HR,2,50000.0
3,HR,3,52000.0
4,HR,4,55000.0
5,IT,2,65000.0
6,IT,3,60000.0
7,IT,4,68000.0


In [25]:
# aggegration
agg_value = agg_df.groupby("Department")["Salary"].agg(["sum", "mean", "max", "min"])

#different aggregations for different columns
agg_value1 = agg_df.groupby("Department").agg({
    "Salary" : ["mean", "median", "max"],
    "Years" : "max"
})
agg_value1

Unnamed: 0_level_0,Salary,Salary,Salary,Years
Unnamed: 0_level_1,mean,median,max,max
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Finance,72500.0,72500.0,75000,6
HR,52333.333333,52000.0,55000,4
IT,64333.333333,65000.0,68000,4


In [None]:
# pivot table
pivot_table = agg_df.pivot_table(
    values = ["Salary"], 
    columns=["Department"],
    index= "Employee",
    aggfunc="sum",
    fill_value=0
)
pivot_table

Unnamed: 0_level_0,Salary,Salary,Salary
Department,Finance,HR,IT
Employee,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Alice,0,50000,0
Bob,0,0,60000
Charlie,0,55000,0
David,70000,0,0
Eve,0,0,65000
Frank,75000,0,0
Grace,0,52000,0
Henry,0,0,68000
