In [None]:
import pandas as pd

In [None]:
# Create a Series; with no index given, pandas labels the values 0..4.
s = pd.Series([1, 3, 5, 7, 9])
print(s)
# Give the Series custom labels.
# Passing the existing Series straight to pd.Series(s, index=[...]) would
# REINDEX it: pandas would look up 'a'..'e' in the current 0..4 index, find
# nothing, and return all NaN. Passing the raw values avoids that alignment,
# so the numbers are kept and only the labels change.
s = pd.Series(s.to_numpy(), index=['a', 'b', 'c', 'd', 'e'])

In [None]:
# Build a DataFrame from a mapping of column name -> list of row values.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David"],
    Age=[24, 27, 22, 32],
    City=["New York", "Los Angeles", "Chicago", "Houston"],
    Maths=[85, 90, 78, 92],
    Science=[88, 92, 80, 95],
    English=[90, 85, 88, 91],
)

# Without an explicit index the rows are labelled 0..3.
df = pd.DataFrame(data)
print(df)

# Supplying `index` labels the rows 'a'..'d' instead.
df = pd.DataFrame(data, index=list("abcd"))
print(df)

In [None]:
# Basic information about the DataFrame.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
df.info()
# describe() returns a DataFrame of summary statistics, which is printed here.
print(df.describe())

In [None]:
# Select the 'Age' and 'Maths' columns by name and show only the first two rows.
df.loc[:, ["Age", "Maths"]].head(2)

In [None]:
# Feature engineering: derive new columns from the existing marks.
# 'Total' is the element-wise sum of the three subject columns.
df['Total'] = df['Maths'].add(df['Science']).add(df['English'])
print(df)
# 'Percentage' is that same sum expressed out of 300
# (presumably 100 marks per subject - confirm against the data source).
df["Percentage"] = (df['Total'] / 300) * 100
print(df)

In [None]:

# Sample employee table, written one row per record:
# (Name, Age, Department, Salary, Experience).
employees = pd.DataFrame(
    [
        ("John", 28, "IT", 65000, 3),
        ("Sarah", 35, "HR", 72000, 8),
        ("Michael", 42, "Finance", 85000, 15),
        ("Emily", 29, "IT", 68000, 4),
        ("David", 38, "Marketing", 75000, 10),
        ("Jessica", 31, "HR", 70000, 6),
        ("Daniel", 45, "Finance", 90000, 18),
        ("Lisa", 27, "Marketing", 62000, 2),
        ("Matthew", 33, "IT", 78000, 7),
        ("Amanda", 40, "Finance", 88000, 12),
    ],
    columns=["Name", "Age", "Department", "Salary", "Experience"],
)
print(employees)

In [None]:
# loc[] indexes by LABEL. A label slice includes both endpoints, so 2:6
# returns the rows labelled 2, 3, 4, 5 and 6.
print(employees[["Name", "Age", "Department"]].loc[2:6])

In [None]:
# iloc[] indexes by integer POSITION. The end of a positional slice is
# excluded, so this is the first 2 rows and the first 4 columns.
print(employees.iloc[0:2, 0:4])

In [None]:
# Boolean indexing: build a True/False mask with one entry per row,
# then keep only the rows where the mask is True.
condition = employees["Age"].gt(30)
print(employees.loc[condition])

In [None]:
# query() filters rows with a string expression written in terms of column
# names; both conditions must hold for a row to be kept.
print(
    employees.query("Age > 40 and Salary>70000")
)

In [None]:
# Sample frame with gaps: None entries become missing values (NaN/None).
missing_data_df = pd.DataFrame(
    dict(
        Name=["Alice", "Bob", "Charlie", "David", "Eve", "Frank"],
        Age=[25, None, 30, 28, None, 35],
        Salary=[50000, 60000, None, 55000, 62000, None],
        Department=["IT", "HR", None, "Finance", "IT", "Marketing"],
        Experience=[2, 5, 7, None, 4, 10],
    )
)
# isna() marks each missing cell as True; summing counts them per column.
print("\nMissing values count:")
print(missing_data_df.isna().sum())

In [None]:
# Deleting missing values with dropna(). Each result is stored under its own
# name and `missing_data_df` itself is left untouched, so its NaNs are still
# there for the fill examples in the cells that follow.

# axis=1 drops every COLUMN that contains at least one missing value.
cleaned_columns_df = missing_data_df.dropna(axis=1)

# The default (axis=0) drops every ROW that contains at least one missing value.
cleaned_df = missing_data_df.dropna()

# inplace=True modifies the frame it is called on and returns None.
# It is demonstrated on a copy: running it on `missing_data_df` directly would
# remove every row with a gap and leave the later fillna/ffill/bfill cells with
# nothing to fill.
dropped_in_place_df = missing_data_df.copy()
dropped_in_place_df.dropna(inplace=True)

print("\nDataFrame after dropping rows with missing values:")
print(cleaned_df)

In [None]:
# Filling missing values instead of dropping them.
# The simplest option is a constant, e.g. missing_data_df.fillna(0).
# Here the mean of the column is used as the fill value; median() or mode()
# are alternatives.
value = missing_data_df["Age"].mean()
print(value)
# Write the filled values to a new 'Age1' column so 'Age' itself is unchanged.
missing_data_df["Age1"] = missing_data_df["Age"].fillna(value=value)
print(missing_data_df)


In [None]:
# Forward fill and backward fill.
# ffill() copies the previous value down into each gap.
# bfill() copies the next value up into each gap.
# One direction alone can leave gaps unfilled, so the two are generally
# used together.
forward_filled_df = missing_data_df.ffill()
backward_filled_df = missing_data_df.bfill()

In [None]:
# Removing duplicate rows.
data_with_duplicates = pd.DataFrame(
    dict(
        Name=["Alice", "Bob", "Charlie", "Alice", "Eve", "Bob"],
        Age=[25, 30, 30, 25, 28, 30],
        Salary=[50000, 60000, 60000, 50000, 62000, 60000],
    )
)
print("\nDataFrame with duplicates:")
print(data_with_duplicates)

# duplicated() flags every row that repeats an earlier row in full;
# keeping the rows that are NOT flagged leaves the first occurrence of each.
df_no_duplicates = data_with_duplicates[~data_with_duplicates.duplicated()]
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)

In [None]:

# Practice frame containing repeated rows.
# NOTE: this rebinds `data`, which an earlier cell also uses for the student
# marks dict; re-running that earlier DataFrame cell after this one would pick
# up these values instead.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'David'],
    Age=[25, 30, 35, 25, 30, 40],
    City=['NYC', 'LA', 'Chicago', 'NYC', 'LA', 'Houston'],
)

duplicated_data = pd.DataFrame(data)
print(duplicated_data)

In [None]:
# Count the rows that repeat an earlier row across all columns.
print(duplicated_data.duplicated().sum())

# Flag duplicates while comparing only the 'Name' and 'City' columns.
print(duplicated_data.duplicated(subset=["Name", "City"]))

In [None]:
# Drop the repeated rows, keeping the first occurrence of each.
new_duplicated_data = duplicated_data.drop_duplicates(keep="first")
# Re-check the result: every entry is False once the duplicates are gone.
new_duplicated_data.duplicated()

In [None]:
# Removing outliers with the IQR (interquartile range) method.


def _iqr_bounds(column):
    """Return (q1, q3, iqr, lower, upper) for a Series using the 1.5 * IQR rule.

    q1 and q3 are the 25th and 75th percentiles, iqr is their difference, and
    the bounds lie 1.5 * iqr below q1 and above q3.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    return q1, q3, iqr, q1 - 1.5 * iqr, q3 + 1.5 * iqr


# Sample frame whose last row is an outlier in both 'Age' (200) and
# 'Salary' (5000000).
outlier_df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace'],
    'Age': [25, 30, 35, 40, 45, 50, 200],
    'Salary': [50000, 60000, 70000, 80000, 90000, 100000, 5000000],
})

# Quartiles, IQR and acceptance bounds for each numeric column.
Q1_age, Q3_age, IQR_age, lower_bound_age, upper_bound_age = _iqr_bounds(
    outlier_df['Age']
)
Q1_salary, Q3_salary, IQR_salary, lower_bound_salary, upper_bound_salary = _iqr_bounds(
    outlier_df['Salary']
)

# Keep only rows whose Age AND Salary both fall inside their bounds
# (between() includes both endpoints).
outlier_df_cleaned = outlier_df[
    outlier_df['Age'].between(lower_bound_age, upper_bound_age)
    & outlier_df['Salary'].between(lower_bound_salary, upper_bound_salary)
]

print(outlier_df_cleaned)

In [None]:
# Sample frame for the grouping and aggregation examples:
# one row per employee with their department, salary and years.
agg_df = pd.DataFrame(
    dict(
        Department=['HR', 'IT', 'HR', 'Finance', 'IT', 'Finance', 'HR', 'IT'],
        Employee=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Frank', 'Grace', 'Henry'],
        Salary=[50000, 60000, 55000, 70000, 65000, 75000, 52000, 68000],
        Years=[2, 3, 4, 5, 2, 6, 3, 4],
    )
)
print(agg_df)

In [None]:
# groupby on a single column: total salary per department.
# reset_index() turns the group labels back into an ordinary column.
department = agg_df.groupby("Department")["Salary"].sum().reset_index()
# A notebook cell only auto-displays its LAST expression, so a bare
# `department` in the middle of the cell would be silently discarded.
# Print it explicitly so both results are visible.
print(department)
# groupby on multiple columns: mean salary per (Department, Years) combination.
dept_years = agg_df.groupby(["Department", "Years"])["Salary"].mean().reset_index()
dept_years


In [None]:
# Aggregation: several statistics of the 'Salary' column per department.
agg_value = (
    agg_df
    .groupby("Department")["Salary"]
    .agg(["sum", "mean", "max", "min"])
)

# Different aggregations for different columns: pass a mapping of
# column name -> aggregation name (or list of names).
agg_value1 = agg_df.groupby("Department").agg(
    dict(Salary=["mean", "median", "max"], Years="max")
)
agg_value1

In [None]:
# Pivot table: one row per employee, one column per department, each cell
# holding the summed salary. Combinations with no data are filled with 0.
pivot_table = pd.pivot_table(
    agg_df,
    index="Employee",
    columns=["Department"],
    values=["Salary"],
    aggfunc="sum",
    fill_value=0,
)
pivot_table

In [None]:
# Merging and joining DataFrames.
df1 = pd.DataFrame(dict(
    EmployeeID=[1, 2, 3, 4],
    Name=['Alice', 'Bob', 'Charlie', 'David'],
))
df2 = pd.DataFrame(dict(
    Employee=[3, 4, 5, 6],
    Department=['HR', 'IT', 'Finance', 'Marketing'],
))

# The key column is named differently in each frame ('EmployeeID' vs
# 'Employee'), so it is given explicitly with left_on / right_on. A plain
# merge(df1, df2, how=...) only works when both frames share a column name.

# Inner join: only the rows whose key appears in BOTH frames.
innerdf = df1.merge(df2, how="inner", left_on="EmployeeID", right_on="Employee")
print(innerdf)

# Outer join: every row from both frames, with NaN wherever one side has
# no match.
outer_df = df1.merge(df2, how="outer", left_on="EmployeeID", right_on="Employee")
print(outer_df)

In [43]:
# Left and right joins.
# Left join: every row of the left frame (df1), plus the matching rows of the
# right frame; unmatched right-hand columns are NaN.
left_df = df1.merge(df2, left_on="EmployeeID", right_on="Employee", how="left")
print(left_df)
# Right join: every row of the right frame (df2), plus the matching rows of
# the left frame; unmatched left-hand columns are NaN.
right_df = df1.merge(df2, left_on="EmployeeID", right_on="Employee", how="right")
print(right_df)

   EmployeeID     Name  Employee Department
0           1    Alice       NaN        NaN
1           2      Bob       NaN        NaN
2           3  Charlie       3.0         HR
3           4    David       4.0         IT
   EmployeeID     Name  Employee Department
0         3.0  Charlie         3         HR
1         4.0    David         4         IT
2         NaN      NaN         5    Finance
3         NaN      NaN         6  Marketing


In [48]:
# Concatenation.
df3 = pd.DataFrame(dict(
    EmployeeID=[5, 6],
    Name=['Eve', 'Frank'],
))
# Vertical (row-wise) concatenation; ignore_index=True renumbers the rows
# 0..n-1 instead of keeping each frame's original labels.
combined_df = pd.concat([df1, df3], ignore_index=True)
print(combined_df)
# Horizontal (column-wise) concatenation: frames are placed side by side and
# rows are paired by index label, not by any key column.
horizontal_df = pd.concat([df1, df2], axis="columns")
print(horizontal_df)

   EmployeeID     Name
0           1    Alice
1           2      Bob
2           3  Charlie
3           4    David
4           5      Eve
5           6    Frank
   EmployeeID     Name  Employee Department
0           1    Alice         3         HR
1           2      Bob         4         IT
2           3  Charlie         5    Finance
3           4    David         6  Marketing
