In [44]:
import pandas as pd
import numpy as np

In [45]:
# DataFrame
# Raw column-oriented records: one key per column, one list entry per row.
# np.nan marks the missing Age (David) and the missing Salary (Eve); the last
# row repeats the first one exactly.
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Alice"],
    "Age": [25, 30, 35, np.nan, 29, 25],
    "Department": ["HR", "IT", "Finance", "IT", "HR", "HR"],
    "Salary": [50000, 60000, 70000, 62000, np.nan, 50000],
}

In [46]:
# Echo the raw dict to inspect it before it is turned into a DataFrame.
data

{'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Alice'],
 'Age': [25, 30, 35, nan, 29, 25],
 'Department': ['HR', 'IT', 'Finance', 'IT', 'HR', 'HR'],
 'Salary': [50000, 60000, 70000, 62000, nan, 50000]}

In [47]:
# Build a DataFrame from the dict: each key becomes a column and each list
# supplies that column's values; rows get a default 0..n-1 integer index.
df = pd.DataFrame(data)
# A bare name on the last line displays the frame as the cell output.
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [48]:
# Preview the first 2 rows.
df.head(2)

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [49]:
# Preview the last 2 rows.
df.tail(2)

Unnamed: 0,Name,Age,Department,Salary
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [50]:
# loc & iloc
# iloc selects by integer position and excludes the end of a slice:
# rows at positions 1 and 2, and the first two columns (Name, Age).

df.iloc[1:3, :2]

Unnamed: 0,Name,Age
1,Bob,30.0
2,Charlie,35.0


In [51]:
# loc selects by label and includes the end of a slice: index labels 1 through 3,
# restricted to the two named columns.
df.loc[1:3, ['Age','Department']]

Unnamed: 0,Age,Department
1,30.0,IT
2,35.0,Finance
3,,IT


In [52]:
# Return a copy of the frame without the 'Age' column.
# The result is only displayed, not assigned, so df itself keeps the column.
df.drop(columns="Age")

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000.0
1,Bob,IT,60000.0
2,Charlie,Finance,70000.0
3,David,IT,62000.0
4,Eve,HR,
5,Alice,HR,50000.0


In [53]:
# Display df again: 'Age' is still present because the preceding drop() call
# returned a new frame that was never assigned back.
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,David,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [54]:
# Select a subset of columns by passing a list of column names;
# the result is a DataFrame containing just those columns.
df[['Age','Department']]

Unnamed: 0,Age,Department
0,25.0,HR
1,30.0,IT
2,35.0,Finance
3,,IT
4,29.0,HR
5,25.0,HR


In [55]:
# Dimensions as a (number of rows, number of columns) tuple.
df.shape

(6, 4)

In [56]:
# Print a structural summary: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        6 non-null      object 
 1   Age         5 non-null      float64
 2   Department  6 non-null      object 
 3   Salary      5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 324.0+ bytes


In [57]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns only; missing values are left out of the count.
df.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,28.8,58400.0
std,4.147288,8532.291603
min,25.0,50000.0
25%,25.0,50000.0
50%,29.0,60000.0
75%,30.0,62000.0
max,35.0,70000.0


In [58]:
# Broadcasting
# The scalar 5000 is added to every element of the Salary column; NaN stays NaN.
# NOTE: this reads and overwrites the same column, so the cell is not
# idempotent -- every re-run adds another 5000.

df['Salary'] = df['Salary'] + 5000

In [59]:
# Inspect the Salary column (a Series) after the +5000 update.
df['Salary']

0    55000.0
1    65000.0
2    75000.0
3    67000.0
4        NaN
5    55000.0
Name: Salary, dtype: float64

In [60]:
# Renaming Columns
# Rename 'Department' to 'Dept'. rename() returns a new frame, which is bound
# back to df instead of using inplace=True: the previously displayed frame
# object is not mutated, and the cell follows the same reassign style as the
# later `df = df.drop_duplicates()` cell. Re-running is harmless because
# labels missing from the frame are ignored by default.

df = df.rename(columns={'Department': 'Dept'})

In [61]:
# Unique values
# Distinct values of the Dept column in order of first appearance,
# returned as a NumPy array.

df['Dept'].unique()

array(['HR', 'IT', 'Finance'], dtype=object)

In [62]:
# unique() does not drop missing values: NaN shows up as its own entry.
df['Salary'].unique()

array([55000., 65000., 75000., 67000.,    nan])

In [63]:
# Value counts
# Number of rows per distinct Dept value, sorted from most to least frequent.

df['Dept'].value_counts()

Dept
HR         3
IT         2
Finance    1
Name: count, dtype: int64

In [64]:
# Derive a new column element-wise from an existing one (Salary * 10);
# a missing Salary yields a missing Promoted Salary.
# NOTE(review): the factor 10 looks deliberately exaggerated so that a later
# cell can treat large values as invalid -- confirm the intent.
df['Promoted Salary'] = df['Salary'] * 10

In [65]:
# Display the frame, now including the 'Promoted Salary' column.
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [66]:
# Data cleaning
# Count missing values per column: isnull() gives a boolean mask of the same
# shape as df, and sum() counts the True entries in each column.

df.isnull().sum()

Name               0
Age                1
Dept               0
Salary             1
Promoted Salary    1
dtype: int64

In [67]:
# Same check under its other name: pandas documents isnull() as an alias of
# isna(), hence the identical counts.
df.isna().sum()

Name               0
Age                1
Dept               0
Salary             1
Promoted Salary    1
dtype: int64

In [75]:
# Drop null values
# how='all' removes only rows in which every column is missing; no row
# qualifies here, so all rows are returned.
# The result is displayed but not assigned, so df itself is unchanged.

df.dropna(how='all')

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [69]:
# Default how='any': drop every row that has at least one missing value
# (index 3 and 4 here). Returns a new frame; df is not modified.
df.dropna()

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
5,Alice,25.0,HR,55000.0,550000.0


In [76]:
# Fill missing ages with the column mean rounded to the nearest whole number
# (mean() skips NaN). fillna() returns a new Series; the result is not
# assigned, so df['Age'] still contains the NaN afterwards.
df['Age'].fillna(round(df['Age'].mean()))

0    25.0
1    30.0
2    35.0
3    29.0
4    29.0
5    25.0
Name: Age, dtype: float64

In [80]:
# Replace a value: every 'Charlie' in the Name column becomes 'Rose'.
# Series.replace() returns a new Series, so it is assigned back to the column.
# The replacement string is written without the stray leading space the cell
# previously had (' Rose'), which would have made `Name == 'Rose'` lookups fail.
df['Name'] = df['Name'].replace('Charlie', 'Rose')
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [83]:
# Check for Duplicates
# duplicated(keep='last') flags every fully identical row except its last
# occurrence, so the boolean mask selects the earlier copies (index 0 here).

df[df.duplicated(keep='last')]

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0


In [87]:
# Remove fully duplicated rows. drop_duplicates() defaults to keep='first',
# so index 0 stays and its later copy (index 5) is removed; the remaining
# rows keep their original index labels.
df = df.drop_duplicates()
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,David,,IT,67000.0,670000.0
4,Eve,29.0,HR,,


In [88]:
# Invalid values:
# Lambda -> Python
# apply() calls the anonymous function once per element: values above 650000
# are divided by 10, everything else is returned unchanged. NaN is also
# returned unchanged because `NaN > 650000` evaluates to False.

df['Promoted Salary'] = df['Promoted Salary'].apply(lambda x: x/10 if x > 650000 else x)


In [89]:
# Display the frame after the 'Promoted Salary' adjustment.
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,75000.0
3,David,,IT,67000.0,67000.0
4,Eve,29.0,HR,,


In [102]:
# Joins and Merges
# Lookup table with one row per department; 'Dept' is the key column it
# shares with df.

department_info = {
    "Dept": ["HR", "IT", "Finance"],
    # 'San Francisco' previously carried a stray leading space, which would
    # break exact-match filters and comparisons on Location.
    "Location": ["New York", "San Francisco", "Chicago"],
    # NOTE(review): 'Luara' may be a typo for 'Laura' -- confirm before changing.
    "Manager": ["Luara", "Steve", "Nina"],
}


df2 = pd.DataFrame(department_info)
df2


Unnamed: 0,Dept,Location,Manager
0,HR,New York,Luara
1,IT,San Francisco,Steve
2,Finance,Chicago,Nina


In [99]:
# Stack the two frames vertically (default axis=0): df2's rows are placed
# below df's, columns are aligned by name, and cells with no source value
# become NaN. Original index labels are kept, so labels 0-2 appear twice.
# NOTE(review): the recorded output shows a separate lowercase 'dept' column
# and this cell's execution count (99) precedes the cell that defines df2
# (102), so the output was presumably produced with an earlier df2 -- re-run
# to refresh it.
pd.concat([df, df2])

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,dept,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,,,
1,Bob,30.0,IT,65000.0,650000.0,,,
2,Rose,35.0,Finance,75000.0,75000.0,,,
3,David,,IT,67000.0,67000.0,,,
4,Eve,29.0,HR,,,,,
0,,,,,,HR,New York,Luara
1,,,,,,IT,San Francisco,Steve
2,,,,,,Finance,Chicago,Nina


In [100]:
# axis=1 places the frames side by side, aligning rows on their index labels;
# df rows whose label has no counterpart in df2 (3 and 4) get NaN in df2's
# columns. Rows are paired by position in the index, not by department.
# NOTE(review): the recorded output shows a lowercase 'dept' column and this
# cell's execution count (100) precedes the cell that defines df2 (102), so
# the output presumably reflects an earlier df2 -- re-run to refresh it.
pd.concat([df, df2], axis=1)

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,dept,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,HR,New York,Luara
1,Bob,30.0,IT,65000.0,650000.0,IT,San Francisco,Steve
2,Rose,35.0,Finance,75000.0,75000.0,Finance,Chicago,Nina
3,David,,IT,67000.0,67000.0,,,
4,Eve,29.0,HR,,,,,


In [103]:
# merge
# SQL-style inner join (the default `how`): every df row is paired with the
# df2 row that has the same 'Dept' value, adding Location and Manager.

df.merge(df2, on="Dept", how="inner")

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Location,Manager
0,Alice,25.0,HR,55000.0,550000.0,New York,Luara
1,Bob,30.0,IT,65000.0,650000.0,San Francisco,Steve
2,Rose,35.0,Finance,75000.0,75000.0,Chicago,Nina
3,David,,IT,67000.0,67000.0,San Francisco,Steve
4,Eve,29.0,HR,,,New York,Luara


Testing Code

In [None]:
# Placeholder test cell: prints a fixed greeting.
print("hello world")