## Use cases of pandas
- Data Cleaning
- Data Analysis
- Data Transformation
- Data Visualization (Basic Level)
- Data Aggregation
- Data Handling
- Data Filtering & Selection
- Time Series analysis

In [65]:
# Series
# A Series is a one-dimensional labelled array capable of holding any data type. The axis labels are collectively called the index.

In [66]:
import numpy as np
import pandas as pd

In [67]:
# Sample inputs reused by the Series examples below: index labels, a plain list,
# a NumPy array holding the same numbers, and a dict mapping 1..3 to those numbers.
labels = list('abc')
my_lst = [10, 20, 30]
arr = np.array(my_lst)
dic = dict(zip((1, 2, 3), my_lst))

In [68]:
# No index is supplied, so pandas labels the values 0, 1, 2.
pd.Series(data=arr)

0    10
1    20
2    30
dtype: int64

In [69]:
# Built from a dict: the keys (1, 2, 3) become the index labels.
pd.Series(data=dic)

1    10
2    20
3    30
dtype: int64

In [70]:
# Explicit index: each value in my_lst is paired with the label at the same position.
pd.Series(data=my_lst, index=labels)

a    10
b    20
c    30
dtype: int64

# DataFrames
- Creating dataframe
- Selection and indexing of cols 
- Creating new col 
- Removing cols
- selecting rows
- selecting subsets of rows and cols
- conditional selection

In [71]:
# Column-oriented sample records: each key is a column name and each list holds
# that column's values, one per row. The last Salary entry is missing (np.nan).
data = dict(
    Name=['Shivam', 'Shahi', 'Rohan', 'Raghav', 'Ram'],
    Age=[20, 23, 34, 56, 78],
    City=["New York", "Paris", "Berlin", "London", "India"],
    Salary=[65000, 70000, 62000, 85000, np.nan],
)

df = pd.DataFrame(data)

In [72]:
# Display the frame that was just built from the dict.
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [73]:
# astype returns a converted copy of the column; the result is only displayed,
# not assigned back, so df['Age'] itself keeps its integer values.
df['Age'].astype('float64')

0    20.0
1    23.0
2    34.0
3    56.0
4    78.0
Name: Age, dtype: float64

In [74]:
# Selecting several columns: pass a list of names. The result is a DataFrame
# whose columns follow the order of the list (Age first, then Name).

df.loc[:, ['Age', 'Name']]

Unnamed: 0,Age,Name
0,20,Shivam
1,23,Shahi
2,34,Rohan
3,56,Raghav
4,78,Ram


In [75]:
# A single column label (not wrapped in a list) returns a Series instead of a DataFrame.
df['Name']

0    Shivam
1     Shahi
2     Rohan
3    Raghav
4       Ram
Name: Name, dtype: object

In [76]:
# Creating a new column: assigning a list to a label that does not exist yet
# adds that column to df, one value per row.

df['Designation'] = [
    'Data Scientist',
    "Devops",
    "AI Engineer",
    "Data Analyst",
    "Doctor",
]

In [77]:
# Display df again to confirm the Designation column was added.
df

Unnamed: 0,Name,Age,City,Salary,Designation
0,Shivam,20,New York,65000.0,Data Scientist
1,Shahi,23,Paris,70000.0,Devops
2,Rohan,34,Berlin,62000.0,AI Engineer
3,Raghav,56,London,85000.0,Data Analyst
4,Ram,78,India,,Doctor


In [78]:
# Removing a column: drop() returns a new frame, so the result is bound back to df.
# columns= names the axis explicitly (instead of axis=1), and errors='ignore' keeps
# this cell safe to re-run: once 'Designation' is gone, a plain drop would raise KeyError.

df = df.drop(columns=['Designation'], errors='ignore')

In [79]:
# Display df again to confirm the Designation column is gone.
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [80]:
# Selecting rows: re-display df so the row labels used below are visible.
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [81]:
# .loc selects by index label: 4 is the label of the last row here.
df.loc[4]

Name        Ram
Age          78
City      India
Salary      NaN
Name: 4, dtype: object

In [82]:
# .iloc selects by integer position. df uses the default 0..4 index, so
# position 4 is the same row as label 4.
df.iloc[4]

Name        Ram
Age          78
City      India
Salary      NaN
Name: 4, dtype: object

In [83]:
# Selecting subsets of rows and cols: pass the row label and the column labels to a
# single .loc call instead of chaining two lookups (df.loc[0][[...]]), which builds
# an intermediate object first.

df.loc[0, ['City', 'Name']]

City    New York
Name      Shivam
Name: 0, dtype: object

In [84]:
# Display df for reference before selecting several rows and columns at once.
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [85]:
# Rows labelled 2 and 3, columns Name and Age, selected in one .loc call rather than
# slicing the rows first and then indexing the intermediate frame.
df.loc[[2, 3], ['Name', 'Age']]

Unnamed: 0,Name,Age
2,Rohan,34
3,Raghav,56


In [86]:
# Conditional selection.
# Work on an independent (deep) copy so that changes to df2 do not affect df.

df2 = df.copy(deep=True)

In [87]:
# Replace every missing value in the copy with 50000 (in this data the only NaN is
# Ram's Salary). Rebinding the result avoids inplace=True and leaves df untouched.
df2 = df2.fillna(50000)

In [88]:
# People whose age is above 30: a boolean mask keeps only the rows where it is True.
# Note this filters the original df, not the filled copy df2, so Ram's Salary is
# still NaN in the result.

df.loc[df['Age'].gt(30)]

Unnamed: 0,Name,Age,City,Salary
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


#### MISSING DATA

In [89]:
# Numeric sample with exactly one missing value (np.nan) in each column,
# used by the missing-data examples. Note: this rebinds the name `data`.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[10, np.nan, 30, 40, 50],
    C=[100, 200, 300, np.nan, 500],
)

df3 = pd.DataFrame(data)

In [90]:
# Display the frame with its missing entries.
df3

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,,200.0
2,,30.0,300.0
3,4.0,40.0,
4,5.0,50.0,500.0


In [91]:
# Number of missing values in each column (True counts as 1 when summed down the rows).
df3.isna().sum(axis=0)

A    1
B    1
C    1
dtype: int64

In [92]:
# Per column: does it contain at least one missing value?
df3.isna().any(axis=0)

A    True
B    True
C    True
dtype: bool

In [93]:
# Removing missing values: drop every row that has at least one NaN.
# Returns a new frame; df3 itself is not modified.

df3.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
4,5.0,50.0,500.0


In [94]:
# thresh=3 keeps only rows with at least 3 non-missing values. df3 has exactly three
# columns at this point, so this gives the same rows as a plain dropna().
df3.dropna(axis=0, thresh=3)

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
4,5.0,50.0,500.0


In [95]:
# Filling the missing data

In [96]:

# df3 still contains its NaNs: the dropna calls above returned new frames.
df3

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,,200.0
2,,30.0,300.0
3,4.0,40.0,
4,5.0,50.0,500.0


In [97]:
# Filling all the missing values with 0. The filled frame is only displayed;
# df3 is left unchanged.

df3.fillna(value=0)

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,0.0,200.0
2,0.0,30.0,300.0
3,4.0,40.0,0.0
4,5.0,50.0,500.0


In [98]:
# Add a fourth column D whose last two entries are missing.
df3 = df3.assign(D=[5, 4, 3, np.nan, np.nan])

In [99]:
# Display df3 with the new column D.
df3

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,,200.0,4.0
2,,30.0,300.0,3.0
3,4.0,40.0,,
4,5.0,50.0,500.0,


In [100]:
# Filling a different value for each column: the dict maps a column name to the
# replacement used for that column's missing entries.

values = dict(A=0, B=100, C=300, D=400)
df3.fillna(values)

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,100.0,200.0,4.0
2,0.0,30.0,300.0,3.0
3,4.0,40.0,300.0,400.0
4,5.0,50.0,500.0,400.0


In [103]:
# Fill each column's missing entries with that column's mean (mean() skips NaN).
df3.fillna(value=df3.mean(axis=0))

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,32.5,200.0,4.0
2,3.0,30.0,300.0,3.0
3,4.0,40.0,275.0,4.0
4,5.0,50.0,500.0,4.0


In [104]:
# Merging, joining, concatenating

In [105]:
# Two small frames that share the key column emp_id; only ids 103 and 104 appear in both.
# pandas is already imported as pd in the imports cell at the top of the notebook,
# so it is not imported a second time here.
# Note: this rebinds df2, which earlier held the filled copy of df.
df1 = pd.DataFrame({
    'emp_id': [101, 102, 103, 104],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'dept': ['HR', 'IT', 'Finance', 'Marketing']
})


df2 = pd.DataFrame({
    'emp_id': [103, 104, 105, 106],
    'salary': [70000, 80000, 60000, 50000],
    'location': ['Delhi', 'Mumbai', 'Bangalore', 'Chennai']
})



In [106]:
# Left-hand frame for the merge: employee names and departments.
df1

Unnamed: 0,emp_id,name,dept
0,101,Alice,HR
1,102,Bob,IT
2,103,Charlie,Finance
3,104,David,Marketing


In [107]:
# Right-hand frame for the merge: salaries and locations.
df2

Unnamed: 0,emp_id,salary,location
0,103,70000,Delhi
1,104,80000,Mumbai
2,105,60000,Bangalore
3,106,50000,Chennai


In [112]:
# MERGING
# Outer join on emp_id: keeps every id found in either frame and leaves NaN where
# one side has no matching row (salary is shown as float because of those NaNs).
df1.merge(df2, on='emp_id', how="outer")

Unnamed: 0,emp_id,name,dept,salary,location
0,101,Alice,HR,,
1,102,Bob,IT,,
2,103,Charlie,Finance,70000.0,Delhi
3,104,David,Marketing,80000.0,Mumbai
4,105,,,60000.0,Bangalore
5,106,,,50000.0,Chennai
