## Usecase of pandas
- Data Cleaning
- Data Analysis
- Data Transformation
- Data Visualization (Basic Level)
- Data Aggregation
- Data Handling
- Data Filtering & Selection
- Time Series analysis

In [53]:
# Series
# A series is a one dimentional labelled array capable of holding any data type. the axis are collectively called the index

In [54]:
import numpy as np
import pandas as pd

In [55]:
labels = ['a','b','c']
my_lst = [10,20,30]
arr = np.array([10,20,30])
dic = {1:10,2:20,3:30}

In [56]:
pd.Series(arr)

0    10
1    20
2    30
dtype: int64

In [57]:
pd.Series(dic)

1    10
2    20
3    30
dtype: int64

In [58]:
pd.Series(my_lst,index=labels)

a    10
b    20
c    30
dtype: int64

# DataFrames
- Creating dataframe
- Selection and indexing of cols 
- Creating new col 
- Removing cols
- selecting rows
- selecting subsets of rows and cols
- conditional selection

In [59]:
data = {
    "Name":['Shivam','Shahi','Rohan','Raghav','Ram'],
    "Age": [20,23,34,56,78],
    "City":["New York","Paris","Berlin","London","India"],
    "Salary":[65000,70000,62000,85000,np.nan]
}

df = pd.DataFrame(data)

In [60]:
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [61]:
df['Age'].astype(float)

0    20.0
1    23.0
2    34.0
3    56.0
4    78.0
Name: Age, dtype: float64

In [62]:
# selection of cols

df[['Age','Name']]

Unnamed: 0,Age,Name
0,20,Shivam
1,23,Shahi
2,34,Rohan
3,56,Raghav
4,78,Ram


In [63]:
df['Name']

0    Shivam
1     Shahi
2     Rohan
3    Raghav
4       Ram
Name: Name, dtype: object

In [64]:
# creating new cols

df['Designation'] = ['Data Scientist', "Devops","AI Engineer","Data Analyst","Doctor"]

In [65]:
df

Unnamed: 0,Name,Age,City,Salary,Designation
0,Shivam,20,New York,65000.0,Data Scientist
1,Shahi,23,Paris,70000.0,Devops
2,Rohan,34,Berlin,62000.0,AI Engineer
3,Raghav,56,London,85000.0,Data Analyst
4,Ram,78,India,,Doctor


In [66]:
# removing col

df = df.drop('Designation',axis=1)

In [67]:
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [68]:
# selection row
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [69]:
df.loc[4]

Name        Ram
Age          78
City      India
Salary      NaN
Name: 4, dtype: object

In [70]:
df.iloc[4]

Name        Ram
Age          78
City      India
Salary      NaN
Name: 4, dtype: object

In [71]:
# selecting subjects of rows and cols

df.loc[0][['City','Name']]

City    New York
Name      Shivam
Name: 0, dtype: object

In [72]:
df

Unnamed: 0,Name,Age,City,Salary
0,Shivam,20,New York,65000.0
1,Shahi,23,Paris,70000.0
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


In [73]:
df.loc[[2,3]][["Name",'Age']]

Unnamed: 0,Name,Age
2,Rohan,34
3,Raghav,56


In [74]:
# conditional selection

df2 = df.copy()

In [75]:
df2.fillna(50000,inplace=True)

In [76]:
# people whose age is above 30

df[df['Age']>30]

Unnamed: 0,Name,Age,City,Salary
2,Rohan,34,Berlin,62000.0
3,Raghav,56,London,85000.0
4,Ram,78,India,


#### MISSING DATA

In [77]:
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [10, np.nan, 30, 40, 50],
    'C': [100, 200, 300, np.nan, 500]
}

df3 = pd.DataFrame(data)

In [78]:
df3

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,,200.0
2,,30.0,300.0
3,4.0,40.0,
4,5.0,50.0,500.0


In [79]:
df3.isna().sum()

A    1
B    1
C    1
dtype: int64

In [80]:
df3.isna().any()

A    True
B    True
C    True
dtype: bool

In [81]:
# removing missing value

df3.dropna()

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
4,5.0,50.0,500.0


In [82]:
df3.dropna(thresh=3)

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
4,5.0,50.0,500.0


In [83]:
# Filling the missing data

In [84]:

df3

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,,200.0
2,,30.0,300.0
3,4.0,40.0,
4,5.0,50.0,500.0


In [85]:
# filling all the missing value with 0 

df3.fillna(0)

Unnamed: 0,A,B,C
0,1.0,10.0,100.0
1,2.0,0.0,200.0
2,0.0,30.0,300.0
3,4.0,40.0,0.0
4,5.0,50.0,500.0


In [86]:
df3["D"] = [5,4,3,np.nan,np.nan]

In [87]:
df3

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,,200.0,4.0
2,,30.0,300.0,3.0
3,4.0,40.0,,
4,5.0,50.0,500.0,


In [88]:
# filling different values for each cols

values ={"A":0,"B":100,"C":300,"D":400}
df3.fillna(value=values)

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,100.0,200.0,4.0
2,0.0,30.0,300.0,3.0
3,4.0,40.0,300.0,400.0
4,5.0,50.0,500.0,400.0


In [89]:
df3.fillna(df3.mean())

Unnamed: 0,A,B,C,D
0,1.0,10.0,100.0,5.0
1,2.0,32.5,200.0,4.0
2,3.0,30.0,300.0,3.0
3,4.0,40.0,275.0,4.0
4,5.0,50.0,500.0,4.0


In [90]:
# merging , joining, concatinatin

In [91]:
import pandas as pd

df1 = pd.DataFrame({
    'emp_id': [101, 102, 103, 104],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'dept': ['HR', 'IT', 'Finance', 'Marketing']
})


df2 = pd.DataFrame({
    'emp_id': [103, 104, 105, 106],
    'salary': [70000, 80000, 60000, 50000],
    'location': ['Delhi', 'Mumbai', 'Bangalore', 'Chennai']
})



In [92]:
df1

Unnamed: 0,emp_id,name,dept
0,101,Alice,HR
1,102,Bob,IT
2,103,Charlie,Finance
3,104,David,Marketing


In [93]:
df2

Unnamed: 0,emp_id,salary,location
0,103,70000,Delhi
1,104,80000,Mumbai
2,105,60000,Bangalore
3,106,50000,Chennai


In [94]:
# MERGING 
pd.merge(df1,df2, on='emp_id',how= "outer")

Unnamed: 0,emp_id,name,dept,salary,location
0,101,Alice,HR,,
1,102,Bob,IT,,
2,103,Charlie,Finance,70000.0,Delhi
3,104,David,Marketing,80000.0,Mumbai
4,105,,,60000.0,Bangalore
5,106,,,50000.0,Chennai


Capstone Project

In [95]:
df = pd.read_csv('Countries.csv')

In [96]:
df.shape

(194, 64)

In [97]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194 entries, 0 to 193
Data columns (total 64 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   country                                  194 non-null    object 
 1   country_long                             194 non-null    object 
 2   currency                                 194 non-null    object 
 3   capital_city                             194 non-null    object 
 4   region                                   194 non-null    object 
 5   continent                                194 non-null    object 
 6   demonym                                  194 non-null    object 
 7   latitude                                 194 non-null    float64
 8   longitude                                194 non-null    float64
 9   agricultural_land                        193 non-null    float64
 10  forest_area                              194 non-n

In [98]:
df.isnull().sum()

country             0
country_long        0
currency            0
capital_city        0
region              0
                   ..
democracy_score     0
democracy_type      0
median_age          0
political_leader    7
title               7
Length: 64, dtype: int64

In [99]:
df.describe()

Unnamed: 0,latitude,longitude,agricultural_land,forest_area,land_area,rural_land,urban_land,central_government_debt_pct_gdp,expense_pct_gdp,gdp,...,net_migration,population_female,population_male,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,median_age
count,194.0,194.0,193.0,194.0,194.0,194.0,194.0,120.0,156.0,193.0,...,194.0,194.0,194.0,194.0,193.0,194.0,194.0,194.0,194.0,194.0
mean,18.975601,22.027491,245455.1,208678.4,667508.7,656371.1,9777.116531,66.759366,30.051403,514485100000.0,...,-51.407216,20283160.0,20505720.0,40788880.0,25.022994,17633220.0,23155660.0,2.53933,4.644536,25.661856
std,23.876225,66.396389,635626.8,782492.6,1837107.0,1811169.0,42301.458421,71.806247,26.74088,2307148000000.0,...,94525.968598,72589410.0,76071580.0,148647000.0,12.671044,76641870.0,79403930.0,1.800128,2.818297,9.415569
min,-41.0,-175.0,4.0,0.0,2.027,0.0349545,0.0,0.0,0.000267,60349400.0,...,-525116.0,5513.0,5799.0,11312.0,0.0,0.0,5717.0,0.0,0.0,10.5
25%,4.0,-5.0,6464.0,3331.775,23552.5,21865.62,359.61825,31.9513,18.371875,11813900000.0,...,-12242.25,1036218.0,1044902.0,2106358.0,15.3846,589664.0,1222244.0,1.525,2.7225,16.95
50%,16.583333,21.5,38727.8,25289.25,120375.0,115994.5,1645.17,55.42685,27.33775,41153900000.0,...,-970.0,4502713.0,4450049.0,9125614.0,25.2525,2512382.0,4508837.0,2.4,5.05,24.95
75%,40.0,50.1625,215000.0,123673.5,523700.0,491150.5,4054.0225,79.5393,35.0835,251945000000.0,...,2904.25,15266060.0,14788720.0,30313610.0,33.6364,11333540.0,16213550.0,2.925,6.9675,34.05
max,65.0,178.0,5285080.0,8153120.0,16376900.0,16224200.0,522345.0,687.994,310.443,25462700000000.0,...,561580.0,691528500.0,731180500.0,1417173000.0,61.25,908804800.0,897578400.0,10.0,9.87,50.5


In [104]:
df.head()

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
0,Afghanistan,Islamic State of Afghanistan,Afghan afghani,Kabul,Southern Asia,Asia,Afghan,33.0,65.0,383560.0,...,41128771,27.0161,30181937,10946834,2.14,2.97,Authoritarian,12.9,Ashraf Ghani,President
1,Albania,Republic of Albania,Albanian lek,Tirana,Southern Europe,Europe,Albanian,41.0,20.0,11655.5,...,2775634,35.7143,1004807,1770827,2.62,5.98,Hybrid regime,33.7,Edi Rama,Prime Minister
2,Algeria,People's Democratic Republic of Algeria,Algerian dinar,Algiers,Northern Africa,Africa,Algerian,28.0,3.0,413588.0,...,44903225,8.10811,11328186,33575039,1.71,3.5,Authoritarian,24.0,Abdelmadjid Tebboune,President
3,Andorra,Principality of Andorra,Euro,Andorra la Vella,Southern Europe,Europe,Andorran,42.5,1.5,187.2,...,79824,46.4286,9730,70094,3.17,0.0,Unknown,38.9,Xavier Espot Zamora,Head of Government
4,Angola,People's Republic of Angola,Angolan kwanza,Luanda,Middle Africa,Africa,Angolan,-12.5,18.5,569525.0,...,35588987,33.6364,11359649,24229338,2.24,3.62,Authoritarian,12.4,João Lourenço,President


In [102]:
df[df['population'] == df['population'].max()]['country']

75    India
Name: country, dtype: object

In [106]:
# what is the capital of the country with the hightest population
df[df['population'] == df['population'].max()]['capital_city']

75    New Delhi
Name: capital_city, dtype: object

In [107]:
# which country has the least population
df[df['population'] == df['population'].min()]['country']

179    Tuvalu
Name: country, dtype: object

In [108]:
# what is the capital of the country with least population

df[df['population'] == df['population'].min()]['capital_city']

179    Funafuti
Name: capital_city, dtype: object

In [109]:
df.head()

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
0,Afghanistan,Islamic State of Afghanistan,Afghan afghani,Kabul,Southern Asia,Asia,Afghan,33.0,65.0,383560.0,...,41128771,27.0161,30181937,10946834,2.14,2.97,Authoritarian,12.9,Ashraf Ghani,President
1,Albania,Republic of Albania,Albanian lek,Tirana,Southern Europe,Europe,Albanian,41.0,20.0,11655.5,...,2775634,35.7143,1004807,1770827,2.62,5.98,Hybrid regime,33.7,Edi Rama,Prime Minister
2,Algeria,People's Democratic Republic of Algeria,Algerian dinar,Algiers,Northern Africa,Africa,Algerian,28.0,3.0,413588.0,...,44903225,8.10811,11328186,33575039,1.71,3.5,Authoritarian,24.0,Abdelmadjid Tebboune,President
3,Andorra,Principality of Andorra,Euro,Andorra la Vella,Southern Europe,Europe,Andorran,42.5,1.5,187.2,...,79824,46.4286,9730,70094,3.17,0.0,Unknown,38.9,Xavier Espot Zamora,Head of Government
4,Angola,People's Republic of Angola,Angolan kwanza,Luanda,Middle Africa,Africa,Angolan,-12.5,18.5,569525.0,...,35588987,33.6364,11359649,24229338,2.24,3.62,Authoritarian,12.4,João Lourenço,President


In [115]:
# Find the top 5 countries with highest democratic score
top_5 = df.sort_values(by= 'democracy_score', ascending= False, ignore_index= True)


In [116]:
top_5.loc[0:5]

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
0,Norway,Kingdom of Norway,Norwegian krone,Oslo,Northern Europe,Europe,Norwegian,62.0,10.0,9859.62,...,5457127,44.9704,891476,4565651,10.0,9.87,Full democracy,35.6,Erna Solberg,Prime Minister
1,Iceland,Republic of Iceland,Iceland krona,Reykjavík,Northern Europe,Europe,Icelander,65.0,-18.0,18720.0,...,381900,47.619,22945,358955,5.32,9.58,Full democracy,32.1,Katrín Jakobsdóttir,Prime Minister
2,Sweden,Kingdom of Sweden,Swedish krona,Stockholm,Northern Europe,Europe,Swedish,62.0,15.0,30055.4,...,10486941,46.4183,1206837,9280104,9.41,9.39,Full democracy,35.6,Stefan Löfven,Prime Minister
3,New Zealand,New Zealand,New Zealand dollar,Wellington,Australia and New Zealand,Oceania,New Zealander,-41.0,174.0,101540.0,...,5124100,50.4202,672077,4452023,7.27,9.26,Full democracy,32.8,Jacinda Ardern,Prime Minister
4,Denmark,Kingdom of Denmark,Danish krone,Copenhagen,Northern Europe,Europe,Danish,56.0,10.0,26199.9,...,5903037,43.5754,686700,5216337,7.92,9.22,Full democracy,37.2,Mette Frederiksen,Prime Minister
5,Ireland,Ireland,Euro,Dublin,Northern Europe,Europe,Irish,53.0,-8.0,45120.0,...,5086988,23.125,1822006,3264982,5.21,9.15,Full democracy,33.9,Leo Varadkar,Taoiseach


In [121]:
# How many total regions are there
total_region = len(df['region'].value_counts())

total_region

22

In [125]:
# how many countries llie in eastern europe region
df['region'].value_counts()['Eastern Europe']


np.int64(10)

In [128]:
df[df['region']== 'Eastern Europe']['country']

14             Belarus
24            Bulgaria
43      Czech Republic
73             Hungary
111            Moldova
136             Poland
139            Romania
140             Russia
151    Slovak Republic
181            Ukraine
Name: country, dtype: object

In [137]:
# who is the political leader of the 2nd highest populated country

df[df['population'] == df['population'].nlargest(2).iloc[1]]

Unnamed: 0,country,country_long,currency,capital_city,region,continent,demonym,latitude,longitude,agricultural_land,...,population,women_parliament_seats_pct,rural_population,urban_population,press,democracy_score,democracy_type,median_age,political_leader,title
34,China,People's Republic of China,Chinese yuan,Beijing,Eastern Asia,Asia,Chinese,35.0,105.0,5285080.0,...,1412175000,24.9412,514596570,897578430,0.99,3.32,Authoritarian,34.5,Xi Jinping,General Secretary of the Communist Party
