# Pandas Basics

In [1]:
import pandas as pd

<h2>1. Series

In [2]:
# series is an array underhood
# syntax :: pd.Series(data=None,index=None,dtype: 'Dtype | None' = None,name=None,copy: 'bool' = False, fastpath: 'bool' = False,)
series = pd.Series([1,2,12,14,25],index = ['one','two','three', 'four','five'],name='simple')
print(type(series),'\n')

# list of list series
ls_series = pd.Series([[1,2],[2,3],[1,3,4,5]],index = ['one','two','three'],name='ls simple')
print(ls_series.values[0])
print(ls_series.values[2][1:]) #cool

<class 'pandas.core.series.Series'> 

[1, 2]
[3, 4, 5]


In [3]:
print(series,"\n")
# series values
print(type(series.values),series.values,"\n") # prints the ndarray of this series, type is numpy.ndarray
# Series indices
print(series.index,'\n')
#min 
print('min = ',series.min())
#max
print('max = ',series.max())

one       1
two       2
three    12
four     14
five     25
Name: simple, dtype: int64 

<class 'numpy.ndarray'> [ 1  2 12 14 25] 

Index(['one', 'two', 'three', 'four', 'five'], dtype='object') 

min =  1
max =  25


<h5> Conditional retrieval

In [4]:
print(series>2.6,'\n') # return bool series indicating conditional matches
print(series[(series>2) & (series<22)])

one      False
two      False
three     True
four      True
five      True
Name: simple, dtype: bool 

three    12
four     14
Name: simple, dtype: int64


In [5]:
# using data as list throws error if length mismatch occurs between values and indices
data = [1,2,3,4,5] # Data must be 1-dimensional
indices = ['one','two','three', 'four','five']
list_series = pd.Series(data,index = indices,name='list series')
print(list_series,"\n")

# using data as dict wont throw error if length mismatch occurs between values and indices
# rather as new type is introduced , NaN
#data = pd.Series({'temp1':1,'temp2':2,'temp3':[1,2,3]}) ERROR as Series is an array and "int + sequence" not allowed
data = {'india':1,'usa':2,'temp3':3}
indices = ['india','usa','france','china']
dict_series = pd.Series(data, index=indices, name = 'dict series')
print(dict_series)

one      1
two      2
three    3
four     4
five     5
Name: list series, dtype: int64 

india     1.0
usa       2.0
france    NaN
china     NaN
Name: dict series, dtype: float64


In [6]:
# when a dictionary is passed as data, dict_keys are used as indices
id = {'india':1,'usa':2,'temp3':3}
countries = ['india','usa','france','china']
country_series = pd.Series(id, index=countries, name = 'country series')
display(country_series)
print(country_series.isna(),"\n") # produces boolean series
print(country_series.notna()) # produces boolean series

india     1.0
usa       2.0
france    NaN
china     NaN
Name: country series, dtype: float64

india     False
usa       False
france     True
china      True
Name: country series, dtype: bool 

india      True
usa        True
france    False
china     False
Name: country series, dtype: bool


In [7]:
print(country_series[country_series.isna()],"\n")
print(country_series[country_series.notna()]) # fetching series by provideing a series
# NaN denotes missing values

france   NaN
china    NaN
Name: country series, dtype: float64 

india    1.0
usa      2.0
Name: country series, dtype: float64


<h1> DataFrames

In [8]:
dic_data = {'Year':(1997,1998,1999,2000,2001),
            'Population':(2000,4500,4600,3900,1000),
           'Income':(450, 300,400,500,390),
           'States':('Kerala', 'Tamil Nadu', 'Andhra Pradesh', 'Karnataka', 'Goa')}
display(dic_data)

{'Year': (1997, 1998, 1999, 2000, 2001),
 'Population': (2000, 4500, 4600, 3900, 1000),
 'Income': (450, 300, 400, 500, 390),
 'States': ('Kerala', 'Tamil Nadu', 'Andhra Pradesh', 'Karnataka', 'Goa')}

In [9]:
pop_data_frame = pd.DataFrame(dic_data)
display(pop_data_frame)

Unnamed: 0,Year,Population,Income,States
0,1997,2000,450,Kerala
1,1998,4500,300,Tamil Nadu
2,1999,4600,400,Andhra Pradesh
3,2000,3900,500,Karnataka
4,2001,1000,390,Goa


In [10]:
# to fetch index
pop_data_frame.set_index('States',inplace = True) # Set index function
display(pop_data_frame)

Unnamed: 0_level_0,Year,Population,Income
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kerala,1997,2000,450
Tamil Nadu,1998,4500,300
Andhra Pradesh,1999,4600,400
Karnataka,2000,3900,500
Goa,2001,1000,390


In [11]:
pop_data_frame['Debt'] = (100,80,110,126,287.7)
display(pop_data_frame)
display(pop_data_frame.iloc[2]) #printing indices

Unnamed: 0_level_0,Year,Population,Income,Debt
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Kerala,1997,2000,450,100.0
Tamil Nadu,1998,4500,300,80.0
Andhra Pradesh,1999,4600,400,110.0
Karnataka,2000,3900,500,126.0
Goa,2001,1000,390,287.7


Year          1999.0
Population    4600.0
Income         400.0
Debt           110.0
Name: Andhra Pradesh, dtype: float64

In [12]:
pop_data_frame['Savings'] = pop_data_frame['Income'] - pop_data_frame['Debt']
display(pop_data_frame)

Unnamed: 0_level_0,Year,Population,Income,Debt,Savings
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kerala,1997,2000,450,100.0,350.0
Tamil Nadu,1998,4500,300,80.0,220.0
Andhra Pradesh,1999,4600,400,110.0,290.0
Karnataka,2000,3900,500,126.0,374.0
Goa,2001,1000,390,287.7,102.3


In [13]:
print(type(pop_data_frame)) #Class
print(pop_data_frame.size) #Size
print(pop_data_frame.ndim) #dimension
display(pop_data_frame.values) #values of a DF prints underhood 2D array

<class 'pandas.core.frame.DataFrame'>
25
2


array([[1997. , 2000. ,  450. ,  100. ,  350. ],
       [1998. , 4500. ,  300. ,   80. ,  220. ],
       [1999. , 4600. ,  400. ,  110. ,  290. ],
       [2000. , 3900. ,  500. ,  126. ,  374. ],
       [2001. , 1000. ,  390. ,  287.7,  102.3]])

In [14]:
# New vehicle data frame
veh_df = pd.DataFrame(pop_data_frame, columns = ('Year','Population','Income','Savings','Vehicles')) 
display(veh_df)

Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kerala,1997,2000,450,350.0,
Tamil Nadu,1998,4500,300,220.0,
Andhra Pradesh,1999,4600,400,290.0,
Karnataka,2000,3900,500,374.0,
Goa,2001,1000,390,102.3,


In [15]:
veh_df['Vehicles'] = ('3000','5000','5000','4000','6000')
display(veh_df)

Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kerala,1997,2000,450,350.0,3000
Tamil Nadu,1998,4500,300,220.0,5000
Andhra Pradesh,1999,4600,400,290.0,5000
Karnataka,2000,3900,500,374.0,4000
Goa,2001,1000,390,102.3,6000


In [16]:
veh_df['test_del']=(1,2,3,4,5) # to be deleted
veh_df['test_drop']=(1,2,3,4,5) # to be droped
display(veh_df)
# Deleting
del veh_df['test_del']
display(veh_df)
# Droping
veh_df.drop('test_drop',axis = 1, inplace = True) #by default axis = 0 or indices or row
display(veh_df)
#removing row or index GOA
veh_df.drop('Goa', axis = 0, inplace = True)
display(veh_df)

Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles,test_del,test_drop
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Kerala,1997,2000,450,350.0,3000,1,1
Tamil Nadu,1998,4500,300,220.0,5000,2,2
Andhra Pradesh,1999,4600,400,290.0,5000,3,3
Karnataka,2000,3900,500,374.0,4000,4,4
Goa,2001,1000,390,102.3,6000,5,5


Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles,test_drop
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Kerala,1997,2000,450,350.0,3000,1
Tamil Nadu,1998,4500,300,220.0,5000,2
Andhra Pradesh,1999,4600,400,290.0,5000,3
Karnataka,2000,3900,500,374.0,4000,4
Goa,2001,1000,390,102.3,6000,5


Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kerala,1997,2000,450,350.0,3000
Tamil Nadu,1998,4500,300,220.0,5000
Andhra Pradesh,1999,4600,400,290.0,5000
Karnataka,2000,3900,500,374.0,4000
Goa,2001,1000,390,102.3,6000


Unnamed: 0_level_0,Year,Population,Income,Savings,Vehicles
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kerala,1997,2000,450,350.0,3000
Tamil Nadu,1998,4500,300,220.0,5000
Andhra Pradesh,1999,4600,400,290.0,5000
Karnataka,2000,3900,500,374.0,4000


<h1> Importing datasets

<h3>1. CSV

In [17]:
vh_csv_df = pd.read_csv('./datasets/auto-mpg.csv')
display(vh_csv_df) #not full rows are displayed
#Display full data
#display(csv_df.to_markdown())
#display(csv_df.to_markdown()) # using 'tabulate' library

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
395,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
396,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [18]:
vh_csv_df.set_index('car name',inplace=True)
display(vh_csv_df)

Unnamed: 0_level_0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
car name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
chevrolet chevelle malibu,18.0,8,307.0,130,3504,12.0,70,1
buick skylark 320,15.0,8,350.0,165,3693,11.5,70,1
plymouth satellite,18.0,8,318.0,150,3436,11.0,70,1
amc rebel sst,16.0,8,304.0,150,3433,12.0,70,1
ford torino,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
ford mustang gl,27.0,4,140.0,86,2790,15.6,82,1
vw pickup,44.0,4,97.0,52,2130,24.6,82,2
dodge rampage,32.0,4,135.0,84,2295,11.6,82,1
ford ranger,28.0,4,120.0,79,2625,18.6,82,1


In [19]:
display(vh_csv_df[(vh_csv_df['mpg']>20)].iloc[5]) # display 5th index series

mpg              25.0
cylinders           4
displacement    110.0
horsepower         87
weight           2672
acceleration     17.5
model year         70
origin              2
Name: peugeot 504, dtype: object