# Introduction To Pandas

In [None]:
!pip install numpy 

In [None]:
!pip install pandas 

In [2]:
import pandas as pd   # pandas: data manipulation (loading, slicing, and dicing tabular data)
import numpy as np    # NumPy: efficient operations on arrays and matrices, plus linear algebra

In [3]:
df = pd.read_csv("Salaries.csv")   # pandas can import data from various file formats; here a CSV file is read into a DataFrame
df  # display the DataFrame

Unnamed: 0,rank,discipline,phd,service,sex,salary
0,Prof,B,56,49,Male,186960
1,Prof,A,12,6,Male,93000
2,Prof,A,23,20,Male,110515
3,Prof,A,40,31,Male,131205
4,Prof,B,20,18,Male,104800
...,...,...,...,...,...,...
73,Prof,B,18,10,Female,105450
74,AssocProf,B,19,6,Female,104542
75,Prof,B,17,17,Female,124312
76,Prof,A,28,14,Female,109954


# Methods are customizable: they are called with parentheses, in which we can pass arguments to modify the output.

In [None]:
df.head()  # shows the first 5 rows by default

In [None]:
df.head(10)   # shows the first n rows, where n is the number passed to head()

In [None]:
df.tail()   # shows the last 5 rows by default

In [None]:
df.tail(10)  # shows the last n rows, where n is the number passed to tail()

# Attributes are not customizable: they are accessed without parentheses, so no arguments can be passed.

In [None]:
df.dtypes # data type of every column in df; the object dtype here holds strings

In [None]:
df["salary"].dtype  # data type of a single column; [ ] selects a column from df

In [None]:
df.shape   # shape of df as a tuple: (number of rows, number of columns)

In [None]:
df.size    # total number of elements in df (rows x columns)

In [None]:
df.ndim   # number of dimensions (2 for a DataFrame)

In [None]:
# A DataFrame is made up of a row index, a column index, and values

df.index  # the row index

In [None]:
df.columns # the column index (column labels)

In [None]:
df.axes  # list holding the row index followed by the column index

In [None]:
df.values  # the values of the DataFrame as a NumPy array (without the index or column labels)

In [None]:
df.describe()  # summary statistics (count, mean, std, min, quartiles, max) of the numeric columns

In [4]:
df.info()     # prints column names, number of non-null entries, data type of each column, and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   rank        78 non-null     object
 1   discipline  78 non-null     object
 2   phd         78 non-null     int64 
 3   service     78 non-null     int64 
 4   sex         78 non-null     object
 5   salary      78 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 3.8+ KB


In [None]:
# If df.info or df.describe is typed without (), the method is not called: the cell shows the bound method's repr (which includes a printout of df) instead of the summary.

In [None]:
df.min()  # column-wise minimum; for a categorical (string) column this is the alphabetically first value

In [None]:
df.max()   # column-wise maximum; for a categorical (string) column this is the alphabetically last value

In [None]:
df["phd"].count()  # count() gives the number of non-null entries in the selected column

In [None]:
df["sex"].value_counts()  # value_counts() counts how often each distinct value occurs; mainly used for categorical variables

In [None]:
df[["rank","sex"]].value_counts()  # counts of each distinct (rank, sex) combination

In [None]:
# Mean of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.mean(numeric_only=True)

In [None]:
df["phd"].mean()  # mean of the single column "phd"

In [None]:
# Median of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.median(numeric_only=True)

In [5]:
# Sample variance of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.var(numeric_only=True)

  df.var()


phd        1.562106e+02
service    1.473740e+02
salary     8.005313e+08
dtype: float64

In [6]:
# Sample standard deviation of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.std(numeric_only=True)

  df.std()    # gives standard deviation


phd           12.498425
service       12.139768
salary     28293.661022
dtype: float64

In [7]:
# Skewness of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.skew(numeric_only=True)
# skewness > 0 -> right-skewed data -> typically mean > median
# skewness < 0 -> left-skewed data -> typically mean < median
# skewness ~ 0 -> roughly symmetrical data -> mean ~ median

  df.skew()


phd        0.634366
service    0.913750
salary     0.452103
dtype: float64

In [8]:
# Kurtosis of each numeric column.
# numeric_only=True skips the string columns (rank, discipline, sex);
# without it, pandas >= 2.0 raises a TypeError on them.
df.kurt(numeric_only=True)
# pandas reports Fisher (excess) kurtosis, so a normal distribution scores 0, not 3
# kurtosis > 0 -> sharper peak, heavier (fat) tails -> more extreme values than a normal distribution
# kurtosis < 0 -> flatter peak, lighter (thin) tails -> fewer extreme values than a normal distribution
# kurtosis = 0 -> tails comparable to a normal distribution

  df.kurt()


phd        0.042504
service    0.608981
salary    -0.401713
dtype: float64

In [None]:
df.sample(10)  # 10 randomly chosen rows from df (pass random_state=<int> for a reproducible draw)

In [None]:
df.sample(frac=0.1)  # random sample whose size is 10% of the number of rows in df

In [None]:
df.dropna()  # returns a new DataFrame without the rows that contain missing values; df itself is not modified

In [None]:
df.salary.head().mean()  # attribute-style column access: mean of the first 5 salary entries

In [None]:
df["salary"].head().mean()  # preferred over attribute-style access (df.salary)
# gives the mean of the first 5 salary entries.

In [None]:
df[["salary","service"]].head().mean()  # same for 2 columns
# the outer [] is the column selection; the inner [] builds a list of column names
# the list ["salary","service"] is passed because [] takes only one input

In [None]:
# groupby is used with categorical variables.
# Splits the rows by sex (Male / Female) and gives the mean of every numeric column for each group.
# numeric_only=True leaves out the string columns (rank, discipline);
# without it, pandas >= 2.0 raises a TypeError on them.
df.groupby("sex").mean(numeric_only=True)

In [None]:
df.groupby("sex")[["salary"]].mean()
# splits the rows by sex (Male / Female) and then gives the mean salary of each group

# Slicing by value -> Conditions -> Slices just rows, not columns

In [None]:
# for a numerical variable there are 6 kinds of conditions, i.e. ==, !=, >=, <=, >, <
# for a categorical variable there are 2 kinds of conditions, i.e. ==, !=

In [None]:
df["salary"]>=150000  # boolean Series: True for every row whose salary is >= 150000, False otherwise

In [None]:
df[df["salary"]>=150000]   # rows whose salary is >= 150000

In [None]:
df[df["discipline"]=="B"]  # rows whose discipline is "B"

In [None]:
# conditions stored in named variables first (preferred way)
con1=df["sex"]=="Male"
con2=df["salary"]>=160000

# rows that satisfy both conditions
df[con1 & con2]

In [None]:
# conditions written inline; each one needs its own parentheses because & binds tighter than the comparisons
df[(df["salary"]>150000) & (df["rank"]=="Prof")]

# Slicing by index nos. -> loc, iloc -> Slices both rows and columns

In [None]:
# iloc -> select by integer position (the end of a slice is excluded)
# loc -> select by label, e.g. column name (the end of a slice is included)

In [None]:
df.iloc[:,0:3]   # [rows, columns] -> all rows and the columns at positions 0 to 2

In [None]:
df.loc[0:10, "rank":"phd"]   # rows labelled 0 to 10 (loc slices include the end label) for columns rank through phd

In [None]:
# for columns that are not adjacent, pass a list of column names
df.loc[0:10, ["rank", "phd","salary"]]

# Slicing by both value(condition) & index

In [None]:
# Rows whose salary is above 150000, restricted to the rank and discipline columns,
# selected with one .loc call: [row condition, list of columns].
df.loc[df["salary"] > 150000, ["rank", "discipline"]]