# Introduction to Pandas (Python Libarabry for Data Analysis)

In [1]:
import pandas as pd

# Pandas Data Structures 
     - Pandas Series Object
     - Pandas Data Frame

### Pandas Series Object
 - Pandas series is a collection of values just like python list but in vertical .
 - There will be index available in series. 

In [2]:
series1 = pd.Series([6,12,24,36,6,12,12])

In [3]:
series1

0     6
1    12
2    24
3    36
4     6
5    12
6    12
dtype: int64

In [4]:
# series assigns default indexing.
series1.index

RangeIndex(start=0, stop=7, step=1)

In [5]:
series1.values

array([ 6, 12, 24, 36,  6, 12, 12], dtype=int64)

In [6]:
# we can create customise indexing. 
series2 = pd.Series([6,12,24,36,6,12,12],index=['apples','oranges','peeches','mangoes','grapes','olives','bananas'])
series2

apples      6
oranges    12
peeches    24
mangoes    36
grapes      6
olives     12
bananas    12
dtype: int64

In [7]:
series2['mangoes']

36

In [8]:
series2['olives']

12

In [9]:
# even we have provided our customise indices, defaults indices are also active and available.
# accessing a value using default index
series2[4]

6

In [10]:
grades = pd.Series(['A', 'B','A','A*','B','C','D','A','B','A*'],index=[1,2,3,4,5,6,7,8,9,10])
grades

1      A
2      B
3      A
4     A*
5      B
6      C
7      D
8      A
9      B
10    A*
dtype: object

In [11]:
grades[10]

'A*'

In [12]:
grades[3]

'A'

In [32]:
#length of series 
len(grades)

10

In [33]:
grades.max()  # ASCII codes

'D'

In [34]:
grades.min()

'A'

In [35]:
grades.unique()

array(['A', 'B', 'A*', 'C', 'D'], dtype=object)

In [36]:
grades.value_counts(ascending=True)

D     1
C     1
A*    2
B     3
A     3
dtype: int64

In [37]:
grades.describe()

count     10
unique     5
top        A
freq       3
dtype: object

In [38]:
grades.all() <= 'D'

True

In [39]:
grades.any() <'D'

True

In [40]:
newdata = pd.Series(['A'],index=[22])
newdata

22    A
dtype: object

In [41]:
grades.append(newdata)

1      A
2      B
3      A
4     A*
5      B
6      C
7      D
8      A
9      B
10    A*
22     A
dtype: object

In [42]:
new_grades = pd.Series(['A', 'B','A','A*','B','C','D','A','B','A*'], index=['Ali', 'Hassan', 'Nasir', 'Shoaib', 'Kashif',"Ahmed", 'Sohail', 'Shakeel', 'Asad','Hamza'])
new_grades

Ali         A
Hassan      B
Nasir       A
Shoaib     A*
Kashif      B
Ahmed       C
Sohail      D
Shakeel     A
Asad        B
Hamza      A*
dtype: object

In [43]:
# We slice a series based on integer indices
# and in string indices extreme index is not also included.
new_grades[1:6]

Hassan     B
Nasir      A
Shoaib    A*
Kashif     B
Ahmed      C
dtype: object

In [44]:
# We slice a series using string indices.
# and in string indices extreme index is included.
new_grades['Nasir':'Shakeel']

# better use customize index by loc 
new_grades.loc['Nasir':'Shakeel']

Nasir       A
Shoaib     A*
Kashif      B
Ahmed       C
Sohail      D
Shakeel     A
dtype: object

In [45]:
grades

1      A
2      B
3      A
4     A*
5      B
6      C
7      D
8      A
9      B
10    A*
dtype: object

In [46]:
grades.loc[4:7]

4    A*
5     B
6     C
7     D
dtype: object

In [47]:
grades.iloc[4:7]

5    B
6    C
7    D
dtype: object

In [48]:
# fancy Indexing 
# we provide a list of indices.
grades[   [6,4,1,2]  ]  

6     C
4    A*
1     A
2     B
dtype: object

In [49]:
salaries = pd.Series([60000,80000,90000,450000,45000,50000,40000,100000,56000,78000],index=new_grades.index)

In [50]:
salaries

Ali         60000
Hassan      80000
Nasir       90000
Shoaib     450000
Kashif      45000
Ahmed       50000
Sohail      40000
Shakeel    100000
Asad        56000
Hamza       78000
dtype: int64

In [51]:
salaries + salaries*.10

Ali         66000.0
Hassan      88000.0
Nasir       99000.0
Shoaib     495000.0
Kashif      49500.0
Ahmed       55000.0
Sohail      44000.0
Shakeel    110000.0
Asad        61600.0
Hamza       85800.0
dtype: float64

In [52]:
# apply <------>lambda
# lambda is function anonymous
salaries.apply(lambda s: s+s*.10   )

Ali         66000.0
Hassan      88000.0
Nasir       99000.0
Shoaib     495000.0
Kashif      49500.0
Ahmed       55000.0
Sohail      44000.0
Shakeel    110000.0
Asad        61600.0
Hamza       85800.0
dtype: float64

In [53]:
# increase 5% of salaries if salary is more than or equal to 100k,
# increase 10% of salaries if salary is more than or equal to 50k,
# increase 20 of salaries if salary is less 50k,

salaries.apply(lambda s: s+ s*.05 if s>=100000 else  s+ s*.10 if s >=50000 else s+s*.20 )

Ali         66000.0
Hassan      88000.0
Nasir       99000.0
Shoaib     472500.0
Kashif      54000.0
Ahmed       55000.0
Sohail      48000.0
Shakeel    105000.0
Asad        61600.0
Hamza       85800.0
dtype: float64

In [54]:
def apply_increment(salary):
    if salary >=100000:
        return salary+salary*.05
    elif salary >=50000:
        return salary+salary*.10
    else:
        return salary+salary*.20

In [55]:
salaries.apply(apply_increment)

Ali         66000.0
Hassan      88000.0
Nasir       99000.0
Shoaib     472500.0
Kashif      54000.0
Ahmed       55000.0
Sohail      48000.0
Shakeel    105000.0
Asad        61600.0
Hamza       85800.0
dtype: float64

In [56]:
import numpy as np

s = pd.Series([2, 0, 4, 8, np.nan])
s

0    2.0
1    0.0
2    4.0
3    8.0
4    NaN
dtype: float64

In [57]:
s[s.between(1,4)]

0    2.0
2    4.0
dtype: float64

In [58]:
s[(s>=1) & (s<=4) ]

0    2.0
2    4.0
dtype: float64

In [59]:
salaries[salaries >=100000]
# boolean indexing

Shoaib     450000
Shakeel    100000
dtype: int64

In [60]:
salaries[[True,False,True,False,True,False,False,False,True, True]]

Ali       60000
Nasir     90000
Kashif    45000
Asad      56000
Hamza     78000
dtype: int64

In [61]:
salaries >=100000

Ali        False
Hassan     False
Nasir      False
Shoaib      True
Kashif     False
Ahmed      False
Sohail     False
Shakeel     True
Asad       False
Hamza      False
dtype: bool

In [62]:
salaries.max()

450000

In [63]:
salaries.argmax()

The current behaviour of 'Series.argmax' is deprecated, use 'idxmax'
instead.
The behavior of 'argmax' will be corrected to return the positional
maximum in the future. For now, use 'series.values.argmax' or
'np.argmax(np.array(values))' to get the position of the maximum
row.
  """Entry point for launching an IPython kernel.


'Shoaib'

In [64]:
salaries

Ali         60000
Hassan      80000
Nasir       90000
Shoaib     450000
Kashif      45000
Ahmed       50000
Sohail      40000
Shakeel    100000
Asad        56000
Hamza       78000
dtype: int64

# Pandas DataFrame
 - A data frame has 1 or more than omne columns
 

In [65]:
result = {'name':          ['Nasir','Asad', 'Fahad','Zeeshan', 'Shan', 'Shoaib', 'Ali', 'Hassan', 'Yasir', 'Junaid'],
          'roll':          [11,22,33,44,55,66,77,88,99,111],
          'python':        [78,67,89,90,91,72,76,89,67,np.nan],
          'excel':         [np.nan,87,67,np.nan,78,90,76,78,90,54],
           'power_bi':     [45,67,89,87,67,90,65,67,np.nan,90],
          'pandas':        [81,85,89,87,41,45,96,56,93,78],
         'machine_learning':[96,95,84,85,82,81,74,75,96,90],
         'statistics':     [67,np.nan,45,98,75,71,70,60,90,93]}
result

{'name': ['Nasir',
  'Asad',
  'Fahad',
  'Zeeshan',
  'Shan',
  'Shoaib',
  'Ali',
  'Hassan',
  'Yasir',
  'Junaid'],
 'roll': [11, 22, 33, 44, 55, 66, 77, 88, 99, 111],
 'python': [78, 67, 89, 90, 91, 72, 76, 89, 67, nan],
 'excel': [nan, 87, 67, nan, 78, 90, 76, 78, 90, 54],
 'power_bi': [45, 67, 89, 87, 67, 90, 65, 67, nan, 90],
 'pandas': [81, 85, 89, 87, 41, 45, 96, 56, 93, 78],
 'machine_learning': [96, 95, 84, 85, 82, 81, 74, 75, 96, 90],
 'statistics': [67, nan, 45, 98, 75, 71, 70, 60, 90, 93]}

In [66]:
df  = pd.DataFrame(result)

In [67]:
df

Unnamed: 0,name,roll,python,excel,power_bi,pandas,machine_learning,statistics
0,Nasir,11,78.0,,45.0,81,96,67.0
1,Asad,22,67.0,87.0,67.0,85,95,
2,Fahad,33,89.0,67.0,89.0,89,84,45.0
3,Zeeshan,44,90.0,,87.0,87,85,98.0
4,Shan,55,91.0,78.0,67.0,41,82,75.0
5,Shoaib,66,72.0,90.0,90.0,45,81,71.0
6,Ali,77,76.0,76.0,65.0,96,74,70.0
7,Hassan,88,89.0,78.0,67.0,56,75,60.0
8,Yasir,99,67.0,90.0,,93,96,90.0
9,Junaid,111,,54.0,90.0,78,90,93.0


In [156]:
# sales = pd.read_excel('SaleData.xlsx')
# sales = pd.read_excel(r'C:\Users\Nasir Hussain\Desktop\SaleData.xlsx')
# sales = pd.read_excel('C:/Users/Nasir Hussain/Desktop/SaleData.xlsx')

In [157]:
type(df)

pandas.core.frame.DataFrame

In [158]:
df.shape

(10, 8)

In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
name                10 non-null object
roll                10 non-null int64
python              9 non-null float64
excel               8 non-null float64
power_bi            9 non-null float64
pandas              10 non-null int64
machine_learning    10 non-null int64
statistics          9 non-null float64
dtypes: float64(4), int64(3), object(1)
memory usage: 768.0+ bytes


In [69]:
df.isnull().sum()

name                0
roll                0
python              1
excel               2
power_bi            1
pandas              0
machine_learning    0
statistics          1
dtype: int64

In [161]:
df.notnull().sum()

name                10
roll                10
python               9
excel                8
power_bi             9
pandas              10
machine_learning    10
statistics           9
dtype: int64

# Acessing column or columns

In [162]:
df['machine_learning']

0    96
1    95
2    84
3    85
4    82
5    81
6    74
7    75
8    96
9    90
Name: machine_learning, dtype: int64

In [163]:
df['name']

0      Nasir
1       Asad
2      Fahad
3    Zeeshan
4       Shan
5     Shoaib
6        Ali
7     Hassan
8      Yasir
9     Junaid
Name: name, dtype: object

In [164]:
df['python']

0    78.0
1    67.0
2    89.0
3    90.0
4    91.0
5    72.0
6    76.0
7    89.0
8    67.0
9     NaN
Name: python, dtype: float64

In [165]:
df['pandas']

0    81
1    85
2    89
3    87
4    41
5    45
6    96
7    56
8    93
9    78
Name: pandas, dtype: int64

In [166]:
df[['pandas', 'machine_learning']]

Unnamed: 0,pandas,machine_learning
0,81,96
1,85,95
2,89,84
3,87,85
4,41,82
5,45,81
6,96,74
7,56,75
8,93,96
9,78,90


In [70]:
df.columns

Index(['name', 'roll', 'python', 'excel', 'power_bi', 'pandas',
       'machine_learning', 'statistics'],
      dtype='object')

In [171]:
df.loc[:,'roll':'pandas'] # [row,col]>> [strow:endrow, stcol:endcol]

Unnamed: 0,roll,python,excel,power_bi,pandas
0,11,78.0,,45.0,81
1,22,67.0,87.0,67.0,85
2,33,89.0,67.0,89.0,89
3,44,90.0,,87.0,87
4,55,91.0,78.0,67.0,41
5,66,72.0,90.0,90.0,45
6,77,76.0,76.0,65.0,96
7,88,89.0,78.0,67.0,56
8,99,67.0,90.0,,93
9,111,,54.0,90.0,78


In [172]:
df.loc[2:6,'roll':'pandas']

Unnamed: 0,roll,python,excel,power_bi,pandas
2,33,89.0,67.0,89.0,89
3,44,90.0,,87.0,87
4,55,91.0,78.0,67.0,41
5,66,72.0,90.0,90.0,45
6,77,76.0,76.0,65.0,96


In [174]:
df.iloc[2:6,2:6]

Unnamed: 0,python,excel,power_bi,pandas
2,89.0,67.0,89.0,89
3,90.0,,87.0,87
4,91.0,78.0,67.0,41
5,72.0,90.0,90.0,45
