# Pandas missing values 

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample data: columns 'A' and 'B' contain missing values (NaN); 'C' is complete.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}
d

{'A': [1, 2, nan], 'B': [5, nan, nan], 'C': [1, 2, 3]}

In [3]:
# Build a DataFrame from the dict: each key becomes a column.
df =pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


In [4]:
# Drop every ROW that contains at least one NaN value (dropna works on rows by default).
df.dropna()

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [5]:
# Drop every COLUMN that contains at least one NaN value (axis=1 selects columns).
df.dropna(axis=1)

Unnamed: 0,C
0,1
1,2
2,3


In [6]:
# Set a threshold for dropna: `thresh` is the minimum number of NON-NaN values
# a row must have in order to be kept.
df.dropna(thresh=2) # drops rows that have fewer than 2 non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


### Fill NaN values

In [8]:
df.fillna(value='Dummy')
# Replaces every NaN in the frame with the placeholder string 'Dummy'.

Unnamed: 0,A,B,C
0,1,5,1
1,2,Dummy,2
2,Dummy,Dummy,3


In [9]:
# Fill the NaN values in column 'A' with the mean of that column
# (the mean is computed over the non-NaN entries only).
df['A'].fillna(value=df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# GroupBy 

In [14]:
# Sales records: one entry per person, tagged with the company they belong to.
d = {
    'company': 'google google FB FB Linux Linux Linux'.split(),
    'Person': 'sudhanshu ashutosh akash sonali roshani shaan vikash'.split(),
    'sales': [200, 457, 557, 235, 658, 42, 547],
}
d

{'company': ['google', 'google', 'FB', 'FB', 'Linux', 'Linux', 'Linux'],
 'Person': ['sudhanshu',
  'ashutosh',
  'akash',
  'sonali',
  'roshani',
  'shaan',
  'vikash'],
 'sales': [200, 457, 557, 235, 658, 42, 547]}

In [15]:
# Build the sales DataFrame. Note: this rebinds `df`, replacing the frame used
# in the missing-values section above.
df = pd.DataFrame(d)
df

Unnamed: 0,company,Person,sales
0,google,sudhanshu,200
1,google,ashutosh,457
2,FB,akash,557
3,FB,sonali,235
4,Linux,roshani,658
5,Linux,shaan,42
6,Linux,vikash,547


In [17]:
df.groupby('company') # returns a DataFrameGroupBy object; apply an aggregation to get results

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7fc2d6fe7048>

In [18]:
gropbyCompany = df.groupby('company') # group the rows by company
# Now apply an aggregation function to the groups.
# 'Person' holds strings, so restrict the mean to numeric columns explicitly:
# pandas >= 2.0 no longer drops non-numeric columns silently and raises a
# TypeError instead.
gropbyCompany.mean(numeric_only=True) # mean of the 'sales' column per company

Unnamed: 0_level_0,sales
company,Unnamed: 1_level_1
FB,396.0
Linux,415.666667
google,328.5


In [19]:
# Total sales per company. numeric_only=True keeps the string 'Person' column
# out of the result; on pandas >= 2.0 it would otherwise be string-concatenated
# within each group.
gropbyCompany.sum(numeric_only=True)

Unnamed: 0_level_0,sales
company,Unnamed: 1_level_1
FB,792
Linux,1247
google,657


In [20]:
# Standard deviation of sales per company. numeric_only=True is required on
# pandas >= 2.0, where the string 'Person' column would otherwise raise an error.
gropbyCompany.std(numeric_only=True)

Unnamed: 0_level_0,sales
company,Unnamed: 1_level_1
FB,227.688384
Linux,328.329611
google,181.726443


In [22]:
# Total sales for FB only: sum the numeric columns per company, then select the
# 'FB' row. numeric_only=True keeps the string 'Person' column out of the sum
# (pandas >= 2.0 would otherwise concatenate the names).
gropbyCompany.sum(numeric_only=True).loc['FB']

sales    792
Name: FB, dtype: int64

In [23]:
# Mean sales for FB only, written as a single chained expression.
# numeric_only=True is needed because of the string 'Person' column:
# pandas >= 2.0 raises a TypeError instead of silently skipping it.
df.groupby('company').mean(numeric_only=True).loc['FB']

sales    396.0
Name: FB, dtype: float64

In [24]:
df.groupby('company').count() # count the number of non-NaN values in each column, per group

Unnamed: 0_level_0,Person,sales
company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,2,2
Linux,3,3
google,2,2


In [27]:
# Caution: min/max is computed for each column independently. For 'Person' it is
# the alphabetically first name, which need not belong to the row holding the
# minimum 'sales' value, so the two values in a result row may come from
# different people.
df.groupby('company').min()  # avoid reading min/max of the name column as "who sold least/most"

Unnamed: 0_level_0,Person,sales
company,Unnamed: 1_level_1,Unnamed: 2_level_1
FB,akash,235
Linux,roshani,42
google,ashutosh,200


In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of 'sales' for each company.
df.groupby('company').describe()

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FB,2.0,396.0,227.688384,235.0,315.5,396.0,476.5,557.0
Linux,3.0,415.666667,328.329611,42.0,294.5,547.0,602.5,658.0
google,2.0,328.5,181.726443,200.0,264.25,328.5,392.75,457.0


In [29]:
# Transpose the summary so that companies become columns and statistics become rows.
df.groupby('company').describe().transpose()

Unnamed: 0,company,FB,Linux,google
sales,count,2.0,3.0,2.0
sales,mean,396.0,415.666667,328.5
sales,std,227.688384,328.329611,181.726443
sales,min,235.0,42.0,200.0
sales,25%,315.5,294.5,264.25
sales,50%,396.0,547.0,328.5
sales,75%,476.5,602.5,392.75
sales,max,557.0,658.0,457.0


In [30]:
# After transposing, each company is a column, so [] indexing selects the
# statistics of a single company.
df.groupby('company').describe().transpose()['google']

sales  count      2.000000
       mean     328.500000
       std      181.726443
       min      200.000000
       25%      264.250000
       50%      328.500000
       75%      392.750000
       max      457.000000
Name: google, dtype: float64