In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df = pd.DataFrame({'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]})
df['States']="CA NV AZ".split()
df.set_index('States',inplace=True)
df

Unnamed: 0_level_0,A,B,C
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,1.0,5.0,1
NV,2.0,,2
AZ,,,3


In [3]:
df

Unnamed: 0_level_0,A,B,C
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CA,1.0,5.0,1
NV,2.0,,2
AZ,,,3


In [4]:
#Require that many non-NA values.
df.dropna(thresh = 3,axis=1,inplace=True)

In [5]:
print("\nDropping any rows with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=0))



Dropping any rows with a NaN value
-----------------------------------
        C
States   
CA      1
NV      2
AZ      3


In [6]:
print("\nDropping any column with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=1))



Dropping any column with a NaN value
-----------------------------------
        C
States   
CA      1
NV      2
AZ      3


In [7]:
print("\nDropping a row with a minimum 2 NaN value using 'thresh' parameter\n",'-'*68, sep='')
print(df.dropna(axis=0, thresh=4))


Dropping a row with a minimum 2 NaN value using 'thresh' parameter
--------------------------------------------------------------------
Empty DataFrame
Columns: [C]
Index: []


In [8]:
df

Unnamed: 0_level_0,C
States,Unnamed: 1_level_1
CA,1
NV,2
AZ,3


In [9]:
df.fillna(2)

Unnamed: 0_level_0,C
States,Unnamed: 1_level_1
CA,1
NV,2
AZ,3


In [10]:
print("\nFilling values with a default value\n",'-'*35, sep='')
print(df.fillna(value='FILL VALUE'))



Filling values with a default value
-----------------------------------
        C
States   
CA      1
NV      2
AZ      3


In [11]:
df

Unnamed: 0_level_0,C
States,Unnamed: 1_level_1
CA,1
NV,2
AZ,3


In [12]:
df.fillna(value=df['A'].mean())

KeyError: 'A'

In [None]:
print("\nFilling values with a computed value (mean of column A here)\n",'-'*60, sep='')
print(df.fillna(value=df['A'].mean()))

In [None]:
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df

In [None]:
g = df.groupby('Company')
g

In [None]:
byComp = df.groupby('Company')
print("\nGrouping by 'Company' column and listing mean sales\n",'-'*55, sep='')
print(byComp.mean())


In [None]:
print("\nGrouping by 'Company' column and listing sum of sales\n",'-'*55, sep='')
print(byComp.sum())


In [None]:
print("\nAll in one line of command (Stats for 'FB')\n",'-'*65, sep='')
print(pd.DataFrame(df.groupby('Company').describe().loc['FB']).transpose())


In [None]:
df.groupby("Company").describe().iloc[1:3,1:3]

In [None]:
df.groupby('Company').describe().loc['GOOG']['Sales'].to_frame()

In [None]:
df.groupby('Company').describe().loc['GOOG']['Sales']['std']


In [None]:
df.groupby('Company').describe()['Sales'].loc[['GOOG','MSFT'],['mean','std']]

In [None]:
d = (df.groupby('Company').describe())
df1 = d.transpose()
df1

In [None]:
df1.loc['Sales'].loc[['mean']]

In [None]:
(pd.DataFrame(df.groupby('Company').describe().loc['FB'])).transpose()

In [None]:
(pd.DataFrame(df.groupby('Company').describe().loc['FB'])).transpose()

In [None]:
print("\nSame type of extraction with little different command\n",'-'*68, sep='')
print(df.groupby('Company').describe().loc[['GOOG', 'MSFT']])

In [None]:
# Merging two data frames
# Creating data frames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])


In [None]:
df1

In [None]:
df2 = pd.DataFrame({'AD': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[0, 1, 2, 3])


In [None]:
df2

In [None]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[0, 1, 2, 3])


In [None]:
df3

In [None]:
print("\nThe DataFrame number 1\n",'-'*30, sep='')
print(df1)


In [None]:
print("\nThe DataFrame number 2\n",'-'*30, sep='')
print(df2)


In [None]:
print("\nThe DataFrame number 3\n",'-'*30, sep='')
print(df3)

In [None]:
#concatenation
df_cat1 = pd.concat([df1,df2,df3], axis=1)
print("\nAfter concatenation along row\n",'-'*30, sep='')
df_cat1

In [None]:
df_cat1.iloc[3]

In [None]:
df_cat1.iloc[4]

In [None]:
df_cat2 = pd.concat([df1,df2,df3], axis=1)
print("\nAfter concatenation along column\n",'-'*60, sep='')
print(df_cat2)


In [None]:
df_cat1.fillna(value=0, inplace=True)
print("\nAfter filling missing values with zero\n",'-'*60, sep='')
print(df_cat1)

In [None]:
# merging by a common key

In [None]:
df1 = pd.DataFrame({'key1': ['K0', 'K8', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                    'C': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
df2 = pd.DataFrame({'key2': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})


In [None]:
df1

In [None]:
df2

In [None]:
pd.merge(df1,df2,how='left',left_on = 'key1',right_on='key2')

In [None]:
merge1= pd.merge(left,right,how='inner',on='key')
print("\nAfter simple merging with 'inner' method\n",'-'*50, sep='')
print(merge1)

In [None]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})

In [None]:
left

In [None]:
right

In [None]:
pd.merge(left, right, on=['key1', 'key2'])

In [None]:
pd.merge(left, right, how='left',on=['key1', 'key2'])

In [None]:
pd.merge(left, right, how='right',on=['key1', 'key2'])

In [None]:
#join operators
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

In [None]:
left

In [None]:
right

In [None]:
left.join(right)

In [None]:
left.join(right, how='outer')

In [None]:
d1 = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
d2 = pd.DataFrame({'key2': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})
d3 = pd.DataFrame({'key3': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
df_cat1 = pd.concat([d1,d2,d3], axis=1)  
df_cat1 = pd.concat([d1,d2,d3])
merge1= pd.merge(d1,d2,how='outer',left_on='key1',right_on='key2')



In [None]:
merge1

In [None]:
df_cat1 = pd.concat([d1,d2,d3])

In [None]:
df_cat1 = pd.concat([d1,d2,d3], axis=1)  

In [None]:
# use of apply functions

In [None]:
# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)

In [None]:
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
df.columns

In [None]:
df['col20'] = df['col2'].apply(testfunc)
df

In [None]:
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x))
print(df)

In [None]:
df['col3length']= df['col3'].apply(len)
print(df)

In [None]:
df['test']= df['FuncApplied'].apply(lambda x: np.sqrt(x))
df

In [None]:
print("\nSum of the column 'FuncApplied' is: ",df['FuncApplied'].sum())


In [None]:
print("Mean of the column 'FuncApplied' is: ",df['FuncApplied'].mean())


In [None]:
print("Std dev of the column 'FuncApplied' is: ",df['FuncApplied'].max())

In [None]:
print("Min and max of the column 'FuncApplied' are: ",df['FuncApplied'].min(),"and",df['FuncApplied'].max())

In [None]:
### Deletion, sorting, list of column and row names

In [None]:
print("\nName of columns\n",'-'*20, sep='')
print(df.columns)


In [None]:
l = list(df.columns)
print("\nColumn names in a list of strings for later manipulation:",l)

In [None]:
print("\nDeleting last column by 'del' command\n",'-'*50, sep='')
del df['col3length']
print(df)
df['col3length']= df['col3'].apply(len)

In [None]:
df.sort_values(by='col2') #inplace=False by default

In [None]:
df.sort_values(by='FuncApplied',ascending=False) #inplace=False by default

In [13]:
df = pd.DataFrame({'col1':[1,2,3,np.nan],
                   'col2':[None,555,666,444],
                   'col3':['abc','def','ghi','xyz']})
df.head()

Unnamed: 0,col1,col2,col3
0,1.0,,abc
1,2.0,555.0,def
2,3.0,666.0,ghi
3,,444.0,xyz


In [None]:
df.isnull()

In [None]:
df.fillna('FILL')

In [None]:
df1


In [None]:
df2

In [None]:
df3

In [None]:
pd.merge(df1, df2, how='inner')

In [None]:
pd.merge(df1, df2, how='outer')

In [None]:
pd.merge(df1, df2, how='left')

In [None]:
pd.merge(df1, df2, how='right')

In [19]:
df=pd.read_csv('https://raw.githubusercontent.com/PramodShenoy/911-Calls/master/911.csv')
df.head(2)

Unnamed: 0,lat,lng,desc,zip,title,timeStamp,twp,addr,e
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,EMS: BACK PAINS/INJURY,2015-12-10 17:40:00,NEW HANOVER,REINDEER CT & DEAD END,1
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,EMS: DIABETIC EMERGENCY,2015-12-10 17:40:00,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1


In [15]:
1) describe for cat and numerical column

2) lat type cast it to int, perform groupby and apply count on it

3) how many people are from same zip location

4) check time stamp datatype and extract day in new col

5) filter out all the zip codes which address starts with 'A'

6) try to map timestamp with day like monday, tuesday....

7) from title extract service detail


SyntaxError: unmatched ')' (<ipython-input-15-d5cce99ca2d4>, line 1)

In [103]:
[i[4:] for i in df['title']]

[' BACK PAINS/INJURY',
 ' DIABETIC EMERGENCY',
 ': GAS-ODOR/LEAK',
 ' CARDIAC EMERGENCY',
 ' DIZZINESS',
 ' HEAD INJURY',
 ' NAUSEA/VOMITING',
 ' RESPIRATORY EMERGENCY',
 ' SYNCOPAL EPISODE',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 'fic: VEHICLE ACCIDENT -',
 ' RESPIRATORY EMERGENCY',
 ' DIZZINESS',
 ' VEHICLE ACCIDENT',
 'fic: DISABLED VEHICLE -',
 'fic: VEHICLE ACCIDENT -',
 'fic: DISABLED VEHICLE -',
 ': APPLIANCE FIRE',
 'fic: DISABLED VEHICLE -',
 'fic: VEHICLE ACCIDENT -',
 ' GENERAL WEAKNESS',
 ' HEAD INJURY',
 ': CARBON MONOXIDE DETECTOR',
 'fic: VEHICLE ACCIDENT -',
 'fic: DISABLED VEHICLE -',
 'fic: VEHICLE ACCIDENT -',
 ' RESPIRATORY EMERGENCY',
 ' UNKNOWN MEDICAL EMERGENCY',
 'fic: DISABLED VEHICLE -',
 'fic: DISABLED VEHICLE -',
 'fic: VEHICLE ACCIDENT -',
 'fic: DISABLED VEHICLE -',
 'fic: DISABLED VEHICLE -',
 'fic: VEHICLE ACCIDENT -',
 ' VEHI

In [None]:
df['zip'].value_counts().to_frame()

In [None]:
df.dtypes[['timeStamp']]

In [None]:
new=pd.to_datetime(df['timeStamp'])
new.dt.year
new.dt.month
new.dt.hour
df['timeStamp']=new.dt.day
df

In [None]:
df.describe()

In [None]:
df.describe(include='object')

In [None]:
df.head(1)

In [None]:
a=np.where(df.dtypes=='object')
a

In [None]:
df.iloc[a].describe()

In [None]:
b=df.dtypes[df.dtypes=='object'].index
df[b].describe()

In [None]:
df.head(1)

In [None]:
df['lat']=df['lat'].astype(int)

In [None]:
df.groupby('lat').count()

In [None]:
df.groupby('zip').describe()

In [None]:
df.head()

In [None]:
 how many people are from same zip location

In [None]:
a=df[['desc','zip']]
a.groupby(['desc','zip']).count()