# Useful Pandas Snippets

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Importing Data

Read from CSV file

In [4]:
# Load the Titanic passenger table from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [5]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


## Creating Data

Using dataframe

In [8]:
# Build a frame from a dict of equal-length columns, with explicit row labels.
pd.DataFrame(
    data={
        'Name': ['Marie', 'John', 'Max', 'Jane'],
        'Age': [32, 28, 27, 33],
    },
    index=['rank1', 'rank2', 'rank3', 'rank4'],
)

Unnamed: 0,Age,Name
rank1,32,Marie
rank2,28,John
rank3,27,Max
rank4,33,Jane


In [9]:
# Draw from a locally seeded generator so the 5x5 demo frame is the same on
# every Restart & Run All (the unseeded global RNG gave different values each run).
rng = np.random.default_rng(seed=42)
# Integers in [0, 100), one column per letter A-E.
pd.DataFrame(rng.integers(low=0, high=100, size=(5, 5)), columns=['A', 'B', 'C', 'D', 'E'])

Unnamed: 0,A,B,C,D,E
0,57,48,24,95,80
1,0,61,82,97,21
2,93,27,9,73,64
3,20,49,71,26,5
4,11,97,31,1,99


Using list comprehension

In [12]:
# Squares of 0..9. Named `squares` rather than `list` so the builtin `list`
# type is not shadowed for the rest of the kernel session.
squares = [x**2 for x in range(10)]
# Keep only the even squares; as the cell's last expression, this is what gets displayed.
[x for x in squares if x % 2 == 0]

[0, 4, 16, 36, 64]

## Cleaning

Drop NaN in fare

In [14]:
# Drop rows whose Fare is missing. Rebinding instead of inplace=True keeps the
# step chainable and avoids mutating a frame that earlier cells already displayed.
df = df.dropna(subset=["Fare"])

Return null values

In [15]:
# Show any rows where Fare is still missing.
df.loc[df['Fare'].isna()]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare


Upper case all column names

In [16]:
# Upper-case every column label with the vectorised string accessor
# (replaces assigning a one-shot `map` iterator to df.columns).
df.columns = df.columns.str.upper()

Rename columns

In [17]:
# Rename selected columns via an {old: new} mapping.
# The labels were upper-cased in the previous cell, so the keys must be the
# upper-case names; rename() silently ignores keys that match no column.
df = df.rename(columns={
    'PCLASS': 'Class',
    'NAME': 'Full Name',
})

Alternatively

In [18]:
# Positional renaming: assign a full list of labels, one per existing column.
df.columns = list('ABCDEFGH')
# Assign the descriptive names again; later cells refer to columns by these labels.
df.columns = [
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'Siblings/Spouses Aboard',
    'Parents/Children Aboard',
    'Fare',
]

Filter columns containing "Aboard"

In [19]:
# Keep every row, but only the columns whose label contains "Aboard".
df_aboard = df.loc[:, df.columns.str.contains('Aboard')]

In [21]:
# Preview the first five rows of the filtered frame.
df_aboard.head(n=5)

Unnamed: 0,Siblings/Spouses Aboard,Parents/Children Aboard
0,1,0
1,1,0
2,0,0
3,1,0
4,0,0


Replace strings in column

In [23]:
# Replace a substring in every value of a string column.
# regex=False makes 'Mr.' a literal match; under the pandas < 2.0 default
# (regex=True) the '.' would match any character, e.g. 'Mrs' as well.
df['Sex'] = df['Sex'].str.replace('Mr.', 'Mister', regex=False)

Remove if contains character

In [29]:
# files = [file for file in files if "~" not in file]

Remove based on multiple values

In [30]:
# Keep only rows whose Name is NOT one of the listed placeholder values.
df = df.loc[~df['Name'].isin(['Invalid', 'Unknown'])]

Change type

In [31]:
# Cast Fare to float, producing a new frame with the column replaced in position.
df = df.assign(Fare=df['Fare'].astype(float))

Reset index

In [32]:
# Renumber rows 0..n-1 and discard the old index (drop=True).
# Rebinding instead of inplace=True keeps the step chainable and side-effect free.
df = df.reset_index(drop=True)

Convert to lower case

In [33]:
# Lower-case every value in the Sex column.
df = df.assign(Sex=df['Sex'].str.lower())

Converting dates

In [34]:
# pd.to_datetime(d["colA"]).dt.strftime('%b-%y')

Deleting columns

In [35]:
# Remove both "Aboard" columns in a single call.
df = df.drop(columns=['Siblings/Spouses Aboard', 'Parents/Children Aboard'])

## Exploring

Number of rows

In [36]:
# Row count is the first element of the (rows, columns) shape tuple.
df.shape[0]

887

Get info

In [37]:
# Print a structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 6 columns):
Survived    887 non-null int64
Pclass      887 non-null int64
Name        887 non-null object
Sex         887 non-null object
Age         887 non-null float64
Fare        887 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 34.7+ KB


Describe data

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Survived,Pclass,Age,Fare
count,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,32.30542
std,0.487004,0.836662,14.121908,49.78204
min,0.0,1.0,0.42,0.0
25%,0.0,2.0,20.25,7.925
50%,0.0,3.0,28.0,14.4542
75%,1.0,3.0,38.0,31.1375
max,1.0,3.0,80.0,512.3292


Select two columns

In [39]:
# Select a subset of columns by label, then preview the first rows.
df.loc[:, ['Name', 'Fare']].head()

Unnamed: 0,Name,Fare
0,Mr. Owen Harris Braund,7.25
1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,71.2833
2,Miss. Laina Heikkinen,7.925
3,Mrs. Jacques Heath (Lily May Peel) Futrelle,53.1
4,Mr. William Henry Allen,8.05


Get titles

In [40]:
# The title is the first space-separated token of Name (e.g. "Mr.", "Miss.").
df["Title"] = df["Name"].str.split(" ").str.get(0)

Looking only at males

In [41]:
# Boolean-mask filter: rows where Sex equals 'male'.
df.loc[df['Sex'].eq('male')].head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title
0,0,3,Mr. Owen Harris Braund,male,22.0,7.25,Mr.
4,0,3,Mr. William Henry Allen,male,35.0,8.05,Mr.
5,0,3,Mr. James Moran,male,27.0,8.4583,Mr.
6,0,1,Mr. Timothy J McCarthy,male,54.0,51.8625,Mr.
7,0,3,Master. Gosta Leonard Palsson,male,2.0,21.075,Master.


Looking only at males who survived

In [42]:
# Combine two conditions with & (element-wise AND): male and Survived == 1.
df.loc[df['Sex'].eq('male') & df['Survived'].eq(1)].head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title
17,1,2,Mr. Charles Eugene Williams,male,23.0,13.0,Mr.
21,1,2,Mr. Lawrence Beesley,male,34.0,13.0,Mr.
23,1,1,Mr. William Thompson Sloper,male,28.0,35.5,Mr.
36,1,3,Mr. Hanna Mamee,male,18.0,7.2292,Mr.
54,1,1,Mr. Hugh Woolner,male,46.0,35.5,Mr.


Looking only at males who survived above the age of 50

In [43]:
# Three conditions AND-ed together: male, Survived == 1, and Age strictly above 50.
df.loc[
    df['Sex'].eq('male')
    & df['Survived'].eq(1)
    & df['Age'].gt(50)
].head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title
446,1,1,Major. Arthur Godfrey Peuchen,male,52.0,30.5,Major.
567,1,2,Mr. George Harris,male,62.0,10.5,Mr.
584,1,1,Mr. Maxmillian Frolicher-Stehli,male,60.0,79.2,Mr.
627,1,1,Mr. Algernon Henry Wilson Barkworth,male,80.0,30.0,Mr.
644,1,1,Col. Oberst Alfons Simonius-Blumer,male,56.0,35.5,Col.


Set column value based on other columns

In [44]:
# Create an all-missing Note column with object dtype so that text can be
# written into it later. A plain `np.nan` column is float64, and assigning
# strings into a float column is a deprecated silent upcast in pandas >= 2.1.
df['Note'] = pd.Series(np.nan, index=df.index, dtype='object')

In [45]:
# Write the label into Note only for rows matching all three conditions.
df.loc[
    df['Sex'].eq('male') & df['Survived'].eq(1) & df['Age'].gt(50),
    ['Note'],
] = 'Male Above 50 Survived'

In [46]:
# Sort the notes and show the first three entries.
df['Note'].sort_values().head(3)

446    Male Above 50 Survived
567    Male Above 50 Survived
584    Male Above 50 Survived
Name: Note, dtype: object

Number of men who survived

In [47]:
# Counting True values in the mask gives the number of matching rows.
int((df['Sex'].eq('male') & df['Survived'].eq(1)).sum())

109

Average age of men who survived

In [48]:
# Select rows and the Age column in one .loc call, then average.
df.loc[df['Sex'].eq('male') & df['Survived'].eq(1), 'Age'].mean()

27.42816513761468

Filter by multiple values

In [49]:
# Keep rows whose Name matches any of the listed values.
df.loc[df["Name"].isin(["Mr. Charles Eugene Williams", "Mr. Lawrence Beesley"])]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title,Note
17,1,2,Mr. Charles Eugene Williams,male,23.0,13.0,Mr.,
21,1,2,Mr. Lawrence Beesley,male,34.0,13.0,Mr.,


Highest fare paid

In [50]:
# idxmax() returns the index label of the largest Fare; .loc fetches that whole row.
df.loc[df['Fare'].idxmax(), :]

Survived                  1
Pclass                    1
Name        Miss. Anna Ward
Sex                  female
Age                      35
Fare                512.329
Title                 Miss.
Note                    NaN
Name: 257, dtype: object

Sorting

In [51]:
# Order by Fare descending, breaking ties by Age ascending.
df.sort_values(by=['Fare', 'Age'], ascending=[False, True]).head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title,Note
257,1,1,Miss. Anna Ward,female,35.0,512.3292,Miss.,
733,1,1,Mr. Gustave J Lesurer,male,35.0,512.3292,Mr.,
676,1,1,Mr. Thomas Drake Martinez Cardeza,male,36.0,512.3292,Mr.,
27,0,1,Mr. Charles Alexander Fortune,male,19.0,263.0,Mr.,
87,1,1,Miss. Mabel Helen Fortune,female,23.0,263.0,Miss.,


Sort by multiple columns

In [52]:
# One ascending flag per sort key: Fare high-to-low, then Age low-to-high.
df.sort_values(
    by=['Fare', 'Age'],
    ascending=[False, True],
).head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title,Note
257,1,1,Miss. Anna Ward,female,35.0,512.3292,Miss.,
733,1,1,Mr. Gustave J Lesurer,male,35.0,512.3292,Mr.,
676,1,1,Mr. Thomas Drake Martinez Cardeza,male,36.0,512.3292,Mr.,
27,0,1,Mr. Charles Alexander Fortune,male,19.0,263.0,Mr.,
87,1,1,Miss. Mabel Helen Fortune,female,23.0,263.0,Miss.,


Unique classes

In [53]:
# Distinct Pclass values, in order of first appearance.
pd.unique(df['Pclass'])

array([3, 1, 2], dtype=int64)

Count of each class

In [54]:
# Count the rows for each distinct Pclass value, most frequent first.
df['Pclass'].value_counts()

3    487
1    216
2    184
Name: Pclass, dtype: int64

Find duplicates

In [55]:
# keep=False flags every member of a duplicate group, not just the repeats.
df.loc[df.duplicated(subset=['Name'], keep=False)]

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Fare,Title,Note


## Looping

In [56]:
# Iterate as (index label, row Series) pairs; the body is a deliberate placeholder.
for index, row in df.iterrows():
    pass
    # print(index)
    # print(row)

Returning tuples

In [57]:
# Iterate with one tuple per row; the body is a deliberate placeholder.
for row in df.itertuples():
    pass
    # print(row)

## Grouping

Group by class and aggregate fare by mean

In [58]:
# Average Fare within each Pclass group.
df.groupby('Pclass')['Fare'].mean()

Pclass
1    84.154687
2    20.662183
3    13.707707
Name: Fare, dtype: float64

Pivot table

In [59]:
# Mean Fare for every (Pclass, Sex) combination: classes as rows, sexes as columns.
# The aggregation is named by string; passing the callable np.mean is
# deprecated in pandas >= 2.1 and emits a FutureWarning.
pd.pivot_table(df, values='Fare', index='Pclass', columns='Sex', aggfunc='mean')

Sex,female,male
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1,106.125798,67.226127
2,21.970121,19.741782
3,16.11881,12.695466


Sample weighted average aggregation function

In [60]:
# Template aggregation spec (presumably meant to be passed to `.agg(...)` — confirm):
# 'colA' is summed; 'colB' is averaged with np.average, weighted by the 'colC'
# values found at the same index labels as the values being aggregated (x.index).
# NOTE(review): `d` is not assigned in the visible cells. It is only looked up
# when the lambda is called, so this cell runs, but using the spec needs `d`
# to be the frame being aggregated — verify before use.
agg_func = {'colA': ['sum'], 'colB': lambda x: np.average(x, weights=d.loc[x.index, 'colC'])}

## Miscellaneous

Functions in dictionary

In [61]:
def _power_printer(exponent):
    """Return a one-argument callable that prints its argument raised to `exponent`."""
    def _show(x):
        print("The solution is: {}".format(x**exponent))
    return _show


# Dispatch table: look up a callable by name and call it.
# Note: despite the 'timesN' keys, each entry raises x to the N-th power (x**N).
func = {
    'times2': _power_printer(2),
    'times3': _power_printer(3),
    'times4': _power_printer(4),
}

In [62]:
# Look up the callable stored under 'times2' and call it with 3; it prints its result.
func['times2'](3)

The solution is: 9


## Recommended Cheat Sheets

* [Pandas DataFrame Object](http://www.webpages.uidaho.edu/~stevel/504/Pandas%20DataFrame%20Notes.pdf) from University of Idaho
* [Data Wrangling with Pandas](http://cs.umw.edu/~stephen/cpsc219/Pandas_Cheat_Sheet.pdf) from University of Mary Washington
* [Python for Data Science Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952) from DataCamp
* [Data Science Python Intermediate](https://www.dataquest.io/blog/large_files/python-cheat-sheet-intermediate.pdf) from Dataquest
* [Data Science Numpy](https://www.dataquest.io/blog/large_files/numpy-cheat-sheet.pdf) from Dataquest
* [Data Science Pandas](https://www.dataquest.io/blog/large_files/pandas-cheat-sheet.pdf) from Dataquest