# pandas Basics

In [2]:
# Importing
# Notice that pandas is built on top of NumPy arrays
import pandas as pd
import numpy as np

In [3]:
# Creating DataFrames
# pandas offers several ways to build a DataFrame; this one starts from a
# dictionary that maps each column name to the list of values for that column
# (one entry per row).
# `pre_score` has missing entries (None) and `post_score` mixes strings with
# integers.
column_order = ['f_name', 'l_name', 'age', 'pre_score', 'post_score']
raw_data = dict(
    f_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    l_name=['Miller', 'Jacobson', '.', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    pre_score=[4, 24, 31, None, None],
    post_score=["25,000", "94,000", 57, 62, 70],
)
# `columns` fixes the left-to-right order of the columns in the result
df = pd.DataFrame(data=raw_data, columns=column_order)
df

Unnamed: 0,f_name,l_name,age,pre_score,post_score
0,Jason,Miller,42,4.0,25000
1,Molly,Jacobson,52,24.0,94000
2,Tina,.,36,31.0,57
3,Jake,Milner,24,,62
4,Amy,Cooze,73,,70


In [3]:
# Task: Try creating a DataFrame from a list of lists or a NumPy array

In [4]:
# Saving the DataFrame as a CSV file
# `index=False` leaves the row index out of the file, so only the data columns
# are written; the text is encoded as UTF-8. The relative path means the file
# is created in the current working directory.
df.to_csv('SampleDataFrame.csv', index=False, encoding='utf-8')

In [5]:
# Reading a CSV file into a pandas DataFrame
# This rebinds `df` to the data parsed from 'SampleDataFrame.csv'. `read_csv`
# infers each column's dtype from the file contents, so the dtypes may differ
# from those of the DataFrame that was saved (inspect them with `df.dtypes`).
df = pd.read_csv('SampleDataFrame.csv')

In [6]:
# Printing the columns of the DataFrame
# `columns` is an attribute (no parentheses); it holds the column labels as a
# pandas Index
df.columns

Index(['f_name', 'l_name', 'age', 'pre_score', 'post_score'], dtype='object')

In [7]:
# Finding the data type of each column
# `dtypes` is an attribute that lists one dtype per column. Note which columns
# are reported as `object` rather than as a numeric dtype such as int64 or
# float64.
df.dtypes

f_name         object
l_name         object
age             int64
pre_score     float64
post_score     object
dtype: object

In [8]:
# Retrieving a column
# Indexing with a single column label returns that column as a Series
df['f_name']

0    Jason
1    Molly
2     Tina
3     Jake
4      Amy
Name: f_name, dtype: object

In [9]:
# Retrieving several columns in a particular order
# Passing a list of labels (note the double brackets) returns a DataFrame whose
# columns follow the order given in the list
df[['f_name','age']]

Unnamed: 0,f_name,age
0,Jason,42
1,Molly,52
2,Tina,36
3,Jake,24
4,Amy,73


In [10]:
# Adding columns
# Assigning a scalar to a new column label fills every row with that value.
# Note that this assignment modifies `df` itself rather than returning a new
# DataFrame.
df['alive'] = 1
df

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
0,Jason,Miller,42,4.0,25000,1
1,Molly,Jacobson,52,24.0,94000,1
2,Tina,.,36,31.0,57,1
3,Jake,Milner,24,,62,1
4,Amy,Cooze,73,,70,1


In [11]:
# First `n` records
# Here n=3, so the first three rows are returned; `df` itself is not changed
df.head(3)

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
0,Jason,Miller,42,4.0,25000,1
1,Molly,Jacobson,52,24.0,94000,1
2,Tina,.,36,31.0,57,1


In [12]:
# Last `n` records
# Here n=3, so the last three rows are returned (in their original order);
# `df` itself is not changed
df.tail(3)

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
2,Tina,.,36,31.0,57,1
3,Jake,Milner,24,,62,1
4,Amy,Cooze,73,,70,1


In [13]:
# Using `values` will return the np.array
# Because the columns hold different types, the resulting array has
# dtype=object. The pandas documentation recommends `DataFrame.to_numpy()`
# over `.values` for new code.
df.values

array([['Jason', 'Miller', 42, 4.0, '25,000', 1],
       ['Molly', 'Jacobson', 52, 24.0, '94,000', 1],
       ['Tina', '.', 36, 31.0, '57', 1],
       ['Jake', 'Milner', 24, nan, '62', 1],
       ['Amy', 'Cooze', 73, nan, '70', 1]], dtype=object)

In [14]:
# Using `iloc` allows you to access the records in DataFrame format
# `iloc` selects by integer position; the slice 1:3 excludes the end position,
# so the rows at positions 1 and 2 are returned
df.iloc[1:3]

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
1,Molly,Jacobson,52,24.0,94000,1
2,Tina,.,36,31.0,57,1


In [15]:
# Query
# `df.age > 50` builds a boolean mask (one True/False per row); indexing `df`
# with that mask keeps only the rows where it is True
df[df.age > 50]

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
1,Molly,Jacobson,52,24.0,94000,1
4,Amy,Cooze,73,,70,1


In [None]:
# Query with multiple arguments
# Conditions are combined element-wise with `&`; each condition needs its own
# parentheses because `&` binds more tightly than the comparison operators.
# Task: Identify the problem here and try to fix it
# Hint: compare the dtype that `df.dtypes` reports for `post_score` with the
# value it is being compared against.
df[(df.age <= 50) & (df.post_score > 40)]

In [16]:
# Sorting according to column(s)
# `by` takes a list, so several columns can be given as sort keys;
# `ascending=True` sorts smallest-first. The sorted rows keep their original
# index labels, and `df` itself is not reordered.
df.sort_values(by=['age'], ascending=True)

Unnamed: 0,f_name,l_name,age,pre_score,post_score,alive
3,Jake,Milner,24,,62,1
2,Tina,.,36,31.0,57,1
0,Jason,Miller,42,4.0,25000,1
1,Molly,Jacobson,52,24.0,94000,1
4,Amy,Cooze,73,,70,1


## Important Details
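Most of the operations above (selecting columns, `head`, `tail`, `iloc`, boolean queries, `sort_values`) return a new DataFrame or Series and leave `df` unchanged; direct assignment such as `df['alive'] = 1` is the exception, since it modifies `df` itself. To keep the result of an operation, you have to overwrite the existing DataFrame with the return value (e.g. `df = df.sort_values(by=['age'])`) and proceed. Otherwise, store it in a new variable to access it separately.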

There are a number of other basic operations which you can learn on the go. Use StackOverflow to clear any doubts.

Take a look here for dealing with date-time in pandas (very useful): <br>
https://chrisalbon.com/python/data_wrangling/pandas_time_series_basics/