### The purpose of this article is to show some essential Pandas functions needed to make data analysis-ready

### Set up your environment: set the working directory and locate data and other files

In [None]:
# find out your current working directory (relative paths are resolved against it)
import os
os.getcwd()

In [None]:
# if you want to set a different working directory
#os.chdir("path")

In [None]:
# list the names of all entries in the current directory
os.listdir(".")

### Data Importing

In [None]:
# import pandas and numpy libraries
import pandas as pd
import numpy as np

In [None]:
# import a csv file from local machine
#df = pd.read_csv("beer.csv")
#df.head(4)

In [None]:
# import a csv file (the iris dataset) straight from a URL
df_Web = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")
# preview the first 4 rows
df_Web.head(4)

### Data Inspection

In [None]:
# print a summary of the frame: index, number of entries, columns, data types, memory usage
df_Web.info() 

In [None]:
# check out the first few rows (head() returns 5 rows by default)
df_Web.head()

In [None]:
# dimensions of the frame as a (rows, columns) tuple
df_Web.shape 

In [None]:
# column names (returned as an Index object)
df_Web.columns 

In [None]:
# count the distinct values in the "species" column
df_Web.species.nunique()

In [None]:
# show the distinct values of the "species" column
df_Web["species"].unique()

In [None]:
# number of unique values in every column of the frame
# (df_Web.columns.nunique() would only count distinct column *names*)
df_Web.nunique()

In [None]:
# value counts: how many rows carry each distinct "species" value
df_Web['species'].value_counts()

### Dealing with NA values

In [None]:
# load the iris data into df for the NA-handling examples below
df = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")

In [None]:
# count null/NA values per column of df_Web
df_Web.isnull().sum()

In [None]:
# count null/NA values per column of df
df.isnull().sum()

In [None]:
# show NA values as % of total observations (rows) per column
df.isnull().sum()*100/len(df)

In [None]:
# drop all rows containing at least one null value
df1 = df.dropna()
# remaining NA values as % of the rows left in df1
# (divide by len(df1), not len(df): rows were removed, so the totals differ)
df1.isnull().sum()*100/len(df1)

In [None]:
# remove every column that holds at least one null value
df2 = df.dropna(axis="columns")
# NA share (in %) of each remaining column, relative to the number of rows
df2.isnull().sum() * 100 / len(df)

In [None]:
# keep only columns with at least 5 non-NA values
# (thresh is the minimum number of non-NA entries required, not a count of NAs)
df3 = df.dropna(axis=1, thresh=5)
# show NA values as % of total observations per column
df3.isnull().sum()*100/len(df)

In [None]:
# replace all NA values with the sentinel -9999 (df itself is left unchanged; result goes to df4)
df4 = df.fillna(-9999)
df4.head(4)

In [None]:
# fill NA values with NaN
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
df5 = df.fillna(np.nan)
df5.head(4)

In [None]:
# fill NA values with a placeholder string
df6=df.fillna("data missing")
df6.head(4)

In [None]:
# fill missing values with the mean of each numeric column
# numeric_only=True skips the text column "species"; without it, pandas >= 2.0
# raises a TypeError when it tries to average strings
df7 = df.fillna(df.mean(numeric_only=True))
df7.head(4)

In [None]:
# replace NA values of one specific column ("sepal_length") with that column's mean
# (this assigns back into df, so df is modified)
df["sepal_length"] = df["sepal_length"].fillna(df["sepal_length"].mean())
# re-check the NA percentage per column after the fill
df.isnull().sum()*100/len(df)

In [None]:
# interpolation of missing values (useful in time-series)
# NOTE: this rebinds df7 to the interpolated "sepal_length" Series
df7 = df["sepal_length"].interpolate()

### Column Operation

In [None]:
# reload the iris data into df for the column examples below
df = pd.read_csv("https://raw.githubusercontent.com/uiuc-cse/data-fa14/gh-pages/data/iris.csv")

In [None]:
# select a single column by name (returns a Series)
df["sepal_length"]

In [None]:
# select multiple columns by name (note the list inside the brackets) and store them as a new dataframe X
X = df[["sepal_length", "sepal_width", "species"]]
X

In [None]:
# select several columns by position (0-based): here columns 1, 3 and 4, with all rows
df.iloc[:, [1,3,4]]

In [None]:
# remove the "sepal_length" column from dataframe X and keep the result in X
X = X.drop(columns="sepal_length")
X

In [None]:
# turn the column names into a plain Python list
list(df.columns)

In [None]:
# Rename columns via an {old_name: new_name} mapping
# (the result is only displayed here, not assigned back to df)
df.rename(columns={"sepal_length": "Sepal_Length", "sepal_width": "Sepal_Width"})

In [None]:
# sort the rows by the "sepal_width" column (ascending is the default order)
df.sort_values("sepal_width")

In [None]:
# add a new calculated column: "newcol" is twice "sepal_length"
df['newcol'] = df["sepal_length"]*2
df.head(4)

In [None]:
# create a conditional calculated column: "short" where sepal_width < 3, otherwise "long"
# np.where evaluates the condition for the whole column at once instead of looping in Python;
# assigning to 'newcol' creates the column or overwrites it if it already exists
df['newcol'] = np.where(df["sepal_width"] < 3, "short", "long")
df.head(4)

### Row Operation

In [None]:
# select rows at positions 3 to 9 (the end of an iloc slice is exclusive)
df.iloc[3:10,]

In [None]:
# select rows 3 to 49 and columns 1 to 3 by position (slice ends are exclusive)
df.iloc[3:50, 1:4]

In [None]:
# draw 10 rows at random
df.sample(n=10)

In [None]:
# find rows whose "species" matches any of the listed strings (here only "setosa")
df[df["species"].isin(["setosa"])]

In [None]:
# keep only the rows whose sepal_length is at least 5
df[df["sepal_length"] >= 5]

In [None]:
# filtering rows whose "petal_width" equals any of several values, e.g. 0.2 or 0.3
df[df["petal_width"].isin([0.2, 0.3])]

In [None]:
# multi-conditional filtering: (petal_length > 1 AND species is "setosa") OR sepal_width < 3
# the species labels in this dataset are "setosa"/"versicolor"/"virginica" (no "Iris-" prefix);
# the extra parentheses make the & / | grouping explicit
df[((df.petal_length > 1) & (df.species == "setosa")) | (df.sepal_width < 3)]

In [None]:
# drop a row by its index label; df.index[1] is the label of the second row
# (the result is only displayed, df keeps all of its rows)
df.drop(index=df.index[1])

### Grouping

In [None]:
# group the data by column "species" (X is a GroupBy object, not a dataframe)
X = df.groupby("species")
# show the first rows of each group
X.head()

In [None]:
# return mean values of a column ("sepal_length") grouped by the "species" column
df.groupby("species")["sepal_length"].mean()

In [None]:
# return mean values of ALL numeric columns grouped by "species" category
# numeric_only=True skips text columns such as "newcol"; without it, pandas >= 2.0
# raises a TypeError when it tries to average strings
df.groupby("species").mean(numeric_only=True)

In [None]:
# number of distinct values in each column within each "species" group
# NOTE(review): if plain row counts per category were intended, groupby("species").size() gives those
df.groupby("species").nunique() 