# Data Handling With Pandas

## Introduction
Pandas is built on NumPy and can be used to interpret data. This notebook maintains a collection of useful tasks that can be performed with Pandas.

## Setup

In [None]:
import pandas as pd
import numpy as np

# Relative path to the example CSV; reused later when the file is re-read
file_path = 'data/example_data/data.csv'
# Load the CSV into a DataFrame, letting pandas infer every column type
data = pd.read_csv(file_path)
# Display the loaded frame
data

## Basic Inference and Data Access

In [None]:
# Show the first few rows of the frame
data.head(n=2)  # n defaults to 5 when omitted

In [None]:
# describe() gives count, mean, standard deviation, min/max and quartiles
# for each numeric column of the input
data.describe()

In [None]:
# List the parameters (column labels) of the frame
data.columns

In [None]:
# Gives the data type pandas assigned to each input parameter (column)
data.dtypes

In [None]:
# Additional info on the input parameters: per-column non-null counts and
# dtypes. info is a method, so it must be called; without the parentheses
# the cell only shows the bound-method object instead of the summary.
data.info()

In [None]:
# Accessing the data of a single parameter (column) by its label
data['Customer Number']

In [None]:
# Getting a statistical value for one parameter; the Year column is reached
# with attribute-style access here
data.Year.mean()

## Data Conversion

In [None]:
# Conversion using astype: cast the 'Customer Number' column to int.
# The result is only displayed; it is not assigned back to `data`.
data['Customer Number'].astype('int')

In [None]:
# Doesn't work: the values (like 15,000.00$) are not straightforward
# numbers, so the cast to float fails.
try:
    data['2016'].astype('float')
except ValueError as e:
    # Only the failed-conversion error is expected here; print its message
    # so the notebook keeps running. Anything else should surface normally.
    print(e)

In [None]:
# Doesn't work: one of the values is 'Closed' (not a number), so the cast
# to int fails.
try:
    data['Jan Units'].astype('int')
except ValueError as e:
    # Only the failed-conversion error is expected here; print its message
    # so the notebook keeps running. Anything else should surface normally.
    print(e)

In [None]:
# Doesn't work as expected: astype('bool') cannot interpret Y/N.
# NOTE(review): non-empty strings are truthy, so presumably every row comes
# out True regardless of the letter - confirm against the output.
data['Active'].astype('bool')

In [None]:
# Using pd.to_numeric to convert the column to numbers.
# errors='coerce' sets every value that cannot be parsed to NaN instead of
# raising.
pd.to_numeric(data['Jan Units'], errors='coerce')

In [None]:
# Setting the errored values to zero: coerce them to NaN first, then
# replace each NaN with 0
pd.to_numeric(data['Jan Units'], errors='coerce').fillna(0)

In [None]:
# Using pd.to_datetime to build a date from the separate Month, Day and
# Year parameters (columns)
pd.to_datetime(data[['Month', 'Day', 'Year']])

## Other Useful Tools

### apply

In [None]:
# Convertor function, meant to be used with apply, that changes a currency
# string to a float
def convertor(value):
    """Return *value* as a float after removing '$' and ',' characters."""
    without_symbol = value.replace('$', '')
    digits_only = without_symbol.replace(',', '')
    return float(digits_only)

# Run convertor on every value of the '2016' column
data['2016'].apply(convertor)

In [None]:
# Same currency clean-up with an inline lambda: strip ',' and '$' from each
# value via apply, then cast the whole column to float
(
    data['2016']
    .apply(lambda text: text.replace(',', '').replace('$', ''))
    .astype('float')
)

### np.where

In [None]:
# np.where acts like an element-wise if clause: True where Active equals
# "Y", False everywhere else
np.where(data['Active'] == "Y", True, False)

### dropna

In [None]:
# dropna is used to drop rows or columns with na.
# axis = 0 - rows with na are dropped
# axis = 1 - columns with na are dropped
# Here it runs on the single coerced 'Jan Units' column, so the entries
# that became NaN are removed.
pd.to_numeric(data['Jan Units'], errors='coerce').dropna(axis=0)

### Selecting columns

In [None]:
# This cell shows how to pick a few necessary columns from the input data:
# index the frame with a list of column labels
data[['Customer Number', '2016', 'Active']]

## Customizing data import

In [None]:
# Re-read the CSV, cleaning the columns while they are parsed instead of
# converting them afterwards. Each converter is called with one cell value
# at a time.
data2 = pd.read_csv(
    file_path,
    dtype={'Customer Number': 'int'},
    converters={
        # Currency strings (with ',' and '$') -> float
        '2016': convertor,
        '2017': convertor,
        # Strip the '%' sign and scale to a fraction
        'Percent Growth': lambda x: float(x.replace('%', '')) / 100,
        # Values that cannot be parsed as numbers become NaN
        'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
        # A converter sees a single value, so a plain comparison yields the
        # bool directly; np.where is a vectorized tool and would wrap each
        # cell in a 0-d array.
        'Active': lambda x: x == "Y",
    },
)

data2


## Sources
* http://pbpython.com/pandas_dtypes.html
* https://www.kaggle.com/dansbecker/explore-your-data
* https://www.kaggle.com/dansbecker/your-first-machine-learning-model
