# Data Handling With Pandas

## Introduction
Pandas is built on numpy and can be used to interepet data. This notebook is used to maintain the collection of all useful tasks that can be performed with Pandas

## Setup

In [1]:
import pandas as pd
import numpy as np

file_path = 'data/example_data/data.csv'
data = pd.read_csv(file_path)
data

  return f(*args, **kwds)


Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y
2,23477.0,ACME Industrial,"$50,000.00",$62500.00,25.00%,125,3,29,2016,Y
3,24900.0,Brekke LTD,"$350,000.00",$490000.00,4.00%,75,10,27,2015,Y
4,651029.0,Harbor Co,"$15,000.00",$12750.00,-15.00%,Closed,2,2,2014,N


## Basic Inference and Data Access

In [2]:
# show first few rows
data.head(n=2) # default is 5

Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002.0,Quest Industries,"$125,000.00",$162500.00,30.00%,500,1,10,2015,Y
1,552278.0,Smith Plumbing,"$920,000.00","$101,2000.00",10.00%,700,6,15,2014,Y


In [3]:
# Describe on csv data gives count, mean, variance and other statistical details about each parameter in the input 
data.describe()

Unnamed: 0,Customer Number,Month,Day,Year
count,5.0,5.0,5.0,5.0
mean,252337.2,4.4,16.6,2014.8
std,320838.999788,3.646917,11.414903,0.83666
min,10002.0,1.0,2.0,2014.0
25%,23477.0,2.0,10.0,2014.0
50%,24900.0,3.0,15.0,2015.0
75%,552278.0,6.0,27.0,2015.0
max,651029.0,10.0,29.0,2016.0


In [4]:
# List parameters(columns)
data.columns

Index(['Customer Number', 'Customer Name', '2016', '2017', 'Percent Growth',
       'Jan Units', 'Month', 'Day', 'Year', 'Active'],
      dtype='object')

In [5]:
# gives the data type of each input parameter
data.dtypes

Customer Number    float64
Customer Name       object
2016                object
2017                object
Percent Growth      object
Jan Units           object
Month                int64
Day                  int64
Year                 int64
Active              object
dtype: object

In [6]:
#Added info on input parameters
data.info

<bound method DataFrame.info of    Customer Number     Customer Name         2016          2017  \
0          10002.0  Quest Industries  $125,000.00    $162500.00   
1         552278.0    Smith Plumbing  $920,000.00  $101,2000.00   
2          23477.0   ACME Industrial   $50,000.00     $62500.00   
3          24900.0        Brekke LTD  $350,000.00    $490000.00   
4         651029.0         Harbor Co   $15,000.00     $12750.00   

  Percent Growth Jan Units  Month  Day  Year Active  
0         30.00%       500      1   10  2015      Y  
1         10.00%       700      6   15  2014      Y  
2         25.00%       125      3   29  2016      Y  
3          4.00%        75     10   27  2015      Y  
4        -15.00%    Closed      2    2  2014      N  >

In [7]:
# accessing data by parameter
data['Customer Number']

0     10002.0
1    552278.0
2     23477.0
3     24900.0
4    651029.0
Name: Customer Number, dtype: float64

In [8]:
# getting statistical values by paramter
data.Year.mean()

2014.8

## Data Conversion

In [9]:
# conversion using astype
data['Customer Number'].astype('int')

0     10002
1    552278
2     23477
3     24900
4    651029
Name: Customer Number, dtype: int64

In [10]:
# doesn't work as the values(like 15,000.00$) are not stright forward integeres
try:
    data['2016'].astype('float')
except Exception as e:
    print(e)

could not convert string to float: '$15,000.00'


In [11]:
# doesn't work as one of the values is 'Closed' (not a number)
try:
    data['Jan Units'].astype('int')
except Exception as  e:
    print(e)

invalid literal for int() with base 10: 'Closed'


In [12]:
# doesn't work as expected as it can't interpret from Y/N
data['Active'].astype('bool')

0    True
1    True
2    True
3    True
4    True
Name: Active, dtype: bool

In [13]:
# using to_numeric to convert to int
# coerce sets errored instance to NaN
pd.to_numeric(data['Jan Units'], errors='coerce')

0    500.0
1    700.0
2    125.0
3     75.0
4      NaN
Name: Jan Units, dtype: float64

In [14]:
# setting errored values to zero
pd.to_numeric(data['Jan Units'], errors='coerce').fillna(0)

0    500.0
1    700.0
2    125.0
3     75.0
4      0.0
Name: Jan Units, dtype: float64

In [15]:
# using to_datatime to generate date from individual parameters
pd.to_datetime(data[['Month', 'Day', 'Year']])

0   2015-01-10
1   2014-06-15
2   2016-03-29
3   2015-10-27
4   2014-02-02
dtype: datetime64[ns]

## Other Useful Tools

### apply

In [16]:
# Using a convertor function and apply to change currency to int
def convertor(value):
    result = value.replace(',','').replace('$','')
    return float(result)

data['2016'].apply(convertor)

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

In [17]:
# Using lambdas and apply to change currency to int
data['2016'].apply(lambda x : x.replace(',','').replace('$','')).astype('float')

0    125000.0
1    920000.0
2     50000.0
3    350000.0
4     15000.0
Name: 2016, dtype: float64

### np.where

In [18]:
# where acts like if clause
np.where(data['Active'] == "Y", True, False)

array([ True,  True,  True,  True, False])

### dropna

In [19]:
# dropna is used to drop rows or columns with na.
# axis = 0 - rows with na are dropped
# axis = 1 - columns with na are dropped
pd.to_numeric(data['Jan Units'], errors='coerce').dropna(axis=0)

0    500.0
1    700.0
2    125.0
3     75.0
Name: Jan Units, dtype: float64

### Selecting columns

In [20]:
# this cell shows how to pick few necessary columns from input data
data[['Customer Number', '2016', 'Active']]

Unnamed: 0,Customer Number,2016,Active
0,10002.0,"$125,000.00",Y
1,552278.0,"$920,000.00",Y
2,23477.0,"$50,000.00",Y
3,24900.0,"$350,000.00",Y
4,651029.0,"$15,000.00",N


### Creating new DataFrame

In [21]:
# this way a new dataframe can be initialized.
result = pd.DataFrame({'Number': data['Customer Number'],
                     'Active' : data['Active']})

### Output to CSV 

In [22]:
# A dataframe can be output to csvfile
result.to_csv('output.csv', index=False)

## Customizing data import

In [23]:
data2 = pd.read_csv(file_path,
                   dtype={'Customer Number': 'int'},
                   converters={'2016': convertor,
                               '2017': convertor,
                               'Percent Growth': lambda x : float(x.replace('%',''))/100,
                               'Jan Units': lambda x: pd.to_numeric(x, errors='coerce'),
                               'Active': lambda x: np.where(x == "Y", True, False)
                              })

data2


Unnamed: 0,Customer Number,Customer Name,2016,2017,Percent Growth,Jan Units,Month,Day,Year,Active
0,10002,Quest Industries,125000.0,162500.0,0.3,500.0,1,10,2015,True
1,552278,Smith Plumbing,920000.0,1012000.0,0.1,700.0,6,15,2014,True
2,23477,ACME Industrial,50000.0,62500.0,0.25,125.0,3,29,2016,True
3,24900,Brekke LTD,350000.0,490000.0,0.04,75.0,10,27,2015,True
4,651029,Harbor Co,15000.0,12750.0,-0.15,,2,2,2014,False


## Sources
* http://pbpython.com/pandas_dtypes.html
* https://www.kaggle.com/dansbecker/explore-your-data
* https://www.kaggle.com/dansbecker/your-first-machine-learning-model
