In [None]:
import itertools

import numpy as np
import pandas as pd
from pandas import DataFrame

In [None]:
### BASIC SERIES AND DATAFRAMES###
#create a new dataframe in pandas

In [None]:
# Column-oriented source data: one list per column, one position per course.
courseData = dict(
    courseCode=['TM351', 'TU100', 'M269'],
    points=[30, 60, 30],
    level=['3', '1', '2'],
)
course_df = DataFrame(courseData)
course_df

In [None]:
# Add a derived column: points expressed as a percentage of 360
# (presumably the total points for a degree — confirm).
course_df['Degree Percentage'] = course_df['points'].div(360).mul(100)
course_df

In [None]:
# Build a dataframe with an explicit column order via the columns= argument.
courseColumns = ['courseCode', 'difficulty']
courseData2 = dict(
    courseCode=['TM351', 'TU100', 'M269'],
    difficulty=['easy', 'medium', 'insane'],
)
course_df2 = DataFrame(data=courseData2, columns=courseColumns)
course_df2

In [None]:
# Read an Excel spreadsheet into a dataframe.
# skiprows=27 skips the first 27 rows of the sheet; only the first
# 54 - 28 = 26 rows of what was read are kept.
# NOTE(review): presumably the table occupies sheet rows 28-54 — confirm against the workbook.
licences_df = pd.read_excel(
    'data/MKxx DRL0102 Driving-licence-data-Mar15.xls',
    skiprows=27,
).iloc[:54 - 28]

In [None]:
#Reading data from a csv file into a dataframe
#See some nice examples here:
#http://chrisalbon.com/python/pandas_dataframe_importing_csv.html
# na_values lists extra strings to treat as missing; skiprows=3 skips the first three lines of the file.
# NOTE(review): `sentinels` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel — define it before this line. TODO confirm the intended values.
df = pd.read_csv('../data/example.csv', na_values=sentinels, skiprows=3)

In [None]:
# Projecting columns: keep every row, but only the listed columns
# (the column labels are passed as a list).
course_df.loc[:, ['courseCode', 'level']]

In [None]:
# Selecting rows based on a column value: index the frame with a boolean mask.
course_df[course_df['courseCode'].eq('TM351')]

In [None]:
# Selecting a row by its position.
# .ix was removed from pandas; use .iloc for integer position (or .loc for an index label).
# course_df is built without an explicit index, so position 2 and label 2 are the same row.
course_df.iloc[2]

In [None]:
# Selecting a single cell combines selection (rows) and projection (column),
# e.g. the points for TM351. Both parts are given to .loc in one step:
# a boolean row mask and the column label.
course_df.loc[course_df['courseCode'] == 'TM351', 'points']

In [None]:
# or pick the value by position within the column
# (.ix was removed from pandas; .iloc[0] is the first value of the column)
course_df['points'].iloc[0]

In [None]:
#renaming columns
# rename() maps old column labels to new ones; inplace=True modifies attempt_counts_df itself.
# NOTE(review): attempt_counts_df is only created by a pivot_table cell further down this
# notebook, so this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
attempt_counts_df.rename(columns={'Personal Identifier':'Student', 'Surname':'Quiz Attempts'}, inplace = True)
#or replace the complete list of column labels in one assignment
# NOTE(review): quizzes_by_hour_and_state is not defined in the visible notebook.
quizzes_by_hour_and_state.columns = ['Hour started', 'Finished', 'In progress']

In [None]:
# Casting columns to another type, then examining the resulting column types.
numeric_columns = ['level', 'points']
course_df[numeric_columns] = course_df[numeric_columns].astype(float)
course_df.dtypes

In [None]:
#Adding 2 dataframes together (add rows)
#by default, all columns are added (outer join). To add only common columns use join='inner' option

In [None]:
# Stack the rows of the listed frames on top of each other (here the same frame twice).
frames_to_stack = [course_df, course_df]
courses_comb_df = pd.concat(frames_to_stack)
courses_comb_df

In [None]:
#merging 2 dataframes together (add columns)
#by default, only rows whose key value appears in both frames are kept (inner join). To keep unmatched rows
#use how='left', how='right' or how='outer'
courses_comb_df = pd.merge(course_df, course_df2, on=['courseCode'])
courses_comb_df

In [None]:
### DATA PREPARATION AND CLEANSING###
# Dealing with missing values: a small frame with one missing price.
laptop_prices = dict(
    laptop=['macbook pro', 'dell xps', 'microsoft surface pro'],
    price=[1600, 1400, None],  # nb, None, NA and np.nan all work
)
dirty_df = DataFrame(laptop_prices)
# Boolean frame of the same shape: True wherever a cell is null.
dirty_df.isnull()

In [None]:
# Replace null values with the desired value.
# The filled column is assigned back rather than using fillna(..., inplace='true'):
# inplace expects a real bool, not a string, and an in-place fill on dirty_df.price
# acts on a column selection, which is not guaranteed to update dirty_df itself.
dirty_df['price'] = dirty_df['price'].fillna(0)
dirty_df

In [None]:
# Replace null values in a chosen subset of columns only.
# NOTE(review): icma_df is not created anywhere in the visible notebook.
answer_columns = icma_df.columns[12:].tolist()  # every column label after the first 12
icma_df[answer_columns] = icma_df[answer_columns].fillna(0)  # fill just those columns

In [None]:
# Keep only the rows whose 'price' is not null.
has_price = dirty_df['price'].notna()
clean_df = dirty_df[has_price]
clean_df

In [None]:
# Sample strings for the string-replace / regex-replace examples.
test_strings = [
    'aba',
    'abcababcabca',
    'adfddfda',
    'The Cat sat on the Mat',
    'The Dog sat on the Cat',
    'The Elephant sat on the Dog',
]
samples = pd.DataFrame({'test_string': test_strings})
samples

In [None]:
# Simple string replace. Basic: a cell is only replaced when its whole
# content equals the search string.
samples = samples.replace(to_replace={'test_string': 'aba'}, value='XXXXXXXXX')
samples

In [None]:
# Find and replace using a regular expression.
# A handy tool for checking regular expressions: https://regex101.com/
samples = samples.replace(to_replace={'test_string': r'a[bd][a-z]+'}, value='XXX', regex=True)
samples

In [None]:
#### DESCRIPTIVE STATISTICS###
# Sample sales figures, one row per (division, quarter, sale), used by the summary examples.
columns = ['Division', 'Quarter', 'Sales']
sales_records = [
    ('South', 'q1-2016', 800),
    ('East', 'q1-2016', 1600),
    ('West', 'q1-2016', 700),
    ('West', 'q1-2016', 2100),
    ('South', 'q2-2016', 900),
    ('West', 'q2-2016', 2800),
    ('South', 'q3-2016', 750),
    ('South', 'q3-2016', 2000),
    ('East', 'q3-2016', 600),
    ('West', 'q3-2016', 400),
]
finances_df = DataFrame(sales_records, columns=columns)
finances_df

In [None]:
# Get the size of the frame as a (rows, columns) tuple.
# shape is an attribute, not a method: calling it raises TypeError.
finances_df.shape

In [None]:
# Summarise one column with respect to another using crosstab
# (e.g. count the rows for each Division / Quarter combination).
# margins=True adds a totals row and column. Pass the boolean, not the string
# 'true': any non-empty string (including 'false') is truthy.
pd.crosstab(finances_df['Division'], finances_df['Quarter'], margins=True)

In [None]:
# Simple pivot table: group by one column and count the non-null values of another.
# NOTE(review): icma_df is not created anywhere in the visible notebook.
attempt_counts_df = icma_df.pivot_table(
    index=['Personal Identifier'],
    values=["Surname"],
    aggfunc='count',
)

In [None]:
#find the index label of the row with the max count
# attempt_counts_df is a DataFrame, so idxmax() returns one label per column
most_quiz_attempts_pi = attempt_counts_df.idxmax()

In [None]:
# Use a pivot table to group by one column and sum another.
# values='Sales' restricts the aggregation to the numeric column; without it the
# text column 'Quarter' is handed to the aggregate function as well.
# The string 'sum' selects pandas' built-in sum.
finances_df.pivot_table(index=['Division'], values='Sales', aggfunc='sum')  # total Sales per Division

In [None]:
# Sum for each Quarter within each Division: the result has a hierarchical
# (Division, Quarter) row index. 'sum' names pandas' built-in aggregation.
finances_df.pivot_table(index=['Division', 'Quarter'], aggfunc='sum')

In [None]:
# More than one aggregate can be requested by passing a list of function names;
# each one becomes its own top-level column group in the result.
finances_df.pivot_table(index=['Division', 'Quarter'], aggfunc=['sum', 'mean'])

In [None]:
# A small numeric series for the describe / aggregate examples.
series_values = [1, 2, 9, 6, 5, 6, 8, 6, 2]
aseries = pd.Series(data=series_values)
aseries

In [None]:
# A small two-column numeric frame for the describe / aggregate examples.
adf = pd.DataFrame(dict(
    size=[1, 2, 4, 5, 1, 4, 2, 8],
    cost=[30, 42, 24, 75, 82, 50, 20, 34],
))
adf

In [None]:
#use describe on a series to get count, mean, std, min, max and quartiles in one call
aseries.describe() #individual statistics are also available via mode(), median(), mean() etc.

In [None]:
#use describe on a dataframe to get count, mean etc. plus percentiles for each column,
#e.g. for the 'cost' column 25% of values are less than 28.5, 50% are less than 38 etc.
adf.describe()

In [None]:
#Using aggregate functions on a series, e.g. sum, mean, median, mode, min, max etc.
#each call reduces the series to a single value (mode() returns a series, as there can be ties)
aseries.mean()

In [None]:
#Using aggregate functions on dataframes e.g. sum, mean, min, max etc.
#axis=0 aggregates down each column (one result per column); axis=1 aggregates across each row (one result per row)
#skipna=True (the default) ignores missing values; numeric_only=True restricts the aggregation to numeric columns
adf.sum(axis=1)

In [None]:
#Use cumulative functions such as cumulative sum (cumsum) and cumulative max (cummax) on dataframes

In [None]:
# running maximum down each column: every cell holds the largest value seen so far in its column
adf.cummax()

In [None]:
#finding the min / max value in a series,
#or the row or column with the min / max value in a dataframe
aseries.idxmin() #returns the index label of the first occurrence of the min value (aseries has the default 0-based index)

In [None]:
# axis=0 searches down each column; use axis=1 to get the column label of the max value in each row
adf.idxmax(axis=0) #returns the row index for the max value in each column

In [None]:
# Plotting dataframes using the plot function: sample data, one tuple per item.
post_columns = ['itemname', 'price', 'postcost', 'weight', 'size']
post_rows = [
    ('Box', 3, 10, 10, 3),
    ('Case', 8, 10, 17, 5),
    ('Drawer', 12, 23, 32, 12),
    ('Shelf', 52, 23, 46, 17),
    ('Cabinet', 7, 10, 27, 9),
    ('Rack', 47, 23, 30, 10),
    ('Bag', 3, 30, 147, 59),
    ('Hanger', 30, 23, 62, 23),
    ('Bracket', 3, 30, 92.5, 37),
]
PostData = pd.DataFrame(post_rows, columns=post_columns)
PostData

In [None]:
# plot() with no arguments: default chart of the frame's numeric columns against the row index
PostData.plot()

In [None]:
# Some useful plot options: chart type (line, bar, horizontal bar, scatter),
# axis limits, colour, title, and x / y labels.
# plot.bar returns the matplotlib Axes, so the axis labels are set on it directly;
# this avoids relying on a `plt` name that the notebook never imports.
ax = PostData.plot.bar(x='itemname', y='weight', title="My chart title",
                       ylim=(0, PostData['weight'].max() + 10),
                       color='green')
ax.set_xlabel('Item Name')
ax.set_ylabel('Weight');  # trailing ';' suppresses the Text(...) repr in the cell output

In [None]:
# Scatter plot of postage cost against weight.
PostData.plot.scatter(x='weight', y='postcost')

In [None]:
#Reshaping Data

In [None]:
# Transpose rows / columns: start from a 3x2 frame holding 0..5, filled row by row.
grid = np.arange(6).reshape(-1, 2)
df = pd.DataFrame(grid, columns=['a', 'b'])
df

In [None]:
# .T is the attribute form of transpose(): rows become columns and vice versa.
df2 = df.T

In [None]:
# display the transposed frame
df2

In [None]:
# Make a long table into a wide table.
# First build the 'long' table: one row per (directorate, expense type) pair
# with a random total in the range 0..19999.
directorates = ['Community Wellbeing & Social Care',
                'Childrens Services',
                'Economy & Environment',
                'Resources',
                'Corporate']
expensetypes = ['Accommodation Costs',
                'Payment to Private Contractors',
                'Operational Equipment',
                'Professional Services']
# Every combination of the two lists, with the directorate varying slowest.
pairs = list(itertools.product(directorates, expensetypes))
df_long = pd.DataFrame(pairs, columns=['directorates', 'expense types'])
# Fixed seed so that re-running the notebook reproduces the same totals.
rng = np.random.default_rng(0)
df_long['total'] = rng.integers(0, 20000, size=len(df_long))  # upper bound is exclusive
df_long.head(6)

In [None]:
# To make this 'long' table 'wide', use pivot: one row per directorate,
# one column per expense type, with the totals as cell values.
# The arguments are passed by keyword; recent pandas versions no longer accept them positionally.
df_wide = df_long.pivot(index='directorates', columns='expense types', values='total')
df_wide

In [None]:
#To turn the entire wide table back into long form:
# stack() moves the column labels into an inner level of the row index
df_wide.stack() #nb we can reverse this again with unstack()

In [None]:
# or unpivot selected columns back into long form using melt.
# After the pivot, 'directorates' is the index of df_wide rather than a column, and
# melt's id_vars must name columns, so reset_index() first turns the index back
# into a regular column.
simple_melt = pd.melt(df_wide.reset_index(),
                      id_vars=['directorates'],
                      value_vars=expensetypes,
                      value_name='total')
simple_melt