# Pandas function practice

## Create a dataframe

In [127]:
import numpy as np
import pandas as pd

raw_data = {'names': ['Smith', 'Brown', 'Adams', 'Jones', 'Carpenter'], 'score': [93, 86, 63, 79, 55]}
df = pd.DataFrame(data=raw_data, columns=['names', 'score'])
df.head()

Unnamed: 0,names,score
0,Smith,93
1,Brown,86
2,Adams,63
3,Jones,79
4,Carpenter,55


## Straightforward operations on columns

In [128]:
df['double'] = 2*df['score']
df.head()

Unnamed: 0,names,score,double
0,Smith,93,186
1,Brown,86,172
2,Adams,63,126
3,Jones,79,158
4,Carpenter,55,110


We find that we need to add 5 marks to score, as a result of an error.

In [129]:
df['score'] += 5
df.head()

Unnamed: 0,names,score,double
0,Smith,98,186
1,Brown,91,172
2,Adams,68,126
3,Jones,84,158
4,Carpenter,60,110


## Functions on one column

Then we find we need to run a function on score, to increase scores below 70.

**This only works for another dataframe, e.g. df1[...] = df[...].apply(...)**

In [130]:
def adjust(x):
    if x < 70:
        return round(1.05 * x)
    else:
        return x
    
d = df['score'].apply(adjust)
d.head()

0    98
1    91
2    71
3    84
4    63
Name: score, dtype: int64

## Functions on index

In [131]:
df.index = df['score']
df.head()

Unnamed: 0_level_0,names,score,double
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
98,Smith,98,186
91,Brown,91,172
68,Adams,68,126
84,Jones,84,158
60,Carpenter,60,110


In [132]:
df.index = [i for i in range(5)]
df.head()

Unnamed: 0,names,score,double
0,Smith,98,186
1,Brown,91,172
2,Adams,68,126
3,Jones,84,158
4,Carpenter,60,110


### *You cannot call a function on the index!!!*

## Lambda functions on column

Simple lambda with no externally defined function inside lambda.

Firstly, we are not as generous as double any longer.

In [133]:
df['less'] = df['double'].apply(lambda x: x/1.1)
df.head()

Unnamed: 0,names,score,double,less
0,Smith,98,186,169.090909
1,Brown,91,172,156.363636
2,Adams,68,126,114.545455
3,Jones,84,158,143.636364
4,Carpenter,60,110,100.0


Another lambda.

In [134]:
def adjust2(x):
    return (x**2 + x)**0.5

df['more'] = df['double'].apply(lambda x: adjust2(x))
df.head()

Unnamed: 0,names,score,double,less,more
0,Smith,98,186,169.090909,186.49933
1,Brown,91,172,156.363636,172.499275
2,Adams,68,126,114.545455,126.499012
3,Jones,84,158,143.636364,158.499211
4,Carpenter,60,110,100.0,110.498869


In [135]:
df['idx_'] = df.index
df.head()

Unnamed: 0,names,score,double,less,more,idx_
0,Smith,98,186,169.090909,186.49933,0
1,Brown,91,172,156.363636,172.499275,1
2,Adams,68,126,114.545455,126.499012,2
3,Jones,84,158,143.636364,158.499211,3
4,Carpenter,60,110,100.0,110.498869,4


Rearranging column order

In [136]:
cols = list(df.columns.values)
cols

['names', 'score', 'double', 'less', 'more', 'idx_']

In [137]:
# use rearranged list
cols2 = ['idx_', 'names', 'score', 'double', 'more']
df = df[cols2]
df

Unnamed: 0,idx_,names,score,double,more
0,0,Smith,98,186,186.49933
1,1,Brown,91,172,172.499275
2,2,Adams,68,126,126.499012
3,3,Jones,84,158,158.499211
4,4,Carpenter,60,110,110.498869


Now try an altruistic world. Your marks are the average marks of everyone else

In [138]:
def altriusm(i):
    count = 0
    for j in range(len(df['idx_'])):
        if j != i:
            count += j
    return count / len(df['idx_'])

In [139]:
z = altriusm(2)

In [140]:
print(z)

1.6


In [141]:
raw_data = {'names': ['Smith', 'Brown', 'Adams', 'Jones', 'Carpenter', 'Sanderson', 
                      'Smyth', 'Hooper', ' Williams', 'Tavistock'], 'score': [93, 86, 63, 79, 55, 80, 74, 45, 86, 66]}
df = pd.DataFrame(data=raw_data, columns=['names', 'score'])
df.head(10)

Unnamed: 0,names,score
0,Smith,93
1,Brown,86
2,Adams,63
3,Jones,79
4,Carpenter,55
5,Sanderson,80
6,Smyth,74
7,Hooper,45
8,Williams,86
9,Tavistock,66


In [142]:
raw_data2 = [{'name': 'Smith', 'score': 49, 'status': 'full-time', 'activity': 'high', 'address': 3},
             {'name': 'Jones', 'score': 65, 'status': 'part-time', 'activity': 'medium', 'address': 0},
             {'name': 'Able', 'score': 65, 'status': 'full-time', 'activity': 'medium', 'address': 4},
             {'name': 'Cutler', 'score': 0, 'status': 'part-time', 'activity': 'medium', 'address': 4},
             {'name': 'Williams', 'score': 0, 'status': 'full-time', 'activity': 'low', 'address': 4},
             {'name': 'Gower', 'score': 65, 'status': 'part-time', 'activity': 'low', 'address': 0},
             {'name': 'Turner', 'score': 0, 'status': 'full-time', 'activity': 'medium', 'address': 2},
             {'name': 'Appleby', 'score': 83, 'status': 'part-time', 'activity': 'high', 'address': 1},
             {'name': 'Archer', 'score': 81, 'status': 'part-time', 'activity': 'medium', 'address': 1},
             {'name': 'Hurst', 'score': 73, 'status': 'part-time', 'activity': 'high', 'address': 0}
            ]
                       
df2 = pd.DataFrame(data=raw_data2, columns=['name', 'score', 'status', 'activity', 'address'])
df2.head(10)

Unnamed: 0,name,score,status,activity,address
0,Smith,49,full-time,high,3
1,Jones,65,part-time,medium,0
2,Able,65,full-time,medium,4
3,Cutler,0,part-time,medium,4
4,Williams,0,full-time,low,4
5,Gower,65,part-time,low,0
6,Turner,0,full-time,medium,2
7,Appleby,83,part-time,high,1
8,Archer,81,part-time,medium,1
9,Hurst,73,part-time,high,0


In [143]:
df2.index

RangeIndex(start=0, stop=10, step=1)

In [144]:
df2.index[1]

1

In [145]:
type(df2.index[1])

int

In [146]:
df2.index.where(df2['name'] == 'Jones')

Float64Index([nan, 1.0, nan, nan, nan, nan, nan, nan, nan, nan], dtype='float64')

In [147]:
df2['score'].where(df2['name'] == 'Archer')

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8    81.0
9     NaN
Name: score, dtype: float64

In [148]:
df2['score'].where(df2['name'] == 'Archer').dropna().values[0]

81.0

In [149]:
df2.index.where(df2['name'] == 'Archer').dropna().values[0]

8.0

In [150]:
df2.index.where(df2['score'] == 0).dropna().values[-1]  # <= key line!  .where is a winner again!

6.0

In [151]:
df2.index.where(df2['score'] == 0).dropna()

Float64Index([3.0, 4.0, 6.0], dtype='float64')

In [152]:
short_df = df2.index.where(df2['score'] == 0).dropna().values[:]
print([s for s in short_df])

[3.0, 4.0, 6.0]


Direct in-line change

In [153]:
df2['score'][2] = 68
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,name,score,status,activity,address
0,Smith,49,full-time,high,3
1,Jones,65,part-time,medium,0
2,Able,68,full-time,medium,4
3,Cutler,0,part-time,medium,4
4,Williams,0,full-time,low,4


In [154]:
def score_helper(score):
    """
    finds close neighbour if score is zero and uses their score
    **apply to a column**
    https://cambridgespark.com/content/tutorials/quick-panda-tricks/index.html
    https://stackoverflow.com/questions/49738053/python-dataframe-finding-a-value-in-same-row-as-a-defined-value-in-a-different
    
    """
    if score == 0:
        pass
    else:
        # dummy key and value
        neighbour = {-1: -1}
        for j in range(df2.shape[0]):
            if df2['address'][j] == 3:
                pass
                    