### Pandas Series

In [1]:
import pandas as pd

In [2]:
students = ['Alice', 'Jack', 'Molly']

In [3]:
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
numbers = [1,2,3]

In [5]:
pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [6]:
students = ['Alice', 'Jack', None]
pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [7]:
numbers = [1,2,None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [9]:
None==None

True

In [14]:
type(numbers[2])

NoneType

In [15]:
type(numbers[1])

int

In [16]:
type(students[2])

NoneType

### NaN is not equivalent of None

In [17]:
import numpy as np

In [18]:
np.nan == None

False

In [19]:
np.nan == np.nan

False

In [20]:
np.isnan(np.nan)

True

In [21]:
student_scores = {'Alice':'Physics',
                  'Jack':'Chemistry',
                  'Molly': 'English'
}
s = pd.Series(student_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [22]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [23]:
s.values

array(['Physics', 'Chemistry', 'English'], dtype=object)

In [24]:
students = [('Alice','Brown'),('Jack','White'),('Molly','Green')]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [25]:
s=pd.Series(['Physics','Chemistry','English'], index=['Alice','Jack','Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [26]:
student_scores = {'Alice':'Physics',
                  'Jack':'Chemistry',
                  'Molly': 'English'
}
s = pd.Series(student_scores, index=['Alice','Molly','Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [28]:
s.iloc[1]

'English'

In [29]:
s.loc['Alice']

'Physics'

In [30]:
s[1]

'English'

In [31]:
s['Molly']

'English'

In [32]:
class_code = {99:'Physics',
             100:'Chemistry',
             101:'English',
             102:'History'}
s = pd.Series(class_code)
s

99       Physics
100    Chemistry
101      English
102      History
dtype: object

In [33]:
s[99]

'Physics'

In [35]:
s.iloc[0]

'Physics'

In [36]:
grades = pd.Series([90, 80, 70, 60])
total = 0
for grade in grades:
    total += grade
total/len(grades)

75.0

In [37]:
grades.mean()

75.0

In [38]:
grades.sum()

300

In [39]:
np.sum(grades)

300

In [40]:
sum(grades)

300

In [41]:
numbers = pd.Series(np.random.randint(0, 1000, 10000))
numbers.head()

0     86
1    715
2    132
3    484
4    282
dtype: int32

In [42]:
len(numbers)

10000

In [44]:
%%timeit -n 100
total = 0 
for number in numbers:
    total += number
total/len(numbers)

1.66 ms ± 176 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [46]:
%%timeit -n 100
numbers.mean()

109 µs ± 14.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
numbers.head()

0     86
1    715
2    132
3    484
4    282
dtype: int32

In [48]:
numbers +=2
numbers.head()

0     88
1    717
2    134
3    486
4    284
dtype: int32

In [51]:
for label, value in numbers.iteritems():
    numbers[label] = value+2
numbers.head()

0     90
1    719
2    136
3    488
4    286
dtype: int32

In [50]:
numbers[0]

88

In [52]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
for label, value in s.iteritems():
    s.loc[label] = value+2

604 ms ± 49.7 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [53]:
%%timeit -n 10
s = pd.Series(np.random.randint(0, 1000, 10000))
s+=2

585 µs ± 182 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [54]:
s = pd.Series([1,2,3])
s.loc['History'] = 102
s

0            1
1            2
2            3
History    102
dtype: int64

In [55]:
class_code = {99:'Physics',
             100:'Chemistry',
             101:'English',
             102:'History'}
s = pd.Series(class_code)
s

99       Physics
100    Chemistry
101      English
102      History
dtype: object

In [59]:
student_scores = {'Alice':'Physics',
                  'Jack':'Chemistry',
                  'Molly': 'English'
}
s1 = pd.Series(student_scores, index=['Alice','Molly','Sam'])
s1

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [60]:
s2 = b.append(s)
s2

Alice      Physics
Molly      English
Sam            NaN
99         Physics
100      Chemistry
101        English
102        History
dtype: object

In [61]:
s

99       Physics
100    Chemistry
101      English
102      History
dtype: object

In [63]:
import pandas as pd
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj1 = pd.Series(sdata)
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj2 = pd.Series(sdata, index=states)
obj3 = pd.isnull(obj2)

In [64]:
x = obj2['California']
obj2['California'] != x

True

In [65]:
obj2['California']

nan

In [66]:
pd.isnull(obj2)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [67]:
obj3['California']

True

In [68]:
import math
math.isnan(obj2['California'])

True

In [69]:
obj2['California'] == None

False

In [114]:
df = pd.DataFrame(s1,columns=['lesson'])
df

Unnamed: 0,lesson
1,Alice
2,Jack
3,Molly


In [115]:
df.rename(mapper = lambda x: x.upper(),axis='columns')

Unnamed: 0,LESSON
1,Alice
2,Jack
3,Molly


In [118]:
df[df['lesson'].isin(['Alice','Jack'])]

Unnamed: 0,lesson
1,Alice
2,Jack


In [84]:
s1 = pd.Series({1: 'Alice', 2: 'Jack', 3: 'Molly'})
s2 = pd.Series({'Alice': 1, 'Jack': 2, 'Molly': 3})

In [85]:
s1.loc[1]

'Alice'

In [87]:
s2.iloc[1]

2

In [88]:
s2[1]

2

In [129]:
grades = pd.Series([90, 80, 70, 60])
df = pd.DataFrame(data=grades, columns=['Grade'],dtype='int')
df['names']=['John','Marry','Alice','George']
df.set_index('names',inplace=True)
df

Unnamed: 0_level_0,Grade
names,Unnamed: 1_level_1
John,90
Marry,80
Alice,70
George,60


In [130]:
df[(df['Grade'].isin(range(80,91)))]

Unnamed: 0_level_0,Grade
names,Unnamed: 1_level_1
John,90
Marry,80


In [135]:
df.where(df['Grade']>75)

Unnamed: 0_level_0,Grade
names,Unnamed: 1_level_1
John,90.0
Marry,80.0
Alice,
George,


In [140]:
df.T['John']

Grade    90
Name: John, dtype: int32

In [123]:
import pandas as pd
d = {'1': 'Alice','2': 'Bob','3': 'Rita','4': 'Molly','5': 'Ryan'}
S = pd.Series(d)

In [127]:
S.iloc[0:3]

1    Alice
2      Bob
3     Rita
dtype: object

## DataFrame

In [141]:
record1 = pd.Series({'Name':'Alice',
                    'Class':'Physics',
                    'Score':85})

record2 = pd.Series({'Name':'Jack',
                    'Class':'Chemistry',
                    'Score':82})

record3 = pd.Series({'Name':'Helen',
                    'Class':'Biology',
                    'Score':90})

In [142]:
df = pd.DataFrame([record1, record2,record3], index=['school1','school2','school3'])
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school3,Helen,Biology,90


In [143]:
students = [{'Name':'Alice',
                    'Class':'Physics',
                    'Score':85},
            {'Name':'Jack',
                    'Class':'Chemistry',
                    'Score':82},
            {'Name':'Helen',
                    'Class':'Biology',
                    'Score':90}]

In [144]:
students

[{'Name': 'Alice', 'Class': 'Physics', 'Score': 85},
 {'Name': 'Jack', 'Class': 'Chemistry', 'Score': 82},
 {'Name': 'Helen', 'Class': 'Biology', 'Score': 90}]

In [148]:
df = pd.DataFrame(students, index=['school1','school2','school1'])
df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [149]:
df.loc['school2']

Name          Jack
Class    Chemistry
Score           82
Name: school2, dtype: object

In [150]:
type(df.loc['school2'])

pandas.core.series.Series

In [151]:
df.loc['school1']

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school1,Helen,Biology,90


In [152]:
type(df.loc['school1'])

pandas.core.frame.DataFrame

In [153]:
df.loc['school1','Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [154]:
df.T

Unnamed: 0,school1,school2,school1.1
Name,Alice,Jack,Helen
Class,Physics,Chemistry,Biology
Score,85,82,90


In [156]:
df.T.loc['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [157]:
df['Name']

school1    Alice
school2     Jack
school1    Helen
Name: Name, dtype: object

In [158]:
type(df['Name'])

pandas.core.series.Series

In [159]:
df.loc['school1']['Name']

school1    Alice
school1    Helen
Name: Name, dtype: object

In [160]:
df.loc[:,['Name','Score']]

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


In [161]:
copy_df = df.copy()
copy_df

Unnamed: 0,Name,Class,Score
school1,Alice,Physics,85
school2,Jack,Chemistry,82
school1,Helen,Biology,90


In [162]:
del copy_df['Class']
copy_df

Unnamed: 0,Name,Score
school1,Alice,85
school2,Jack,82
school1,Helen,90


In [8]:
df = pd.read_csv('log.csv')
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,


In [10]:
df.set_index('time', inplace=True)


KeyError: "None of ['time'] are in the columns"

In [12]:
df.sort_index(inplace=True)

In [13]:
df.head()

Unnamed: 0_level_0,user,video,playback position,paused,volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,


In [16]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974424,sue,advanced.html,23,False,10.0
2,1469974454,cheryl,intro.html,6,,
3,1469974454,sue,advanced.html,24,,
4,1469974484,cheryl,intro.html,7,,


In [17]:
df.set_index(['time','user'], inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,video,playback position,paused,volume
time,user,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1469974424,cheryl,intro.html,5,False,10.0
1469974424,sue,advanced.html,23,False,10.0
1469974454,cheryl,intro.html,6,,
1469974454,sue,advanced.html,24,,
1469974484,cheryl,intro.html,7,,


In [19]:
df = pd.read_csv('log.csv')
df

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,intro.html,5,False,10.0
1,1469974454,cheryl,intro.html,6,,
2,1469974544,cheryl,intro.html,9,,
3,1469974574,cheryl,intro.html,10,,
4,1469977514,bob,intro.html,1,,
5,1469977544,bob,intro.html,1,,
6,1469977574,bob,intro.html,1,,
7,1469977604,bob,intro.html,1,,
8,1469974604,cheryl,intro.html,11,,
9,1469974694,cheryl,intro.html,14,,


In [31]:
df.replace('.*.html','webpage',regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,


In [29]:
df.replace(to_replace='.*.html$', value='webpage', regex=True)

Unnamed: 0,time,user,video,playback position,paused,volume
0,1469974424,cheryl,webpage,5,False,10.0
1,1469974454,cheryl,webpage,6,,
2,1469974544,cheryl,webpage,9,,
3,1469974574,cheryl,webpage,10,,
4,1469977514,bob,webpage,1,,
5,1469977544,bob,webpage,1,,
6,1469977574,bob,webpage,1,,
7,1469977604,bob,webpage,1,,
8,1469974604,cheryl,webpage,11,,
9,1469974694,cheryl,webpage,14,,
