-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexample01_open_csv.py
101 lines (68 loc) · 3.55 KB
/
example01_open_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pandas
# Warm-up: Python list comprehensions.
# Much like lapply() in R, a comprehension applies an expression to every
# item of a list in a single line, with no explicit for loop required.
list_of_consecutive_num = list(range(1, 11))
transformed_num = [value ** 3 for value in list_of_consecutive_num]

# Unpacking with * hands each element to print() as a separate argument, so
# the numbers come out space-separated rather than as a bracketed list.
print(*transformed_num, '\n')

# The same computation spelled out as an ordinary for loop:
empty_list = []  # starts out empty and receives one cube per iteration
for value in list_of_consecutive_num:
    cubed = value ** 3
    empty_list.append(cubed)
print(*empty_list, '\n')
###########################################
# Let's try a more complicated structure.
# Say you have a list of 'highest temp' records.
# Ints within the same inner list are temperature measurements taken on
# consecutive days.
nestedLists = [
    [88, 90, 82, 80, 91, 92, 90, 98, 90, 82, 83, 84],
    [101, 88, 85, 85, 86, 80],
    [87, 88, 88, 90, 82, 81],
]
print(nestedLists, '\n')

# Which is the largest (i.e. longest) list in nestedLists?
the_largest = max(nestedLists, key=len)
print(*the_largest, '\n')

# How many days were measured in that record?
print('Temperatures were recorded across ' + str(len(the_largest)) + ' days' + '\n')

# What is the hottest temp on this particular record?
print('The hottest temp recorded was ' + str(max(the_largest)) + '\n')


# What are the hottest 3 consecutive days?
# You can iterate via list comprehensions but I find for loops to make more sense
def hottest_consecutive(temps, span=3):
    """Return the hottest run of ``span`` consecutive readings around the peak.

    The peak is the first occurrence of the maximum value in ``temps``.
    Every window of ``span`` consecutive readings that contains the peak and
    fits inside the list is a candidate; the candidate with the largest sum
    wins, and the earliest one is kept when sums tie.

    Args:
        temps: List of temperature readings taken on consecutive days.
        span: Number of consecutive days in the window (default 3).

    Returns:
        A new list holding the selected readings. When ``temps`` has
        ``span`` readings or fewer, all of them are returned.

    Raises:
        ValueError: If ``span`` is smaller than 1.
    """
    if span < 1:
        raise ValueError('span must be at least 1')
    if len(temps) <= span:
        # Too short to choose between windows (this also covers an empty list).
        return list(temps)

    peak = temps.index(max(temps))
    # Clamp the candidate start positions so a window never runs off either
    # end of the list; this is what keeps a peak on the first, second or last
    # day from indexing out of range or wrapping around via a negative index.
    first_start = max(0, peak - span + 1)
    last_start = min(peak, len(temps) - span)

    best_start = first_start
    best_total = sum(temps[first_start:first_start + span])
    for start in range(first_start + 1, last_start + 1):
        total = sum(temps[start:start + span])
        # Strict comparison: on a tie the earliest window is kept.
        if total > best_total:
            best_start = start
            best_total = total
    return list(temps[best_start:best_start + span])


for little_list in nestedLists:
    print('The hottest consecutive temps are: ' + str(hottest_consecutive(little_list)))
###########################################
# Since pandas is such an important and ubiquitous package,
# our next exercise will cover doing simple tasks using pandas.

# Is pandas installed?
# Get pandas version
print(pandas.__version__)

# Open the CSV file; the relative path is resolved against the current
# working directory, so run the script from the folder that holds the file.
test_frame = pandas.read_csv('./df_for_tutorial.csv')

# If your file's headers are not in the first row,
# you can specify the number of rows you want to skip
# test_frame = pandas.read_excel("./sample_df_preprocessing.xlsx", skiprows=range(0,4))

# Output to terminal to see your headers
print(test_frame.head(), '\n')

# What are the column names?
print(test_frame.columns.tolist(), '\n')

# What are your unique values in the column "Compound_ID"?
print(test_frame['Compound_ID'].unique(), '\n')


def rows_containing_id(frame, compound_id):
    """Return the rows of ``frame`` whose 'Compound_ID' contains ``compound_id``.

    Args:
        frame: DataFrame with a string column named 'Compound_ID'.
        compound_id: Text to look for inside each Compound_ID value.

    Returns:
        A DataFrame with only the matching rows (empty when nothing matches).
    """
    # regex=False treats the ID as literal text instead of a pattern.
    # na=False makes missing IDs count as "no match"; without it the mask
    # holds NaN for those rows and cannot be used for boolean indexing.
    mask = frame['Compound_ID'].str.contains(compound_id, regex=False, na=False)
    return frame[mask]


# Does the Compound_ID 'COM01400' exist?
print(rows_containing_id(test_frame, 'COM01400'), '\n')
# An empty dataframe printed above means the ID 'COM01400' does not exist in test_frame

# What about this ID?
# This subsets the dataframe to just rows whose Compound_ID contains 'COMP09944'
out_to_csv = rows_containing_id(test_frame, 'COMP09944')
print(out_to_csv, '\n')

# Let's save a file with only entries whose Compound ID is 'COMP09944'
out_to_csv.to_csv('target_subset.csv', index=False)