# 🔴 Task 13: Data manipulation with Pandas (indexing, selection, grouping)

#### 1. Load a DataFrame from a CSV file. Display the first and last five rows of the DataFrame.

In [2]:
import numpy as np
import pandas as pd

In [3]:
import_dataFrame = pd.read_csv('flights.csv')
print("First 5 rows:")
print(import_dataFrame.head())
print()

print("Last 5 rows:")
print(import_dataFrame.tail())

First 5 rows:
   year     month  passengers
0  1949   January         112
1  1949  February         118
2  1949     March         132
3  1949     April         129
4  1949       May         121

Last 5 rows:
     year      month  passengers
139  1960     August         606
140  1960  September         508
141  1960    October         461
142  1960   November         390
143  1960   December         432


#### 2. Set a specific column as the index of the DataFrame.

In [5]:
dataFrame = import_dataFrame.set_index('month')
print(dataFrame)

           year  passengers
month                      
January    1949         112
February   1949         118
March      1949         132
April      1949         129
May        1949         121
...         ...         ...
August     1960         606
September  1960         508
October    1960         461
November   1960         390
December   1960         432

[144 rows x 2 columns]


#### 3. Select a specific column and display its values.

In [6]:
dataFrame2 = import_dataFrame.month
print(dataFrame2)

0        January
1       February
2          March
3          April
4            May
         ...    
139       August
140    September
141      October
142     November
143     December
Name: month, Length: 144, dtype: object


#### 4. Select multiple columns and display the resulting DataFrame.

In [7]:
dataFrame2 = import_dataFrame[['month', 'year', 'passengers']]
print(dataFrame2)

         month  year  passengers
0      January  1949         112
1     February  1949         118
2        March  1949         132
3        April  1949         129
4          May  1949         121
..         ...   ...         ...
139     August  1960         606
140  September  1960         508
141    October  1960         461
142   November  1960         390
143   December  1960         432

[144 rows x 3 columns]


#### 5. Select a subset of rows using the .loc method.

In [8]:
subset = import_dataFrame.loc[import_dataFrame['passengers'] > 120]
print(subset)

     year      month  passengers
2    1949      March         132
3    1949      April         129
4    1949        May         121
5    1949       June         135
6    1949       July         148
..    ...        ...         ...
139  1960     August         606
140  1960  September         508
141  1960    October         461
142  1960   November         390
143  1960   December         432

[137 rows x 3 columns]


#### 6. Select a subset of rows and columns using the .iloc method.

In [14]:
sub3 = import_dataFrame.iloc[:12, :2]
print(sub3)


    year      month
0   1949    January
1   1949   February
2   1949      March
3   1949      April
4   1949        May
5   1949       June
6   1949       July
7   1949     August
8   1949  September
9   1949    October
10  1949   November
11  1949   December


#### 7. Filter rows based on a condition.

In [15]:
f_df = import_dataFrame[(import_dataFrame['year'] > 1940) & (import_dataFrame['month'] != 'january')]
print(f_df)

     year      month  passengers
0    1949    January         112
1    1949   February         118
2    1949      March         132
3    1949      April         129
4    1949        May         121
..    ...        ...         ...
139  1960     August         606
140  1960  September         508
141  1960    October         461
142  1960   November         390
143  1960   December         432

[144 rows x 3 columns]


#### 8. Group the DataFrame by a specific column and calculate the mean of each group.

In [24]:
grouped = import_dataFrame.groupby('month')
grouped_mean = grouped.mean()
print(grouped_mean)

             year  passengers
month                        
April      1954.5  267.083333
August     1954.5  351.083333
December   1954.5  261.833333
February   1954.5  235.000000
January    1954.5  241.750000
July       1954.5  351.333333
June       1954.5  311.666667
March      1954.5  270.166667
May        1954.5  271.833333
November   1954.5  232.833333
October    1954.5  266.583333
September  1954.5  302.416667


#### 9. Group the DataFrame by multiple columns and calculate the sum of each group.

In [26]:
grouped = import_dataFrame.groupby(['year', 'month'])
grouped_sum = grouped.sum()
print(grouped_sum)

                passengers
year month                
1949 April             129
     August            148
     December          118
     February          118
     January           112
...                    ...
1960 March             419
     May               472
     November          390
     October           461
     September         508

[144 rows x 1 columns]


#### 10. Use the agg method to apply multiple aggregation functions to grouped data.

In [30]:
aggregate = grouped.agg({
    'passengers': ['sum', 'mean'],
    'year': ['min', 'max']
})
print(aggregate)

               passengers         year      
                      sum   mean   min   max
year month                                  
1949 April            129  129.0  1949  1949
     August           148  148.0  1949  1949
     December         118  118.0  1949  1949
     February         118  118.0  1949  1949
     January          112  112.0  1949  1949
...                   ...    ...   ...   ...
1960 March            419  419.0  1960  1960
     May              472  472.0  1960  1960
     November         390  390.0  1960  1960
     October          461  461.0  1960  1960
     September        508  508.0  1960  1960

[144 rows x 4 columns]


#### 11. Calculate the size of each group.

In [31]:
group_sizes = grouped.size()
print(group_sizes)

year  month    
1949  April        1
      August       1
      December     1
      February     1
      January      1
                  ..
1960  March        1
      May          1
      November     1
      October      1
      September    1
Length: 144, dtype: int64


#### 12. Select rows based on multiple conditions.

In [34]:
year = import_dataFrame['year'] > 1920 
passengers = import_dataFrame['passengers'] > 130
filter_dataFrame = import_dataFrame[year & passengers]

print(filter_dataFrame)


     year      month  passengers
2    1949      March         132
5    1949       June         135
6    1949       July         148
7    1949     August         148
8    1949  September         136
..    ...        ...         ...
139  1960     August         606
140  1960  September         508
141  1960    October         461
142  1960   November         390
143  1960   December         432

[133 rows x 3 columns]


#### 13. Use the query method to filter rows.

In [35]:
query = 'passengers > 120 and month in ["January", "August"]'
filter_df = import_dataFrame.query(query)
print(filter_df)


     year    month  passengers
7    1949   August         148
19   1950   August         170
24   1951  January         145
31   1951   August         199
36   1952  January         171
43   1952   August         242
48   1953  January         196
55   1953   August         272
60   1954  January         204
67   1954   August         293
72   1955  January         242
79   1955   August         347
84   1956  January         284
91   1956   August         405
96   1957  January         315
103  1957   August         467
108  1958  January         340
115  1958   August         505
120  1959  January         360
127  1959   August         559
132  1960  January         417
139  1960   August         606


#### 14. Use isin to filter rows based on a list of values.

In [36]:
filter1 = ['September', 'December']
filter_df = import_dataFrame[import_dataFrame['month'].isin(filter1)]
print(filter_df)


     year      month  passengers
8    1949  September         136
11   1949   December         118
20   1950  September         158
23   1950   December         140
32   1951  September         184
35   1951   December         166
44   1952  September         209
47   1952   December         194
56   1953  September         237
59   1953   December         201
68   1954  September         259
71   1954   December         229
80   1955  September         312
83   1955   December         278
92   1956  September         355
95   1956   December         306
104  1957  September         404
107  1957   December         336
116  1958  September         404
119  1958   December         337
128  1959  September         463
131  1959   December         405
140  1960  September         508
143  1960   December         432


#### 15. Select specific columns and rename them

In [39]:
select1 = import_dataFrame.loc[:, ['year', 'passengers']]
select1 = select1.rename(columns={'year': 'period', 'passengers': 'load'})
print(select1)


     period  load
0      1949   112
1      1949   118
2      1949   132
3      1949   129
4      1949   121
..      ...   ...
139    1960   606
140    1960   508
141    1960   461
142    1960   390
143    1960   432

[144 rows x 2 columns]
