# Pandas
Read "10 minutes to Pandas": https://pandas.pydata.org/docs/user_guide/10min.html before solving the exercises.
We will use the data set "cars_data" in the exercises below. 

In [60]:
# Importing Pandas. 
import pandas as pd
import numpy as np

### Explain what a CSV file is.

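A CSV (comma-separated values) file is a plain-text file that stores tabular data: each line is one row, and the values within a row are separated by a delimiter, usually a comma. The first line often holds the column names. Because it is plain text, it can be read by Pandas and many other tools, which makes it a simple and widely supported way to store and exchange data, including large tables.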

### Load the data set "cars_data" through Pandas. 

In [61]:
# When reading in the data, you either have the data file in the same folder as your Python script
# or in a separate folder.

# The code below can be run if you have the data file in the same folder as the script.
# cars = pd.read_csv("cars_data.csv")

# The code below can be run if you have the data file in another folder.
# Note that you must change the path to match where the data is stored on your computer.
# pd.read_csv(r'C:\Users\Antonio Prgomet\Documents\ec_utbildning\kursframstallning\ds23\python_stat\exercises\numpy_matplot_pandas\cars_data.csv')

### Print the first 10 rows of the data. 

In [62]:
# Load the data set; the relative path is resolved against the current working directory.
df = pd.read_csv("cars_data.csv")

# Show the first 10 rows, as the exercise asks (head() would return only 5 by default).
print(df.head(10))

### Print the last 5 rows. 

In [63]:
# Grab the five final rows of the frame and show them.
last_rows = df.tail(n=5)
print(last_rows)

    index     company body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage    price
56     81  volkswagen      sedan        97.3   171.7         ohc             four          85               27   7975.0
57     82  volkswagen      sedan        97.3   171.7         ohc             four          52               37   7995.0
58     86  volkswagen      sedan        97.3   171.7         ohc             four         100               26   9995.0
59     87       volvo      sedan       104.3   188.8         ohc             four         114               23  12940.0
60     88       volvo      wagon       104.3   188.8         ohc             four         114               23  13415.0


### Using the info method, check how many non-null values each column has. 

In [64]:
# Print a per-column summary (dtype and non-null count); show_counts=True
# asks for the non-null counts explicitly instead of relying on pandas' size-based default.
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             61 non-null     int64  
 1   company           61 non-null     object 
 2   body-style        61 non-null     object 
 3   wheel-base        61 non-null     float64
 4   length            61 non-null     float64
 5   engine-type       61 non-null     object 
 6   num-of-cylinders  61 non-null     object 
 7   horsepower        61 non-null     int64  
 8   average-mileage   61 non-null     int64  
 9   price             58 non-null     float64
dtypes: float64(3), int64(3), object(4)
memory usage: 4.9+ KB


### If any column has a missing value, drop the entire row. Note that the operation should be done in place, meaning you change the dataframe itself.

In [65]:
# Remove every row (axis=0) that has at least one missing value (how="any").
# The exercise requires modifying `df` itself, hence inplace=True.
df.dropna(axis=0, how="any", inplace=True)
print(df)

    index        company   body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage    price
0       0    alfa-romero  convertible        88.6   168.8        dohc             four         111               21  13495.0
1       1    alfa-romero  convertible        88.6   168.8        dohc             four         111               21  16500.0
2       2    alfa-romero    hatchback        94.5   171.2        ohcv              six         154               19  16500.0
3       3           audi        sedan        99.8   176.6         ohc             four         102               24  13950.0
4       4           audi        sedan        99.4   176.6         ohc             five         115               18  17450.0
5       5           audi        sedan        99.8   177.3         ohc             five         110               19  15250.0
6       6           audi        wagon       105.8   192.7         ohc             five         110               19  18920.0


### Calculate the mean of each numeric column. 

In [66]:
# Column-wise average (axis=0); numeric_only=True skips the text columns.
mean_column_value = df.mean(axis=0, numeric_only=True)
print(mean_column_value)

index                 40.827586
wheel-base            98.620690
length               173.646552
horsepower           106.051724
average-mileage       25.534483
price              15387.000000
dtype: float64


### Select the rows where the column "company" is equal to 'honda'. 

In [67]:
# Boolean mask that is True where the company is exactly "honda".
is_honda = df["company"].eq("honda")
honda_rows = df.loc[is_honda]
print(honda_rows)

    index company body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage    price
18     27   honda      wagon        96.5   157.1         ohc             four          76               30   7295.0
19     28   honda      sedan        96.5   175.4         ohc             four         101               24  12945.0
20     29   honda      sedan        96.5   169.1         ohc             four         100               25  10345.0


### Sort the data set by price in descending order. This should *not* be an inplace operation. 

In [68]:
# Widen the printed output first: never elide columns and never wrap a frame
# across several blocks. These options are global and stay set for later cells.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

# sort_values returns a new frame, so `df` itself keeps its original order.
price_sorted = df.sort_values("price", ascending=False)
print(price_sorted)

    index        company   body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage    price
35     47  mercedes-benz      hardtop       112.0   199.2        ohcv            eight         184               14  45400.0
11     14            bmw        sedan       103.5   193.8         ohc              six         182               16  41315.0
34     46  mercedes-benz        sedan       120.9   208.1        ohcv            eight         184               14  40960.0
46     62        porsche  convertible        89.5   168.9        ohcf              six         207               17  37028.0
12     15            bmw        sedan       110.0   197.0         ohc              six         182               15  36880.0
26     35         jaguar        sedan       102.0   191.7        ohcv           twelve         262               13  36000.0
25     34         jaguar        sedan       113.0   199.6        dohc              six         176               15  35550.0


### Select the rows where the column "company" is equal to any of the values (audi, bmw, porsche).

In [69]:
# Keep the rows whose company is one of the requested makes.
wanted_companies = ["audi", "bmw", "porsche"]
mult_rows = df.loc[df["company"].isin(wanted_companies)]
print(mult_rows)

    index  company   body-style  wheel-base  length engine-type num-of-cylinders  horsepower  average-mileage    price
3       3     audi        sedan        99.8   176.6         ohc             four         102               24  13950.0
4       4     audi        sedan        99.4   176.6         ohc             five         115               18  17450.0
5       5     audi        sedan        99.8   177.3         ohc             five         110               19  15250.0
6       6     audi        wagon       105.8   192.7         ohc             five         110               19  18920.0
7       9      bmw        sedan       101.2   176.8         ohc             four         101               23  16430.0
8      10      bmw        sedan       101.2   176.8         ohc             four         101               23  16925.0
9      11      bmw        sedan       101.2   176.8         ohc              six         121               21  20970.0
10     13      bmw        sedan       103.5   18

### Find the number of cars (rows) for each company. 

In [70]:
# Count the rows in each company group, then turn the resulting Series
# into a two-column frame with a readable name for the counts.
rows_per_company = df.groupby("company").size()
car_count = rows_per_company.reset_index(name="Number of cars")
print(car_count)

          company  Number of cars
0     alfa-romero               3
1            audi               4
2             bmw               6
3       chevrolet               3
4           dodge               2
5           honda               3
6           isuzu               1
7          jaguar               3
8           mazda               5
9   mercedes-benz               4
10     mitsubishi               4
11         nissan               5
12        porsche               2
13         toyota               7
14     volkswagen               4
15          volvo               2


### Find the maximum price for each company. 

In [71]:
# Highest price within each company group, reshaped into a frame
# with the company as a regular column.
top_price_per_company = df.groupby("company")["price"].max()
max_price = top_price_per_company.reset_index(name="max price")
print(max_price)

          company  max price
0     alfa-romero    16500.0
1            audi    18920.0
2             bmw    41315.0
3       chevrolet     6575.0
4           dodge     6377.0
5           honda    12945.0
6           isuzu     6785.0
7          jaguar    36000.0
8           mazda    18344.0
9   mercedes-benz    45400.0
10     mitsubishi     8189.0
11         nissan    13499.0
12        porsche    37028.0
13         toyota    15750.0
14     volkswagen     9995.0
15          volvo    13415.0
